diff options
Diffstat (limited to 'fs')
40 files changed, 497 insertions, 248 deletions
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..5db6c8d745ea 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ef3ebd780aff..1e643c718917 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -363,6 +363,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) nr_pages = i; if (nr_pages > 0) { len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); break; } goto out_pages; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6a26c7bd1286..e3e1a80b351e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -1036,7 +1039,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -2423,7 +2425,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 18a1e1d6671f..1cd0e2eefc66 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2884,7 +2884,15 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) for (i = 0; i < rdata->nr_pages; i++) { struct page *page = rdata->pages[i]; size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written = copy_page_to_iter(page, 0, copy, iter); + size_t written; + + if (unlikely(iter->type & ITER_PIPE)) { + void *addr = kmap_atomic(page); + + written = copy_to_iter(addr, copy, iter); + kunmap_atomic(addr); + } else + written = copy_page_to_iter(page, 0, copy, iter); remaining -= written; if (written < copy && iov_iter_count(iter) > 0) break; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f17fcf89e18e..1e30f74a9527 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..9fbf92ca358c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5344,7 +5344,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5353,15 +5354,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); + stop = le32_to_cpu(extent->ee_block); /* * In case of left shift, Don't start shifting extents until we make * sure the hole is big enough to accommodate the shift. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, 0); + path = ext4_find_extent(inode, start - 1, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5393,9 +5394,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, else iterator = &stop; - /* Its safe to start updating extents */ - while (start < stop) { - path = ext4_find_extent(inode, *iterator, &path, 0); + /* + * Its safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if extent with 0 block is reached, iterator + * becomes NULL to indicate the end of the loop. + */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5422,8 +5428,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - *iterator = le32_to_cpu(extent->ee_block) > 0 ? - le32_to_cpu(extent->ee_block) - 1 : 0; + if (le32_to_cpu(extent->ee_block) > 0) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else + /* Beginning is reached, end of the loop */ + iterator = NULL; /* Update path extent in case we need to stop */ while (le32_to_cpu(extent->ee_block) < start) extent++; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d8ca4b9f9dd6..37b521ed39df 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -376,7 +376,7 @@ out: static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { - int ret, size; + int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) @@ -386,15 +386,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, if (size < len) return -ENOSPC; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); - up_write(&EXT4_I(inode)->xattr_sem); - + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -523,7 +522,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { - int ret, needed_blocks; + int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; @@ -563,7 +562,7 @@ retry: goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { @@ -600,7 +599,7 @@ retry: put_page(page); page = NULL; ext4_orphan_add(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; @@ -626,7 +625,7 @@ out: put_page(page); } if (sem_held) - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); @@ -719,7 +718,7 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -737,7 +736,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, goto out; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); @@ -747,7 +746,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, /* clear page dirty so that writepages wouldn't work for us. */ ClearPageDirty(page); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); out: return copied; @@ -758,7 +757,7 @@ ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { - int ret; + int ret, no_expand; void *kaddr; struct ext4_iloc iloc; @@ -768,11 +767,11 @@ ext4_journalled_write_inline_data(struct inode *inode, return NULL; } - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } @@ -934,8 +933,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page) { int i_size_changed = 0; + int ret; - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + ret = ext4_write_inline_data_end(inode, pos, len, copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ret; + } + copied = ret; /* * No need to use i_size_read() here, the i_size @@ -1249,7 +1255,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size; + int ret, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1257,7 +1263,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (ret) return ret; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; @@ -1303,7 +1309,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_mark_inode_dirty(handle, dir); - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); brelse(iloc.bh); return ret; } @@ -1663,7 +1669,7 @@ int ext4_delete_inline_entry(handle_t *handle, struct buffer_head *bh, int *has_inline_data) { - int err, inline_size; + int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; @@ -1671,7 +1677,7 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) return err; - down_write(&EXT4_I(dir)->xattr_sem); + ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; @@ -1705,7 +1711,7 @@ int ext4_delete_inline_entry(handle_t *handle, ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: - up_write(&EXT4_I(dir)->xattr_sem); + ext4_write_unlock_xattr(dir, &no_expand); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); @@ -1804,11 +1810,11 @@ out: int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { - int ret; + int ret, no_expand; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return ret; } @@ -1893,7 +1899,7 @@ out: void ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; - int inline_size, value_len, needed_blocks; + int inline_size, value_len, needed_blocks, no_expand; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { @@ -1910,7 +1916,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) if (IS_ERR(handle)) return; - down_write(&EXT4_I(inode)->xattr_sem); + ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { *has_inline = 0; ext4_journal_stop(handle); @@ -1968,7 +1974,7 @@ out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); @@ -1984,7 +1990,7 @@ out: int ext4_convert_inline_data(struct inode *inode) { - int error, needed_blocks; + int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; @@ -2006,15 +2012,10 @@ int ext4_convert_inline_data(struct inode *inode) goto out_free; } - down_write(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - up_write(&EXT4_I(inode)->xattr_sem); - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); - up_write(&EXT4_I(inode)->xattr_sem); -out: + ext4_write_lock_xattr(inode, &no_expand); + if (ext4_has_inline_data(inode)) + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 33a509c876ee..dc9d64ac5969 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1324,8 +1324,11 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto errout; + } copied = ret; } else copied = block_write_end(file, mapping, pos, @@ -1379,7 +1382,9 @@ errout: * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_handle_dirty_metadata() instead. */ -static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct page *page, + unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; @@ -1396,7 +1401,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to) size = min(to, block_end) - start; zero_user(page, start, size); - set_buffer_uptodate(bh); + write_end_fn(handle, bh); } clear_buffer_new(bh); } @@ -1425,18 +1430,25 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - zero_new_buffers(page, from+copied, to); + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto errout; } - + copied = ret; + } else if (unlikely(copied < len) && !PageUptodate(page)) { + copied = 0; + ext4_journalled_zero_new_buffers(handle, page, from, to); + } else { + if (unlikely(copied < len)) + ext4_journalled_zero_new_buffers(handle, page, + from + copied, to); ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + from + copied, &partial, + write_end_fn); if (!partial) SetPageUptodate(page); } @@ -1462,6 +1474,7 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -3811,6 +3824,10 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list handling */ + if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7ae43c59bc79..2e9fc7a61048 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3123,6 +3123,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; + /* + * Trim allocation request for filesystems with artificially small + * groups. + */ + if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); + end = start + size; /* check we don't cross already preallocated blocks */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..c4a389a6027b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1616,13 +1616,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi !fscrypt_has_permitted_context(dir, inode)) { int nokey = ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); - iput(inode); - if (nokey) + if (nokey) { + iput(inode); return ERR_PTR(-ENOKEY); + } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); + iput(inode); return ERR_PTR(-EPERM); } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bbc316db9495..5fa9ba1de429 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -825,6 +825,7 @@ static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + int aborted = 0; int i, err; ext4_unregister_li_request(sb); @@ -834,9 +835,10 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if (err < 0) + if ((err < 0) && !aborted) ext4_abort(sb, "Couldn't clean up the journal"); } @@ -847,7 +849,7 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } @@ -3828,7 +3830,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { - if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", @@ -3911,7 +3913,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - if (ext4_load_journal(sb, es, journal_devnum)) + err = ext4_load_journal(sb, es, journal_devnum); + if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && ext4_has_feature_journal_needs_recovery(sb)) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..4448ed37181b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1174,16 +1174,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; - unsigned long no_expand; + int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; - down_write(&EXT4_I(inode)->xattr_sem); - no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) @@ -1251,7 +1249,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1265,9 +1263,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); - if (no_expand == 0) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return error; } @@ -1484,12 +1480,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ + int no_expand; + + if (ext4_write_trylock_xattr(inode, &no_expand) == 0) + return 0; - down_write(&EXT4_I(inode)->xattr_sem); - /* - * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty - */ - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) @@ -1571,17 +1566,16 @@ shift: EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); + ext4_write_unlock_xattr(inode, &no_expand); return 0; cleanup: brelse(bh); /* - * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode - * size expansion failed. + * Inode size expansion failed; don't try again */ - up_write(&EXT4_I(inode)->xattr_sem); + no_expand = 1; + ext4_write_unlock_xattr(inode, &no_expand); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index a92e783fa057..099c8b670ef5 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -102,6 +102,38 @@ extern const struct xattr_handler ext4_xattr_security_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" +/* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. + * The first is to signal that there the inline xattrs and data are + * taking up so much space that we might as well not keep trying to + * expand it. The second is that xattr_sem is taken for writing, so + * we shouldn't try to recurse into the inode expansion. For this + * second case, we need to make sure that we take save and restore the + * NO_EXPAND state flag appropriately. + */ +static inline void ext4_write_lock_xattr(struct inode *inode, int *save) +{ + down_write(&EXT4_I(inode)->xattr_sem); + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); +} + +static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) +{ + if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) + return 0; + *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + return 1; +} + +static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) +{ + if (*save == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); +} + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..ebdc90fc71b7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,9 +207,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + /* This is to increase the speed of f2fs_create */ + if (!de && room) { + F2FS_I(dir)->task = current; + if (F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } } return de; @@ -643,14 +647,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..7b32ce979fe1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 506af456412f..3a1640be7ffc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -431,6 +431,7 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ @@ -833,6 +834,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..34a69e7ed90b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -166,7 +166,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; @@ -199,6 +200,10 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -322,13 +327,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -972,5 +979,16 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count, blocks_per_sec; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; + + sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + (main_count - resv_count)); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..a7943f861d68 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -813,6 +813,8 @@ next: start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..b164f8339281 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -688,6 +688,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 338d2f73eb29..a2c05f2ada6d 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1359,6 +1359,16 @@ out: return 0; } +static void fat_dummy_inode_init(struct inode *inode) +{ + /* Initialize this dummy inode to work as no-op. */ + MSDOS_I(inode)->mmu_private = 0; + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->i_attrs = 0; + MSDOS_I(inode)->i_pos = 0; +} + static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - MSDOS_I(fat_inode)->i_pos = 0; + fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; + fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2401c5dabb2a..5ec5870e423a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { + __set_bit(FR_FORCE, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 14cbf60167a7..133f322573b5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -658,9 +658,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret, tries = 0; + rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (gl && !lockref_get_not_dead(&gl->gl_lockref)) gl = NULL; + rcu_read_unlock(); *glp = gl; if (gl) @@ -728,15 +730,18 @@ again: if (ret == -EEXIST) { ret = 0; + rcu_read_lock(); tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { if (++tries < 100) { + rcu_read_unlock(); cond_resched(); goto again; } tmp = NULL; ret = -ENOMEM; } + rcu_read_unlock(); } else { WARN_ON_ONCE(ret); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a6a3389a07fc..51519c2836b5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,7 +207,7 @@ struct lm_lockname { struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; -}; +} __packed __aligned(sizeof(int)); #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1652665bd93..5e659ee08d6a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1863,7 +1863,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) + if (transaction && is_journal_aborted(transaction->t_journal)) + clear_buffer_jbddirty(bh); + else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..d8295f273a2f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt) } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); diff --git a/fs/namei.c b/fs/namei.c index 9c8dd3c83a80..150fbdd8e04c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate diff --git a/fs/namespace.c b/fs/namespace.c index cb15f5397991..843d274ba167 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -645,28 +645,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) } /* - * find the last mount at @dentry on vfsmount @mnt. - * mount_lock must be held. - */ -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) -{ - struct mount *p, *res = NULL; - p = __lookup_mnt(mnt, dentry); - if (!p) - goto out; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - hlist_for_each_entry_continue(p, mnt_hash) { - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) - break; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - } -out: - return res; -} - -/* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the @@ -886,6 +864,13 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + /* * vfsmount lock must be held for write */ @@ -894,28 +879,45 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + __attach_mnt(mnt, parent); } -static void attach_shadowed(struct mount *mnt, - struct mount *parent, - struct mount *shadows) +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { - if (shadows) { - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); - list_add(&mnt->mnt_child, &shadows->mnt_child); - } else { - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - } + struct mountpoint *old_mp = mnt->mnt_mp; + struct dentry *old_mountpoint = mnt->mnt_mountpoint; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + + /* + * Safely avoid even the suggestion this code might sleep or + * lock the mount hash by taking advantage of the knowledge that + * mnt_change_mountpoint will not release the final reference + * to a mountpoint. + * + * During mounting, the mount passed in as the parent mount will + * continue to use the old mountpoint and during unmounting, the + * old mountpoint will continue to exist until namespace_unlock, + * which happens well after mnt_change_mountpoint. + */ + spin_lock(&old_mountpoint->d_lock); + old_mountpoint->d_lockref.count--; + spin_unlock(&old_mountpoint->d_lock); + + mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt, struct mount *shadows) +static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -933,7 +935,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) n->mounts += n->pending_mounts; n->pending_mounts = 0; - attach_shadowed(mnt, parent, shadows); + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -997,6 +999,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -1741,7 +1758,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1763,14 +1779,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - mnt_set_mountpoint(parent, p->mnt_mp, q); - if (!list_empty(&parent->mnt_mounts)) { - t = list_last_entry(&parent->mnt_mounts, - struct mount, mnt_child); - if (t->mnt_mp != p->mnt_mp) - t = NULL; - } - attach_shadowed(q, parent, t); + attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } @@ -1971,10 +1980,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, { HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + /* Is there space to add these mounts to the mount namespace? */ if (!parent_path) { err = count_mounts(ns, source_mnt); @@ -2001,16 +2018,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt, NULL); + commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt_last(&child->mnt_parent->mnt, - child->mnt_mountpoint); - commit_tree(child, q); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + commit_tree(child); } + put_mountpoint(smp); unlock_mount_hash(); return 0; @@ -2025,6 +2045,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + return err; } @@ -2773,7 +2798,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a5c38889e7ae..13abd608af0f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1073,9 +1073,6 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs_client *mds_client = mds_server->nfs_client; struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: @@ -1176,9 +1173,6 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); - if (task->tk_status >= 0) - return 0; - switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ case -EACCES: @@ -1213,6 +1207,13 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; + if (task->tk_status >= 0) + return 0; + + /* Handle the case of an invalid layout segment */ + if (!pnfs_is_valid_lseg(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, lseg, idx); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 608501971fe0..5cda392028ce 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -128,30 +128,26 @@ out_unlock: return err; } -static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, +static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, - struct file *dst, loff_t pos_dst, + struct file *dst, struct nfs_lock_context *dst_lock, - size_t count) + struct nfs42_copy_args *args, + struct nfs42_copy_res *res) { - struct nfs42_copy_args args = { - .src_fh = NFS_FH(file_inode(src)), - .src_pos = pos_src, - .dst_fh = NFS_FH(file_inode(dst)), - .dst_pos = pos_dst, - .count = count, - }; - struct nfs42_copy_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], - .rpc_argp = &args, - .rpc_resp = &res, + .rpc_argp = args, + .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); struct nfs_server *server = NFS_SERVER(dst_inode); + loff_t pos_src = args->src_pos; + loff_t pos_dst = args->dst_pos; + size_t count = args->count; int status; - status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, src_lock, FMODE_READ); if (status) return status; @@ -161,7 +157,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; - status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; @@ -171,22 +167,22 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, return status; status = nfs4_call_sync(server->client, server, &msg, - &args.seq_args, &res.seq_res, 0); + &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) return status; - if (res.write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (res->write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res->write_res.verifier.verifier); if (status) return status; } truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res.write_res.count); + pos_dst + res->write_res.count); - return res.write_res.count; + return res->write_res.count; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -196,8 +192,22 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; struct nfs_lock_context *dst_lock; - struct nfs4_exception src_exception = { }; - struct nfs4_exception dst_exception = { }; + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct nfs4_exception src_exception = { + .inode = file_inode(src), + .stateid = &args.src_stateid, + }; + struct nfs4_exception dst_exception = { + .inode = file_inode(dst), + .stateid = &args.dst_stateid, + }; ssize_t err, err2; if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) @@ -207,7 +217,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (IS_ERR(src_lock)) return PTR_ERR(src_lock); - src_exception.inode = file_inode(src); src_exception.state = src_lock->open_context->state; dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); @@ -216,15 +225,17 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, goto out_put_src_lock; } - dst_exception.inode = file_inode(dst); dst_exception.state = dst_lock->open_context->state; do { inode_lock(file_inode(dst)); - err = _nfs42_proc_copy(src, pos_src, src_lock, - dst, pos_dst, dst_lock, count); + err = _nfs42_proc_copy(src, src_lock, + dst, dst_lock, + &args, &res); inode_unlock(file_inode(dst)); + if (err >= 0) + break; if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3573653fd5cc..0a8bc7eab083 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2708,6 +2708,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = PTR_ERR(state); if (IS_ERR(state)) goto out; + ctx->state = state; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) @@ -2733,7 +2734,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - ctx->state = state; if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqretry(&sp->so_reclaim_seqlock, seq)) @@ -4990,7 +4990,7 @@ out: */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -5004,13 +5004,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; int ret = -ENOMEM, i; - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - if (npages == 0) - npages = 1; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -7430,11 +7426,11 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - nfs_put_client(cdata->args.client); if (cdata->xprt) { xprt_put(cdata->xprt); rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); } + nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); kfree(cdata->res.server_owner); @@ -7541,10 +7537,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_impl_id; - } + if (IS_ERR(task)) + return PTR_ERR(task); if (!xprt) { status = rpc_wait_for_completion_task(task); @@ -7572,6 +7566,7 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); + nfs_put_client(clp); goto out; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fc89e5ed07ee..c9c4d9855976 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2492,7 +2492,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ca642fe9b21..b829cc9a9b39 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. */ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index c48859f16e7b..67c24351a67f 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -115,6 +115,13 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) return &orangefs_inode->vfs_inode; } +static void orangefs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + kmem_cache_free(orangefs_inode_cache, orangefs_inode); +} + static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); @@ -123,7 +130,7 @@ static void orangefs_destroy_inode(struct inode *inode) "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); - kmem_cache_free(orangefs_inode_cache, orangefs_inode); + call_rcu(&inode->i_rcu, orangefs_i_callback); } /* diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..b394ca5307ec 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -324,6 +324,21 @@ out: return ret; } +static struct mount *find_topper(struct mount *mnt) +{ + /* If there is exactly one mount covering mnt completely return it. */ + struct mount *child; + + if (!list_is_singular(&mnt->mnt_mounts)) + return NULL; + + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); + if (child->mnt_mountpoint != mnt->mnt.mnt_root) + return NULL; + + return child; +} + /* * return true if the refcount is greater than count */ @@ -344,9 +359,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child; + struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - int ret = 0; if (mnt == parent) return do_refcount_check(mnt, refcnt); @@ -361,12 +375,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); - if (child && list_empty(&child->mnt_mounts) && - (ret = do_refcount_check(child, 1))) - break; + int count = 1; + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); + if (!child) + continue; + + /* Is there exactly one mount on the child that covers + * it completely whose reference should be ignored? + */ + topper = find_topper(child); + if (topper) + count += 1; + else if (!list_empty(&child->mnt_mounts)) + continue; + + if (do_refcount_check(child, count)) + return 1; } - return ret; + return 0; } /* @@ -383,7 +409,7 @@ void propagate_mount_unlock(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } @@ -401,9 +427,11 @@ static void mark_umount_candidates(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) + continue; + if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { SET_MNT_MARK(child); } } @@ -422,8 +450,8 @@ static void __propagate_umount(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - - struct mount *child = __lookup_mnt_last(&m->mnt, + struct mount *topper; + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); /* * umount the child only if the child has no children @@ -432,6 +460,15 @@ static void __propagate_umount(struct mount *mnt) if (!child || !IS_MNT_MARKED(child)) continue; CLEAR_MNT_MARK(child); + + /* If there is exactly one mount covering all of child + * replace child with that mount. + */ + topper = find_topper(child); + if (topper) + mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, + topper); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); child->mnt.mnt_flags |= MNT_UMOUNT; diff --git a/fs/pnode.h b/fs/pnode.h index 550f5a8b4fcf..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, + struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/super.c b/fs/super.c index c183835566c1..1058bf3e8724 100644 --- a/fs/super.c +++ b/fs/super.c @@ -470,7 +470,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -500,7 +500,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -541,8 +541,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. + */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); |