From d6ddc9cf431b6887cf57984a152dd537d54aa35b Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 14 Nov 2016 21:04:37 -0500 Subject: ext4: fix mballoc breakage with 64k block size [ Upstream commit 69e43e8cc971a79dd1ee5d4343d8e63f82725123 ] 'border' variable is set to a value of 2 times the block size of the underlying filesystem. With 64k block size, the resulting value won't fit into a 16-bit variable. Hence this commit changes the data type of 'border' to 'unsigned int'. Fixes: c9de560ded61f Signed-off-by: Chandan Rajendra Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b58e266892b..ad2fa476ce59 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -668,7 +668,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); -- cgit v1.2.3 From 9d45d2e798000a0c622e96fc486e0ab8fb1f933d Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 14 Nov 2016 21:26:26 -0500 Subject: ext4: fix stack memory corruption with 64k block size [ Upstream commit 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 ] The number of 'counters' elements needed in 'struct sg' is super_block->s_blocksize_bits + 2. Presently we have 16 'counters' elements in the array. This is insufficient for block sizes >= 32k. In such cases the memcpy operation performed in ext4_mb_seq_groups_show() would cause stack memory corruption. Fixes: c9de560ded61f Signed-off-by: Chandan Rajendra Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ad2fa476ce59..e6798ca34928 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2254,7 +2254,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; -- cgit v1.2.3 From 526cfedac0b3e2a4fd19bd43f34be2a52e002234 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:00:24 -0500 Subject: ext4: sanity check the block and cluster size at mount time [ Upstream commit 9e47a4c9fc58032ee135bf76516809c7624b1551 ] If the block size or cluster size is insane, reject the mount. This is important for security reasons (although we shouldn't be just depending on this check). 
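One way to see why the raw s_log_block_size field gets its own bound check, in addition to the range check on the computed block size, is that the field is ultimately used as a shift count, and a shift by 32 or more is undefined in C (on x86 the count is taken modulo 32), so a crafted value could wrap around and slip past a check on the shifted result. The program below only illustrates that arithmetic; it is not part of the patch, and the constants simply mirror EXT4_MIN/MAX_BLOCK_LOG_SIZE.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t log_block_size = 32;	/* attacker-controlled superblock field */

	/* Without this bound, 1024u << 32 is undefined behaviour; on x86 the
	 * shift count wraps, so the result would be 1024 again and a
	 * "blocksize <= 65536" test alone would not reject the value. */
	if (log_block_size > 16 - 10) {	/* EXT4_MAX - EXT4_MIN_BLOCK_LOG_SIZE */
		fprintf(stderr, "Invalid log block size: %u\n",
			(unsigned)log_block_size);
		return 1;
	}
	printf("blocksize = %u\n", 1024u << log_block_size);
	return 0;
}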
Ref: http://www.securityfocus.com/archive/1/539661 Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1332506 Reported-by: Borislav Petkov Reported-by: Nikolay Borisov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 145d6ba4117d..df67a6f8582a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -224,6 +224,7 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4723d8b02747..04e6abf707f7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3736,7 +3736,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d", blocksize); + "Unsupported filesystem blocksize %d (%d log_block_size)", + blocksize, le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); goto failed_mount; } @@ -3878,6 +3886,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "block size (%d)", clustersize, blocksize); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = -- cgit v1.2.3 From 31eb998aa42dd8fa96bea51320f21eb005ad752d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:28:30 -0500 Subject: ext4: use more strict checks for inodes_per_block on mount [ Upstream commit cd6bb35bf7f6d7d922509bf50265383a0ceabe96 ] Centralize the checks for inodes_per_block and be more strict to make sure the inodes_per_block_group can't end up being zero. 
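As a concrete illustration (not part of the patch): with a 4096-byte block size and 256-byte on-disk inodes, s_inodes_per_block is 4096 / 256 = 16 and the one-block inode bitmap can describe at most 4096 * 8 = 32768 inodes, so any s_inodes_per_group below 16 or above 32768 cannot describe a valid group; in particular, a value below s_inodes_per_block would make s_itb_per_group (inodes_per_group / inodes_per_block) zero, which is exactly the case the stricter check rules out.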
Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/super.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04e6abf707f7..c34e6a93854b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3840,12 +3840,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3929,13 +3933,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); -- cgit v1.2.3 From b1a75800973969f7bdbfd53f6cbfdad5fdd76fb3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Nov 2016 13:37:47 -0500 Subject: ext4: add sanity checking to count_overhead() [ Upstream commit c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 ] The commit "ext4: sanity check the block and cluster size at mount time" should prevent any problems, but in case the superblock is modified while the file system is mounted, add an extra safety check to make sure we won't overrun the allocated buffer. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c34e6a93854b..97aa8be40175 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3361,10 +3361,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; -- cgit v1.2.3 From fee2ddaee7757fd8c62bbf82ce0f66df71a6d4a6 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Sun, 20 Nov 2016 19:57:23 +0100 Subject: f2fs: set ->owner for debugfs status file's file_operations [ Upstream commit 05e6ea2685c964db1e675a24a4f4e2adc22d2388 ] The struct file_operations instance serving the f2fs/status debugfs file lacks an initialization of its ->owner. This means that although that file might have been opened, the f2fs module can still get removed. Any further operation on that opened file, releasing included, will cause accesses to unmapped memory. 
Indeed, Mike Marshall reported the following: BUG: unable to handle kernel paging request at ffffffffa0307430 IP: [] full_proxy_release+0x24/0x90 <...> Call Trace: [] __fput+0xdf/0x1d0 [] ____fput+0xe/0x10 [] task_work_run+0x8e/0xc0 [] do_exit+0x2ae/0xae0 [] ? __audit_syscall_entry+0xae/0x100 [] ? syscall_trace_enter+0x1ca/0x310 [] do_group_exit+0x44/0xc0 [] SyS_exit_group+0x14/0x20 [] do_syscall_64+0x61/0x150 [] entry_SYSCALL64_slow_path+0x25/0x25 <...> ---[ end trace f22ae883fa3ea6b8 ]--- Fixing recursive fault but reboot is needed! Fix this by initializing the f2fs/status file_operations' ->owner with THIS_MODULE. This will allow debugfs to grab a reference to the f2fs module upon any open on that file, thus preventing it from getting removed. Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs") Reported-by: Mike Marshall Reported-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f5388f37217e..bb73d0a0f387 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -339,6 +339,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, -- cgit v1.2.3 From 575b6eddbf739a02bdaea80557cf0e203ef1b889 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Fri, 28 Oct 2016 10:48:26 +0800 Subject: Btrfs: fix deadlock caused by fsync when logging directory entries [ Upstream commit ec125cfb7ae2157af3dd45dd8abe823e3e233eec ] While logging new directory entries, at tree-log.c:log_new_dir_dentries(), after we call btrfs_search_forward() we get a leaf with a read lock on it, and without unlocking that leaf we can end up calling btrfs_iget() to get an inode pointer. The later (btrfs_iget()) can end up doing a read-only search on the same tree again, if the inode is not in memory already, which ends up causing a deadlock if some other task in the meanwhile started a write search on the tree and is attempting to write lock the same leaf that btrfs_search_forward() locked while holding write locks on upper levels of the tree blocking the read search from btrfs_iget(). In this scenario we get a deadlock. So fix this by releasing the search path before calling btrfs_iget() at tree-log.c:log_new_dir_dentries(). Example trace of such deadlock: [ 4077.478852] kworker/u24:10 D ffff88107fc90640 0 14431 2 0x00000000 [ 4077.486752] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] [ 4077.494346] ffff880ffa56bad0 0000000000000046 0000000000009000 ffff880ffa56bfd8 [ 4077.502629] ffff880ffa56bfd8 ffff881016ce21c0 ffffffffa06ecb26 ffff88101a5d6138 [ 4077.510915] ffff880ebb5173b0 ffff880ffa56baf8 ffff880ebb517410 ffff881016ce21c0 [ 4077.519202] Call Trace: [ 4077.528752] [] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs] [ 4077.536049] [] ? wake_up_atomic_t+0x30/0x30 [ 4077.542574] [] ? btrfs_search_slot+0x79f/0xb10 [btrfs] [ 4077.550171] [] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs] [ 4077.558252] [] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs] [ 4077.566140] [] ? add_delayed_data_ref+0xe2/0x150 [btrfs] [ 4077.573928] [] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs] [ 4077.582399] [] ? __set_extent_bit+0x4c0/0x5c0 [btrfs] [ 4077.589896] [] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs] [ 4077.599632] [] ? start_transaction+0x8d/0x470 [btrfs] [ 4077.607134] [] ? 
btrfs_finish_ordered_io+0x2e7/0x600 [btrfs] [ 4077.615329] [] ? process_one_work+0x142/0x3d0 [ 4077.622043] [] ? worker_thread+0x109/0x3b0 [ 4077.628459] [] ? manage_workers.isra.26+0x270/0x270 [ 4077.635759] [] ? kthread+0xaf/0xc0 [ 4077.641404] [] ? kthread_create_on_node+0x110/0x110 [ 4077.648696] [] ? ret_from_fork+0x58/0x90 [ 4077.654926] [] ? kthread_create_on_node+0x110/0x110 [ 4078.358087] kworker/u24:15 D ffff88107fcd0640 0 14436 2 0x00000000 [ 4078.365981] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] [ 4078.373574] ffff880ffa57fad0 0000000000000046 0000000000009000 ffff880ffa57ffd8 [ 4078.381864] ffff880ffa57ffd8 ffff88103004d0a0 ffffffffa06ecb26 ffff88101a5d6138 [ 4078.390163] ffff880fbeffc298 ffff880ffa57faf8 ffff880fbeffc2f8 ffff88103004d0a0 [ 4078.398466] Call Trace: [ 4078.408019] [] ? btrfs_tree_lock+0xdd/0x2f0 [btrfs] [ 4078.415322] [] ? wake_up_atomic_t+0x30/0x30 [ 4078.421844] [] ? btrfs_search_slot+0x79f/0xb10 [btrfs] [ 4078.429438] [] ? btrfs_lookup_file_extent+0x33/0x40 [btrfs] [ 4078.437518] [] ? __btrfs_drop_extents+0x13b/0xdf0 [btrfs] [ 4078.445404] [] ? add_delayed_data_ref+0xe2/0x150 [btrfs] [ 4078.453194] [] ? btrfs_add_delayed_data_ref+0x149/0x1d0 [btrfs] [ 4078.461663] [] ? __set_extent_bit+0x4c0/0x5c0 [btrfs] [ 4078.469161] [] ? insert_reserved_file_extent.constprop.75+0xa4/0x320 [btrfs] [ 4078.478893] [] ? start_transaction+0x8d/0x470 [btrfs] [ 4078.486388] [] ? btrfs_finish_ordered_io+0x2e7/0x600 [btrfs] [ 4078.494561] [] ? process_one_work+0x142/0x3d0 [ 4078.501278] [] ? pwq_activate_delayed_work+0x27/0x40 [ 4078.508673] [] ? worker_thread+0x109/0x3b0 [ 4078.515098] [] ? manage_workers.isra.26+0x270/0x270 [ 4078.522396] [] ? kthread+0xaf/0xc0 [ 4078.528032] [] ? kthread_create_on_node+0x110/0x110 [ 4078.535325] [] ? ret_from_fork+0x58/0x90 [ 4078.541552] [] ? kthread_create_on_node+0x110/0x110 [ 4079.355824] user-space-program D ffff88107fd30640 0 32020 1 0x00000000 [ 4079.363716] ffff880eae8eba10 0000000000000086 0000000000009000 ffff880eae8ebfd8 [ 4079.372003] ffff880eae8ebfd8 ffff881016c162c0 ffffffffa06ecb26 ffff88101a5d6138 [ 4079.380294] ffff880fbed4b4c8 ffff880eae8eba38 ffff880fbed4b528 ffff881016c162c0 [ 4079.388586] Call Trace: [ 4079.398134] [] ? btrfs_tree_lock+0x85/0x2f0 [btrfs] [ 4079.405431] [] ? wake_up_atomic_t+0x30/0x30 [ 4079.411955] [] ? btrfs_lock_root_node+0x2b/0x40 [btrfs] [ 4079.419644] [] ? btrfs_search_slot+0xa03/0xb10 [btrfs] [ 4079.427237] [] ? btrfs_buffer_uptodate+0x52/0x70 [btrfs] [ 4079.435041] [] ? generic_bin_search.constprop.38+0x80/0x190 [btrfs] [ 4079.443897] [] ? btrfs_insert_empty_items+0x74/0xd0 [btrfs] [ 4079.451975] [] ? copy_items+0x128/0x850 [btrfs] [ 4079.458890] [] ? btrfs_log_inode+0x629/0xbf3 [btrfs] [ 4079.466292] [] ? btrfs_log_inode_parent+0xc61/0xf30 [btrfs] [ 4079.474373] [] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs] [ 4079.482161] [] ? btrfs_sync_file+0x20d/0x330 [btrfs] [ 4079.489558] [] ? do_fsync+0x4c/0x80 [ 4079.495300] [] ? SyS_fdatasync+0xa/0x10 [ 4079.501422] [] ? system_call_fastpath+0x16/0x1b [ 4079.508334] user-space-program D ffff88107fc30640 0 32021 1 0x00000004 [ 4079.516226] ffff880eae8efbf8 0000000000000086 0000000000009000 ffff880eae8effd8 [ 4079.524513] ffff880eae8effd8 ffff881030279610 ffffffffa06ecb26 ffff88101a5d6138 [ 4079.532802] ffff880ebb671d88 ffff880eae8efc20 ffff880ebb671de8 ffff881030279610 [ 4079.541092] Call Trace: [ 4079.550642] [] ? btrfs_tree_lock+0x85/0x2f0 [btrfs] [ 4079.557941] [] ? wake_up_atomic_t+0x30/0x30 [ 4079.564463] [] ? 
btrfs_search_slot+0x79f/0xb10 [btrfs] [ 4079.572058] [] ? btrfs_truncate_inode_items+0x168/0xb90 [btrfs] [ 4079.580526] [] ? join_transaction.isra.15+0x1e/0x3a0 [btrfs] [ 4079.588701] [] ? start_transaction+0x8d/0x470 [btrfs] [ 4079.596196] [] ? block_rsv_add_bytes+0x16/0x50 [btrfs] [ 4079.603789] [] ? btrfs_truncate+0xe9/0x2e0 [btrfs] [ 4079.610994] [] ? btrfs_setattr+0x30b/0x410 [btrfs] [ 4079.618197] [] ? notify_change+0x1dc/0x680 [ 4079.624625] [] ? aa_path_perm+0xd4/0x160 [ 4079.630854] [] ? do_truncate+0x5b/0x90 [ 4079.636889] [] ? do_sys_ftruncate.constprop.15+0x10a/0x160 [ 4079.644869] [] ? SyS_fcntl+0x5b/0x570 [ 4079.650805] [] ? system_call_fastpath+0x16/0x1b [ 4080.410607] user-space-program D ffff88107fc70640 0 32028 12639 0x00000004 [ 4080.418489] ffff880eaeccbbe0 0000000000000086 0000000000009000 ffff880eaeccbfd8 [ 4080.426778] ffff880eaeccbfd8 ffff880f317ef1e0 ffffffffa06ecb26 ffff88101a5d6138 [ 4080.435067] ffff880ef7e93928 ffff880f317ef1e0 ffff880eaeccbc08 ffff880f317ef1e0 [ 4080.443353] Call Trace: [ 4080.452920] [] ? btrfs_tree_read_lock+0xdd/0x190 [btrfs] [ 4080.460703] [] ? wake_up_atomic_t+0x30/0x30 [ 4080.467225] [] ? btrfs_read_lock_root_node+0x2b/0x40 [btrfs] [ 4080.475400] [] ? btrfs_search_slot+0x801/0xb10 [btrfs] [ 4080.482994] [] ? btrfs_clean_one_deleted_snapshot+0xe0/0xe0 [btrfs] [ 4080.491857] [] ? btrfs_lookup_inode+0x26/0x90 [btrfs] [ 4080.499353] [] ? kmem_cache_alloc+0xaf/0xc0 [ 4080.505879] [] ? btrfs_iget+0xd5/0x5d0 [btrfs] [ 4080.512696] [] ? btrfs_get_token_64+0x104/0x120 [btrfs] [ 4080.520387] [] ? btrfs_log_inode_parent+0xbdf/0xf30 [btrfs] [ 4080.528469] [] ? btrfs_log_dentry_safe+0x59/0x80 [btrfs] [ 4080.536258] [] ? btrfs_sync_file+0x20d/0x330 [btrfs] [ 4080.543657] [] ? do_fsync+0x4c/0x80 [ 4080.549399] [] ? SyS_fdatasync+0xa/0x10 [ 4080.555534] [] ? system_call_fastpath+0x16/0x1b Signed-off-by: Robbie Ko Reviewed-by: Filipe Manana Fixes: 2f2ff0ee5e43 (Btrfs: fix metadata inconsistencies after directory fsync) Cc: stable@vger.kernel.org # 4.1+ Signed-off-by: Filipe Manana [Modified changelog for clarity and correctness] Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 32ecb95f6214..3b3db0166777 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4984,6 +4984,7 @@ process_leaf: if (di_key.type == BTRFS_ROOT_ITEM_KEY) continue; + btrfs_release_path(path); di_inode = btrfs_iget(root->fs_info->sb, &di_key, root, NULL); if (IS_ERR(di_inode)) { @@ -4993,13 +4994,12 @@ process_leaf: if (btrfs_inode_in_log(di_inode, trans->transid)) { iput(di_inode); - continue; + break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, log_mode, 0, LLONG_MAX, ctx); iput(di_inode); -- cgit v1.2.3 From 90604ed2f10c9453814b8ca5a5b50243b5bf7c10 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Fri, 7 Oct 2016 17:30:47 +0800 Subject: Btrfs: fix tree search logic when replaying directory entry deletes [ Upstream commit 2a7bf53f577e49c43de4ffa7776056de26db65d9 ] If a log tree has a layout like the following: leaf N: ... item 240 key (282 DIR_LOG_ITEM 0) itemoff 8189 itemsize 8 dir log end 1275809046 leaf N + 1: item 0 key (282 DIR_LOG_ITEM 3936149215) itemoff 16275 itemsize 8 dir log end 18446744073709551615 ... 
When we pass the value 1275809046 + 1 as the parameter start_ret to the function tree-log.c:find_dir_range() (done by replay_dir_deletes()), we end up with path->slots[0] having the value 239 (points to the last item of leaf N, item 240). Because the dir log item in that position has an offset value smaller than *start_ret (1275809046 + 1) we need to move on to the next leaf, however the logic for that is wrong since it compares the current slot to the number of items in the leaf, which is smaller and therefore we don't lookup for the next leaf but instead we set the slot to point to an item that does not exist, at slot 240, and we later operate on that slot which has unexpected content or in the worst case can result in an invalid memory access (accessing beyond the last page of leaf N's extent buffer). So fix the logic that checks when we need to lookup at the next leaf by first incrementing the slot and only after to check if that slot is beyond the last item of the current leaf. Signed-off-by: Robbie Ko Reviewed-by: Filipe Manana Fixes: e02119d5a7b4 (Btrfs: Add a write ahead tree log to optimize synchronous operations) Cc: stable@vger.kernel.org # 2.6.29+ Signed-off-by: Filipe Manana [Modified changelog for clarity and correctness] Signed-off-by: Sasha Levin --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3b3db0166777..6ee954c62fe6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1821,12 +1821,11 @@ static noinline int find_dir_range(struct btrfs_root *root, next: /* check the next slot in the tree to see if it is a valid item */ nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) goto out; - } else { - path->slots[0]++; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); -- cgit v1.2.3 From 73e7d7aef06c4f590c539844782d3720c7958efa Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 1 Dec 2016 09:18:28 +0100 Subject: block: protect iterate_bdevs() against concurrent close [ Upstream commit af309226db916e2c6e08d3eba3fa5c34225200c4 ] If a block device is closed while iterate_bdevs() is handling it, the following NULL pointer dereference occurs because bdev->b_disk is NULL in bdev_get_queue(), which is called from blk_get_backing_dev_info() (in turn called by the mapping_cap_writeback_dirty() call in __filemap_fdatawrite_range()): BUG: unable to handle kernel NULL pointer dereference at 0000000000000508 IP: [] blk_get_backing_dev_info+0x10/0x20 PGD 9e62067 PUD 9ee8067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 1 PID: 2422 Comm: sync Not tainted 4.5.0-rc7+ #400 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) task: ffff880009f4d700 ti: ffff880009f5c000 task.ti: ffff880009f5c000 RIP: 0010:[] [] blk_get_backing_dev_info+0x10/0x20 RSP: 0018:ffff880009f5fe68 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88000ec17a38 RCX: ffffffff81a4e940 RDX: 7fffffffffffffff RSI: 0000000000000000 RDI: ffff88000ec176c0 RBP: ffff880009f5fe68 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88000ec17860 R13: ffffffff811b25c0 R14: ffff88000ec178e0 R15: ffff88000ec17a38 FS: 00007faee505d700(0000) GS:ffff88000fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000508 CR3: 0000000009e8a000 CR4: 00000000000006e0 Stack: ffff880009f5feb8 ffffffff8112e7f5 
0000000000000000 7fffffffffffffff 0000000000000000 0000000000000000 7fffffffffffffff 0000000000000001 ffff88000ec178e0 ffff88000ec17860 ffff880009f5fec8 ffffffff8112e81f Call Trace: [] __filemap_fdatawrite_range+0x85/0x90 [] filemap_fdatawrite+0x1f/0x30 [] fdatawrite_one_bdev+0x16/0x20 [] iterate_bdevs+0xf2/0x130 [] sys_sync+0x63/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x76 Code: 0f 1f 44 00 00 48 8b 87 f0 00 00 00 55 48 89 e5 <48> 8b 80 08 05 00 00 5d RIP [] blk_get_backing_dev_info+0x10/0x20 RSP CR2: 0000000000000508 ---[ end trace 2487336ceb3de62d ]--- The crash is easily reproducible by running the following command, if an msleep(100) is inserted before the call to func() in iterate_devs(): while :; do head -c1 /dev/nullb0; done > /dev/null & while :; do sync; done Fix it by holding the bd_mutex across the func() call and only calling func() if the bdev is opened. Cc: stable@vger.kernel.org Fixes: 5c0d6b60a0ba ("vfs: Create function for iterating over block devices") Reported-and-tested-by: Wei Fang Signed-off-by: Rabin Vincent Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/block_dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index ccfd31f1df3a..67a2acac9752 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1772,6 +1772,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || @@ -1792,8 +1793,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) */ iput(old_inode); old_inode = inode; + bdev = I_BDEV(inode); - func(I_BDEV(inode), arg); + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_mutex); spin_lock(&inode_sb_list_lock); } -- cgit v1.2.3 From 93f5b2b5d28b7610d5255475c4e22ff90b6cc9c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:06 +1100 Subject: xfs: set AGI buffer type in xlog_recover_clear_agi_bucket [ Upstream commit 6b10b23ca94451fae153a5cc8d62fd721bec2019 ] xlog_recover_clear_agi_bucket didn't set the type to XFS_BLFT_AGI_BUF, so we got a warning during log replay (or an ASSERT on a debug build). XFS (md0): Unknown buffer type 0! XFS (md0): _xfs_buf_ioapply: no ops on block 0xaea8802/0x1 Fix this, as was done in f19b872b for 2 other locations with the same problem. 
cc: # 3.10 to current Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Sasha Levin --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1114afdd5a6b..4aefff89949d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3864,6 +3864,7 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); -- cgit v1.2.3 From 7f16769173dfa6c3f616602d633452eda4b0c1fc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 29 Nov 2016 11:30:58 -0800 Subject: CIFS: Fix missing nls unload in smb2_reconnect() [ Upstream commit 4772c79599564bd08ee6682715a7d3516f67433f ] Cc: Stable Acked-by: Sachin Prabhu Signed-off-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8f527c867f78..0582b178f64c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -264,7 +264,7 @@ out: case SMB2_CHANGE_NOTIFY: case SMB2_QUERY_INFO: case SMB2_SET_INFO: - return -EAGAIN; + rc = -EAGAIN; } unload_nls(nls_codepage); return rc; -- cgit v1.2.3 From 51fce08b64ec4d8cb2cc91eb08a84caf29dcd272 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 29 Nov 2016 16:14:43 -0800 Subject: CIFS: Fix a possible memory corruption in push locks [ Upstream commit e3d240e9d505fc67f8f8735836df97a794bbd946 ] If maxBuf is not 0 but less than a size of SMB2 lock structure we can end up with a memory corruption. Cc: Stable Signed-off-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/smb2file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 2ab297dae5a7..1bdfd7c5309c 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -241,7 +241,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) * and check it for zero before using. */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < sizeof(struct smb2_lock_element)) { free_xid(xid); return -EINVAL; } -- cgit v1.2.3 From 15a12fbbf365a483b1c19f9caeb707b3bea77e10 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 4 Nov 2016 11:50:31 -0700 Subject: CIFS: Fix a possible memory corruption during reconnect [ Upstream commit 53e0e11efe9289535b060a51d4cf37c25e0d0f2b ] We can not unlock/lock cifs_tcp_ses_lock while walking through ses and tcon lists because it can corrupt list iterator pointers and a tcon structure can be released if we don't hold an extra reference. Fix it by moving a reconnect process to a separate delayed work and acquiring a reference to every tcon that needs to be reconnected. Also do not send an echo request on newly established connections. 
CC: Stable Signed-off-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/cifsglob.h | 3 +++ fs/cifs/cifsproto.h | 3 +++ fs/cifs/connect.c | 34 +++++++++++++++++++----- fs/cifs/smb2pdu.c | 75 ++++++++++++++++++++++++++++++++++++----------------- fs/cifs/smb2proto.h | 1 + 5 files changed, 85 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 22b289a3b1c4..9d0e4fef8ee1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -615,6 +615,8 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + struct delayed_work reconnect; /* reconnect workqueue job */ + struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ #endif /* CONFIG_CIFS_SMB2 */ }; @@ -814,6 +816,7 @@ cap_unix(struct cifs_ses *ses) struct cifs_tcon { struct list_head tcon_list; int tc_count; + struct list_head rlist; /* reconnect list */ struct list_head openFileList; struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f730c065df34..5ee60b50d8eb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -207,6 +207,9 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern void cifs_put_tcp_session(struct TCP_Server_Info *server, + int from_reconnect); +extern void cifs_put_tcon(struct cifs_tcon *tcon); #if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index cd9d50e4f5f4..7d7bd466520b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -52,6 +52,9 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "fscache.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -2069,8 +2072,8 @@ cifs_find_tcp_session(struct smb_vol *vol) return NULL; } -static void -cifs_put_tcp_session(struct TCP_Server_Info *server) +void +cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) { struct task_struct *task; @@ -2087,6 +2090,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); +#ifdef CONFIG_CIFS_SMB2 + if (from_reconnect) + /* + * Avoid deadlock here: reconnect work calls + * cifs_put_tcp_session() at its end. Need to be sure + * that reconnect work does nothing with server pointer after + * that step. 
+ */ + cancel_delayed_work(&server->reconnect); + else + cancel_delayed_work_sync(&server->reconnect); +#endif + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -2151,6 +2167,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); +#ifdef CONFIG_CIFS_SMB2 + INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); + mutex_init(&tcp_ses->reconnect_mutex); +#endif memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, @@ -2303,7 +2323,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); sesInfoFree(ses); - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); } #ifdef CONFIG_KEYS @@ -2476,7 +2496,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) mutex_unlock(&ses->session_mutex); /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); free_xid(xid); return ses; } @@ -2566,7 +2586,7 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) return NULL; } -static void +void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; @@ -3673,7 +3693,7 @@ mount_fail_check: else if (ses) cifs_put_smb_ses(ses); else - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); bdi_destroy(&cifs_sb->bdi); } @@ -3984,7 +4004,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); if (IS_ERR(ses)) { tcon = (struct cifs_tcon *)ses; - cifs_put_tcp_session(master_tcon->ses->server); + cifs_put_tcp_session(master_tcon->ses->server, 0); goto out; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0582b178f64c..e3cf4a5fb35a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1615,6 +1615,54 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, credits_received, CIFS_ECHO_OP); } +void smb2_reconnect_server(struct work_struct *work) +{ + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, reconnect.work); + struct cifs_ses *ses; + struct cifs_tcon *tcon, *tcon2; + struct list_head tmp_list; + int tcon_exist = false; + + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ + mutex_lock(&server->reconnect_mutex); + + INIT_LIST_HEAD(&tmp_list); + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->need_reconnect) { + tcon->tc_count++; + list_add_tail(&tcon->rlist, &tmp_list); + tcon_exist = true; + } + } + } + /* + * Get the reference to server struct to be sure that the last call of + * cifs_put_tcon() in the loop below won't release the server pointer. 
+ */ + if (tcon_exist) + server->srv_count++; + + spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { + smb2_reconnect(SMB2_ECHO, tcon); + list_del_init(&tcon->rlist); + cifs_put_tcon(tcon); + } + + cifs_dbg(FYI, "Reconnecting tcons finished\n"); + mutex_unlock(&server->reconnect_mutex); + + /* now we can safely release srv struct */ + if (tcon_exist) + cifs_put_tcp_session(server, 1); +} + int SMB2_echo(struct TCP_Server_Info *server) { @@ -1627,32 +1675,11 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); if (server->tcpStatus == CifsNeedNegotiate) { - struct list_head *tmp, *tmp2; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp2, &ses->tcon_list) { - tcon = list_entry(tmp2, struct cifs_tcon, - tcon_list); - /* add check for persistent handle reconnect */ - if (tcon && tcon->need_reconnect) { - spin_unlock(&cifs_tcp_ses_lock); - rc = smb2_reconnect(SMB2_ECHO, tcon); - spin_lock(&cifs_tcp_ses_lock); - } - } - } - spin_unlock(&cifs_tcp_ses_lock); + /* No need to send echo on newly established connections */ + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + return rc; } - /* if no session, renegotiate failed above */ - if (server->tcpStatus == CifsNeedNegotiate) - return -EIO; - rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 9bc59f9c12fb..0a406ae78129 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -95,6 +95,7 @@ extern int smb2_open_file(const unsigned int xid, extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void smb2_reconnect_server(struct work_struct *work); /* * SMB2 Worker functions - most of protocol specific implementation details -- cgit v1.2.3 From 576cfe615febe0eccc95f7884ef998d15024b025 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2016 21:42:32 -0400 Subject: nfs_write_end(): fix handling of short copies [ Upstream commit c0cf3ef5e0f47e385920450b245d22bead93e7ad ] What matters when deciding if we should make a page uptodate is not how much we _wanted_ to copy, but how much we actually have copied. As it is, on architectures that do not zero tail on short copy we can leave uninitialized data in page marked uptodate. Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8b8d83a526ce..ddf5f9fd719f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -414,7 +414,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, -- cgit v1.2.3 From 14927595fec915a90a050a1f43c20ca6a4115785 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 10 Dec 2016 09:56:01 -0500 Subject: ext4: return -ENOMEM instead of success [ Upstream commit 578620f451f836389424833f1454eeeb2ffc9e9f ] We should set the error code if kzalloc() fails. 
Fixes: 67cf5b09a46f ("ext4: add the basic function for inline data support") Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- fs/ext4/inline.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 095c7a258d97..d77d542c2ed5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -336,8 +336,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); -- cgit v1.2.3 From 25e9e23674573b1e52c4b7e663af0ac6716ffaff Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 12 Dec 2016 14:32:44 -0800 Subject: btrfs: limit async_work allocation and worker func duration [ Upstream commit 2939e1a86f758b55cdba73e29397dd3d94df13bc ] Problem statement: unprivileged user who has read-write access to more than one btrfs subvolume may easily consume all kernel memory (eventually triggering oom-killer). Reproducer (./mkrmdir below essentially loops over mkdir/rmdir): [root@kteam1 ~]# cat prep.sh DEV=/dev/sdb mkfs.btrfs -f $DEV mount $DEV /mnt for i in `seq 1 16` do mkdir /mnt/$i btrfs subvolume create /mnt/SV_$i ID=`btrfs subvolume list /mnt |grep "SV_$i$" |cut -d ' ' -f 2` mount -t btrfs -o subvolid=$ID $DEV /mnt/$i chmod a+rwx /mnt/$i done [root@kteam1 ~]# sh prep.sh [maxim@kteam1 ~]$ for i in `seq 1 16`; do ./mkrmdir /mnt/$i 2000 2000 & done [root@kteam1 ~]# for i in `seq 1 4`; do grep "kmalloc-128" /proc/slabinfo | grep -v dma; sleep 60; done kmalloc-128 10144 10144 128 32 1 : tunables 0 0 0 : slabdata 317 317 0 kmalloc-128 9992352 9992352 128 32 1 : tunables 0 0 0 : slabdata 312261 312261 0 kmalloc-128 24226752 24226752 128 32 1 : tunables 0 0 0 : slabdata 757086 757086 0 kmalloc-128 42754240 42754240 128 32 1 : tunables 0 0 0 : slabdata 1336070 1336070 0 The huge numbers above come from insane number of async_work-s allocated and queued by btrfs_wq_run_delayed_node. The problem is caused by btrfs_wq_run_delayed_node() queuing more and more works if the number of delayed items is above BTRFS_DELAYED_BACKGROUND. The worker func (btrfs_async_run_delayed_root) processes at least BTRFS_DELAYED_BATCH items (if they are present in the list). So, the machinery works as expected while the list is almost empty. As soon as it is getting bigger, worker func starts to process more than one item at a time, it takes longer, and the chances to have async_works queued more than needed is getting higher. The problem above is worsened by another flaw of delayed-inode implementation: if async_work was queued in a throttling branch (number of items >= BTRFS_DELAYED_WRITEBACK), corresponding worker func won't quit until the number of items < BTRFS_DELAYED_BACKGROUND / 2. So, it is possible that the func occupies CPU infinitely (up to 30sec in my experiments): while the func is trying to drain the list, the user activity may add more and more items to the list. The patch fixes both problems in straightforward way: refuse queuing too many works in btrfs_wq_run_delayed_node and bail out of worker func if at least BTRFS_DELAYED_WRITEBACK items are processed. Changed in v2: remove support of thresh == NO_THRESHOLD. 
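The ./mkrmdir helper referenced in the reproducer is not included in the commit message; a minimal stand-in, assuming its two numeric arguments are the number of directories per pass and the number of passes, could look like this (illustrative only):

/* Hypothetical stand-in for the ./mkrmdir helper referenced above; the
 * original source is not part of the commit message.  Assumption: argv[1]
 * is the target directory, argv[2] the number of subdirectories per pass,
 * argv[3] the number of passes.  It simply hammers mkdir()/rmdir() to keep
 * the delayed-inode lists busy. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[4096];
	long dirs, passes, i, j;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <dir> <dirs-per-pass> <passes>\n",
			argv[0]);
		return 1;
	}
	dirs = atol(argv[2]);
	passes = atol(argv[3]);

	for (i = 0; i < passes; i++) {
		for (j = 0; j < dirs; j++) {
			snprintf(path, sizeof(path), "%s/d%ld", argv[1], j);
			mkdir(path, 0755);
			rmdir(path);
		}
	}
	return 0;
}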
Signed-off-by: Maxim Patlasov Signed-off-by: Chris Mason Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Sasha Levin --- fs/btrfs/async-thread.c | 14 ++++++++++++++ fs/btrfs/async-thread.h | 1 + fs/btrfs/delayed-inode.c | 6 ++++-- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1848705506ff..0ce4de6430ef 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,20 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) +{ + /* + * We could compare wq->normal->pending with num_online_cpus() + * to support "thresh == NO_THRESHOLD" case, but it requires + * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's + * postpone it until someone needs the support of that case. + */ + if (wq->normal->thresh == NO_THRESHOLD) + return false; + + return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ec2ee477f8ba..8c4564204f25 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -78,4 +78,5 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bc2d048a9eb9..8265b0754dca 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1371,7 +1371,8 @@ release_path: total_done++; btrfs_release_prepared_delayed_node(delayed_node); - if (async_work->nr == 0 || total_done < async_work->nr) + if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || + total_done < async_work->nr) goto again; free_path: @@ -1387,7 +1388,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return 0; async_work = kmalloc(sizeof(*async_work), GFP_NOFS); -- cgit v1.2.3 From 5eba61298b748b9f948a1de1055e84e620299d81 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 12 Dec 2016 08:21:51 -0700 Subject: block_dev: don't test bdev->bd_contains when it is not stable [ Upstream commit bcc7f5b4bee8e327689a4d994022765855c807ff ] bdev->bd_contains is not stable before calling __blkdev_get(). When __blkdev_get() is called on a parition with ->bd_openers == 0 it sets bdev->bd_contains = bdev; which is not correct for a partition. After a call to __blkdev_get() succeeds, ->bd_openers will be > 0 and then ->bd_contains is stable. When FMODE_EXCL is used, blkdev_get() calls bd_start_claiming() -> bd_prepare_to_claim() -> bd_may_claim() This call happens before __blkdev_get() is called, so ->bd_contains is not stable. So bd_may_claim() cannot safely use ->bd_contains. It currently tries to use it, and this can lead to a BUG_ON(). 
This happens when a whole device is already open with a bd_holder (in use by dm in my particular example) and two threads race to open a partition of that device for the first time, one opening with O_EXCL and one without. The thread that doesn't use O_EXCL gets through blkdev_get() to __blkdev_get(), gains the ->bd_mutex, and sets bdev->bd_contains = bdev; Immediately thereafter the other thread, using FMODE_EXCL, calls bd_start_claiming() from blkdev_get(). This should fail because the whole device has a holder, but because bdev->bd_contains == bdev bd_may_claim() incorrectly reports success. This thread continues and blocks on bd_mutex. The first thread then sets bdev->bd_contains correctly and drops the mutex. The thread using FMODE_EXCL then continues and when it calls bd_may_claim() again in: BUG_ON(!bd_may_claim(bdev, whole, holder)); The BUG_ON fires. Fix this by removing the dependency on ->bd_contains in bd_may_claim(). As bd_may_claim() has direct access to the whole device, it can simply test if the target bdev is the whole device. Fixes: 6b4517a7913a ("block: implement bd_claiming and claiming block") Cc: stable@vger.kernel.org (v2.6.35+) Signed-off-by: NeilBrown Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 67a2acac9752..300566ea9b74 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -729,7 +729,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ - else if (bdev->bd_contains == bdev) + else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) -- cgit v1.2.3 From 363f1a90b7f3057efa6e08bf7b707874232f153c Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 21 Dec 2016 16:26:24 +1100 Subject: fs: exec: apply CLOEXEC before changing dumpable task flags [ Upstream commit 613cc2b6f272c1a8ad33aefa21cad77af23139f7 ] If you have a process that has set itself to be non-dumpable, and it then undergoes exec(2), any CLOEXEC file descriptors it has open are "exposed" during a race window between the dumpable flags of the process being reset for exec(2) and CLOEXEC being applied to the file descriptors. This can be exploited by a process by attempting to access /proc//fd/... during this window, without requiring CAP_SYS_PTRACE. The race in question is after set_dumpable has been (for get_link, though the trace is basically the same for readlink): [vfs] -> proc_pid_link_inode_operations.get_link -> proc_pid_get_link -> proc_fd_access_allowed -> ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); Which will return 0, during the race window and CLOEXEC file descriptors will still be open during this window because do_close_on_exec has not been called yet. As a result, the ordering of these calls should be reversed to avoid this race window. This is of particular concern to container runtimes, where joining a PID namespace with file descriptors referring to the host filesystem can result in security issues (since PRCTL_SET_DUMPABLE doesn't protect against access of CLOEXEC file descriptors -- file descriptors which may reference filesystem objects the container shouldn't have access to). 
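As an illustration of the window described above (not part of the patch), an unprivileged sibling process only needs to poll the target's /proc/<pid>/fd entries around the exec(2); the pid and descriptor number below are placeholders:

#include <limits.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	char link[64], target[PATH_MAX];
	ssize_t n;

	/* Placeholder pid/fd of the non-dumpable process calling exec(2). */
	snprintf(link, sizeof(link), "/proc/%d/fd/%d", 1234, 3);

	/* Spin until the race window is hit: on an unfixed kernel the
	 * readlink() briefly succeeds after the dumpable flag has been
	 * reset but before the O_CLOEXEC descriptors are closed. */
	for (;;) {
		n = readlink(link, target, sizeof(target) - 1);
		if (n > 0) {
			target[n] = '\0';
			printf("window hit: fd 3 -> %s\n", target);
			break;
		}
	}
	return 0;
}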
Cc: dev@opencontainers.org Cc: # v3.2+ Reported-by: Michael Crosby Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/exec.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1977c2a553ac..d392c8ad0de0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -1108,6 +1108,13 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); current->personality &= ~bprm->per_clear; + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). + */ + do_close_on_exec(current->files); return 0; out: @@ -1157,7 +1164,6 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; flush_signal_handlers(current, 0); - do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); -- cgit v1.2.3