From 867e1162068eb5632c829d453fd65d6089564f55 Mon Sep 17 00:00:00 2001 From: Emil Goode Date: Thu, 9 May 2013 22:39:26 +0200 Subject: bcache: Fix incompatible pointer type warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function pointer release in struct block_device_operations should point to functions declared as void. Sparse warnings: drivers/md/bcache/super.c:656:27: warning: incorrect type in initializer (different base types) drivers/md/bcache/super.c:656:27: expected void ( *release )( ... ) drivers/md/bcache/super.c:656:27: got int ( static [toplevel] * )( ... ) drivers/md/bcache/super.c:656:2: warning: initialization from incompatible pointer type [enabled by default] drivers/md/bcache/super.c:656:2: warning: (near initialization for ‘bcache_ops.release’) [enabled by default] Signed-off-by: Emil Goode Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c8046bc4aa5..b09beb2b52c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -634,11 +634,10 @@ static int open_dev(struct block_device *b, fmode_t mode) return 0; } -static int release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b, fmode_t mode) { struct bcache_device *d = b->private_data; closure_put(&d->cl); - return 0; } static int ioctl_dev(struct block_device *b, fmode_t mode, -- cgit v1.2.3 From bbb1c3b5ae6c6286a5e018786be88f126d769543 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 13 May 2013 10:35:21 +0200 Subject: bcache: drop "select CLOSURES" The Kconfig entry for BCACHE selects CLOSURES. But there's no Kconfig symbol CLOSURES. That symbol was used in development versions of bcache, but was removed when the closures code was no longer provided as a kernel library. It can safely be dropped. Signed-off-by: Paul Bolle --- drivers/md/bcache/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 05c220d05e2..f950c9d29f3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,6 @@ config BCACHE tristate "Block device as cache" - select CLOSURES ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. -- cgit v1.2.3 From f59fce847fc8483508b5028c24e2b1e00523dd88 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2013 00:11:26 -0700 Subject: bcache: Fix error handling in init code This code appears to have rotted... fix various bugs and do some refactoring. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/stats.c | 34 ++++---- drivers/md/bcache/super.c | 182 +++++++++++++++++++----------------------- drivers/md/bcache/writeback.c | 2 +- 4 files changed, 99 insertions(+), 121 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 340146d7c17..d3e15b42a4a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1241,7 +1241,7 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_writeback_init_cached_dev(struct cached_dev *); +void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); void bch_cache_allocator_exit(struct cache *ca); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 64e679449c2..b8730e714d6 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -93,24 +93,6 @@ static struct attribute *bch_stats_files[] = { }; static KTYPE(bch_stats); -static void scale_accounting(unsigned long data); - -void bch_cache_accounting_init(struct cache_accounting *acc, - struct closure *parent) -{ - kobject_init(&acc->total.kobj, &bch_stats_ktype); - kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); - kobject_init(&acc->hour.kobj, &bch_stats_ktype); - kobject_init(&acc->day.kobj, &bch_stats_ktype); - - closure_init(&acc->cl, parent); - init_timer(&acc->timer); - acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; - add_timer(&acc->timer); -} - int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, struct kobject *parent) { @@ -244,3 +226,19 @@ void bch_mark_sectors_bypassed(struct search *s, int sectors) atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); } + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b09beb2b52c..f88e2b653a3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -731,8 +731,7 @@ static void bcache_device_free(struct bcache_device *d) if (d->c) bcache_device_detach(d); - - if (d->disk) + if (d->disk && d->disk->flags & GENHD_FL_UP) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); @@ -755,12 +754,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || - bio_split_pool_init(&d->bio_split_hook)) - - return -ENOMEM; - - d->disk = alloc_disk(1); - if (!d->disk) + bio_split_pool_init(&d->bio_split_hook) || + !(d->disk = alloc_disk(1)) || + !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); @@ -770,10 +766,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) - return -ENOMEM; - blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; @@ -998,14 +990,17 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); list_del(&dc->list); mutex_unlock(&bch_register_lock); if (!IS_ERR_OR_NULL(dc->bdev)) { - blk_sync_queue(bdev_get_queue(dc->bdev)); + if (dc->bdev->bd_disk) + blk_sync_queue(bdev_get_queue(dc->bdev)); + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } @@ -1027,73 +1022,67 @@ static void cached_dev_flush(struct closure *cl) static int cached_dev_init(struct cached_dev *dc, unsigned block_size) { - int err; + int ret; struct io *io; - - closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + struct request_queue *q = bdev_get_queue(dc->bdev); __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); - - bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - - err = bcache_device_init(&dc->disk, block_size); - if (err) - goto err; - - spin_lock_init(&dc->io_lock); - closure_init_unlocked(&dc->sb_write); INIT_WORK(&dc->detach, cached_dev_detach_finish); + closure_init_unlocked(&dc->sb_write); + INIT_LIST_HEAD(&dc->io_lru); + spin_lock_init(&dc->io_lock); + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; - INIT_LIST_HEAD(&dc->io_lru); - dc->sb_bio.bi_max_vecs = 1; - dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; - for (io = dc->io; io < dc->io + RECENT_IO; io++) { list_add(&io->lru, &dc->io_lru); hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - bch_writeback_init_cached_dev(dc); + ret = bcache_device_init(&dc->disk, block_size); + if (ret) + return ret; + + set_capacity(dc->disk.disk, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + dc->disk.disk->queue->backing_dev_info.ra_pages = + max(dc->disk.disk->queue->backing_dev_info.ra_pages, + q->backing_dev_info.ra_pages); + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); return 0; -err: - bcache_device_stop(&dc->disk); - return err; } /* Cached device - bcache superblock */ -static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, +static void register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - struct gendisk *g; struct cache_set *c; - if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) - return err; - memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->sb_bio.bi_io_vec[0].bv_page = sb_page; dc->bdev = bdev; dc->bdev->bd_holder = dc; - g = dc->disk.disk; - - set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - - g->queue->backing_dev_info.ra_pages = - max(g->queue->backing_dev_info.ra_pages, - bdev->bd_queue->backing_dev_info.ra_pages); + bio_init(&dc->sb_bio); + dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); - bch_cached_dev_request_init(dc); + if (cached_dev_init(dc, sb->block_size << 9)) + goto err; err = "error creating kobject"; if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, @@ -1102,6 +1091,8 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; + pr_info("registered backing device %s", bdevname(bdev, name)); + list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c); @@ -1110,15 +1101,10 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return NULL; + return; err: - kobject_put(&dc->disk.kobj); pr_notice("error opening %s: %s", bdevname(bdev, name), err); - /* - * Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; + bcache_device_stop(&dc->disk); } /* Flash only volumes */ @@ -1716,20 +1702,11 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) size_t free; struct bucket *b; - if (!ca) - return -ENOMEM; - __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->sb_bio); - ca->sb_bio.bi_max_vecs = 1; - ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; @@ -1741,18 +1718,17 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || - !(ca->buckets = vmalloc(sizeof(struct bucket) * + !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) - goto err; + return -ENOMEM; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); - memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); for_each_bucket(b, ca) atomic_set(&b->pin, 0); @@ -1765,22 +1741,28 @@ err: return -ENOMEM; } -static const char *register_cache(struct cache_sb *sb, struct page *sb_page, +static void register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - if (cache_alloc(sb, ca) != 0) - return err; - - ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); + if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); + if (cache_alloc(sb, ca) != 0) + goto err; + err = "error creating kobject"; if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; @@ -1790,15 +1772,10 @@ static const char *register_cache(struct cache_sb *sb, struct page *sb_page, goto err; pr_info("registered cache device %s", bdevname(bdev, name)); - - return NULL; + return; err: + pr_notice("error opening %s: %s", bdevname(bdev, name), err); kobject_put(&ca->kobj); - pr_info("error opening %s: %s", bdevname(bdev, name), err); - /* Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; } /* Global interfaces/init */ @@ -1832,12 +1809,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; - - if (IS_ERR(bdev) || - set_blocksize(bdev, 4096)) + if (IS_ERR(bdev)) { + if (bdev == ERR_PTR(-EBUSY)) + err = "device busy"; goto err; + } + + err = "failed to set blocksize"; + if (set_blocksize(bdev, 4096)) + goto err_close; err = read_super(sb, bdev, &sb_page); if (err) @@ -1845,33 +1825,33 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) + goto err_close; - err = register_bdev(sb, sb_page, bdev, dc); + register_bdev(sb, sb_page, bdev, dc); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto err_close; - err = register_cache(sb, sb_page, bdev, ca); + register_cache(sb, sb_page, bdev, ca); } - - if (err) { - /* register_(bdev|cache) will only return an error if they - * didn't get far enough to create the kobject - if they did, - * the kobject destructor will do this cleanup. - */ +out: + if (sb_page) put_page(sb_page); -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); - ret = -EINVAL; - } - kfree(sb); kfree(path); mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; + +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + goto out; } static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 93e7e31a4bd..2714ed3991d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -375,7 +375,7 @@ err: refill_dirty(cl); } -void bch_writeback_init_cached_dev(struct cached_dev *dc) +void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); -- cgit v1.2.3 From 610bba8b9372597967aefdc8d90661d2ab248802 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Sun, 19 May 2013 18:57:50 +0100 Subject: dm thin: fix metadata dev resize detection Fix detection of the need to resize the dm thin metadata device. The code incorrectly tried to extend the metadata device when it didn't need to due to a merging error with patch 24347e9 ("dm thin: detect metadata device resizing"). device-mapper: transaction manager: couldn't open metadata space map device-mapper: thin metadata: tm_open_with_sm failed device-mapper: thin: aborting transaction failed device-mapper: thin: switching pool to failure mode Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 759cffc45ca..88f2f802d52 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2188,7 +2188,7 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) *need_commit = false; - metadata_dev_size = get_metadata_dev_size(pool->md_dev); + metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); if (r) { @@ -2197,7 +2197,7 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) } if (metadata_dev_size < sb_metadata_dev_size) { - DMERR("metadata device (%llu sectors) too small: expected %llu", + DMERR("metadata device (%llu blocks) too small: expected %llu", metadata_dev_size, sb_metadata_dev_size); return -EINVAL; -- cgit v1.2.3 From 4997b72ee62930cb841d185398ea547d979789f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 30 May 2013 08:44:39 +0200 Subject: raid5: Initialize bi_vcnt The patch that converted raid5 to use bio_reset() forgot to initialize bi_vcnt. Signed-off-by: Kent Overstreet Cc: NeilBrown Cc: linux-raid@vger.kernel.org Tested-by: Ilia Mirkin Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9359828ffe2..753f318c898 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -664,6 +664,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_FLUSH; + bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; @@ -701,6 +702,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_sector = (sh->sector + rrdev->data_offset); + rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; -- cgit v1.2.3 From 6b6204ee92adb53bfd6a77cb5679282ec3820c4b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 May 2013 09:48:30 +1000 Subject: md: md_stop_writes() should always freeze recovery. __md_stop_writes() will currently sometimes freeze recovery. So any caller must be ready for that to happen, and indeed they are. However if __md_stop_writes() doesn't freeze_recovery, then a recovery could start before mddev_suspend() is called, which could be awkward. This can particularly cause problems or dm-raid. So change __md_stop_writes() to always freeze recovery. This is safe and more predicatable. Reported-by: Brassow Jonathan Tested-by: Brassow Jonathan Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 4c74424c78b..3e2acfa1696 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5277,8 +5277,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } -- cgit v1.2.3 From 3056e3aec8d8ba61a0710fb78b2d562600aa2ea7 Mon Sep 17 00:00:00 2001 From: Alex Lyakas Date: Tue, 4 Jun 2013 20:42:21 +0300 Subject: md/raid1: consider WRITE as successful only if at least one non-Faulty and non-rebuilding drive completed it. Without that fix, the following scenario could happen: - RAID1 with drives A and B; drive B was freshly-added and is rebuilding - Drive A fails - WRITE request arrives to the array. It is failed by drive A, so r1_bio is marked as R1BIO_WriteError, but the rebuilding drive B succeeds in writing it, so the same r1_bio is marked as R1BIO_Uptodate. - r1_bio arrives to handle_write_finished, badblocks are disabled, md_error()->error() does nothing because we don't fail the last drive of raid1 - raid_end_bio_io() calls call_bio_endio() - As a result, in call_bio_endio(): if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) clear_bit(BIO_UPTODATE, &bio->bi_flags); this code doesn't clear the BIO_UPTODATE flag, and the whole master WRITE succeeds, back to the upper layer. So we returned success to the upper layer, even though we had written the data onto the rebuilding drive only. But when we want to read the data back, we would not read from the rebuilding drive, so this data is lost. [neilb - applied identical change to raid10 as well] This bug can result in lost data, so it is suitable for any -stable kernel. Cc: stable@vger.kernel.org Signed-off-by: Alex Lyakas Signed-off-by: NeilBrown --- drivers/md/raid1.c | 12 +++++++++++- drivers/md/raid10.c | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 851023e2ba5..f2db7a9d596 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -427,7 +427,17 @@ static void raid1_end_write_request(struct bio *bio, int error) r1_bio->bios[mirror] = NULL; to_put = bio; - set_bit(R1BIO_Uptodate, &r1_bio->state); + /* + * Do not set R1BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(conf->mirrors[mirror].rdev, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 018741ba931..8000ee25650 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -490,7 +490,17 @@ static void raid10_end_write_request(struct bio *bio, int error) sector_t first_bad; int bad_sectors; - set_bit(R10BIO_Uptodate, &r10_bio->state); + /* + * Do not set R10BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(rdev, -- cgit v1.2.3 From e2d59925221cd562e07fee38ec8839f7209ae603 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 12 Jun 2013 11:01:22 +1000 Subject: md/raid1,raid10: use freeze_array in place of raise_barrier in various places. Various places in raid1 and raid10 are calling raise_barrier when they really should call freeze_array. The former is only intended to be called from "make_request". The later has extra checks for 'nr_queued' and makes a call to flush_pending_writes(), so it is safe to call it from within the management thread. Using raise_barrier will sometimes deadlock. Using freeze_array should not. As 'freeze_array' currently expects one request to be pending (in handle_read_error - the only previous caller), we need to pass it the number of pending requests (extra) to ignore. The deadlock was made particularly noticeable by commits 050b66152f87c7 (raid10) and 6b740b8d79252f13 (raid1) which appeared in 3.4, so the fix is appropriate for any -stable kernel since then. This patch probably won't apply directly to some early kernels and will need to be applied by hand. Cc: stable@vger.kernel.org Reported-by: Alexander Lyakas Signed-off-by: NeilBrown --- drivers/md/raid1.c | 22 +++++++++++----------- drivers/md/raid10.c | 14 +++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f2db7a9d596..5208e9d1aff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -890,17 +890,17 @@ static void allow_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r1conf *conf) +static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -908,7 +908,7 @@ static void freeze_array(struct r1conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); @@ -1568,8 +1568,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. */ synchronize_sched(); - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -1619,11 +1619,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ struct md_rdev *repl = conf->mirrors[conf->raid_disks + number].rdev; - raise_barrier(conf); + freeze_array(conf, 0); clear_bit(Replacement, &repl->flags); p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; - lower_barrier(conf); + unfreeze_array(conf); clear_bit(WantReplacement, &rdev->flags); } else clear_bit(WantReplacement, &rdev->flags); @@ -2240,7 +2240,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) * frozen */ if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, r1_bio->read_disk, r1_bio->sector, r1_bio->sectors); unfreeze_array(conf); @@ -3020,7 +3020,7 @@ static int raid1_reshape(struct mddev *mddev) return -ENOMEM; } - raise_barrier(conf); + freeze_array(conf, 0); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; @@ -3051,7 +3051,7 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - lower_barrier(conf); + unfreeze_array(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8000ee25650..aa9ed304951 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1065,17 +1065,17 @@ static void allow_barrier(struct r10conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r10conf *conf) +static void freeze_array(struct r10conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -1083,7 +1083,7 @@ static void freeze_array(struct r10conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); @@ -1849,8 +1849,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. */ synchronize_sched(); - raise_barrier(conf, 0); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -2646,7 +2646,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) r10_bio->devs[slot].bio = NULL; if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } else -- cgit v1.2.3 From 5026d7a9b2f3eb1f9bda66c18ac6bc3036ec9020 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 12 Jun 2013 07:37:43 -0700 Subject: md/raid1,5,10: Disable WRITE SAME until a recovery strategy is in place There are cases where the kernel will believe that the WRITE SAME command is supported by a block device which does not, in fact, support WRITE SAME. This currently happens for SATA drivers behind a SAS controller, but there are probably a hundred other ways that can happen, including drive firmware bugs. After receiving an error for WRITE SAME the block layer will retry the request as a plain write of zeroes, but mdraid will consider the failure as fatal and consider the drive failed. This has the effect that all the mirrors containing a specific set of data are each offlined in very rapid succession resulting in data loss. However, just bouncing the request back up to the block layer isn't ideal either, because the whole initial request-retry sequence should be inside the write bitmap fence, which probably means that md needs to do its own conversion of WRITE SAME to write zero. Until the failure scenario has been sorted out, disable WRITE SAME for raid1, raid5, and raid10. [neilb: added raid5] This patch is appropriate for any -stable since 3.7 when write_same support was added. Cc: stable@vger.kernel.org Signed-off-by: H. Peter Anvin Signed-off-by: NeilBrown --- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 3 +-- drivers/md/raid5.c | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5208e9d1aff..e02ad445090 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2837,8 +2837,8 @@ static int run(struct mddev *mddev) return PTR_ERR(conf); if (mddev->queue) - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aa9ed304951..06c2cbe046e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3651,8 +3651,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4a7be455d6d..26ee39936a2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5465,7 +5465,7 @@ static int run(struct mddev *mddev) if (mddev->major_version == 0 && mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - + if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5588,6 +5588,8 @@ static int run(struct mddev *mddev) */ mddev->queue->limits.discard_zeroes_data = 0; + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); -- cgit v1.2.3