diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/Kconfig | 1 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/stats.c | 34 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 185 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 24 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 100 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 36 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 7 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 200 | ||||
-rw-r--r-- | drivers/md/md.c | 2 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 3 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 127 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map.h | 23 | ||||
-rw-r--r-- | drivers/md/raid1.c | 38 | ||||
-rw-r--r-- | drivers/md/raid10.c | 29 | ||||
-rw-r--r-- | drivers/md/raid5.c | 6 |
23 files changed, 576 insertions, 266 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 05c220d05e2..f950c9d29f3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,6 @@ config BCACHE tristate "Block device as cache" - select CLOSURES ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 340146d7c17..d3e15b42a4a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1241,7 +1241,7 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_writeback_init_cached_dev(struct cached_dev *); +void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); void bch_cache_allocator_exit(struct cache *ca); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 64e679449c2..b8730e714d6 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -93,24 +93,6 @@ static struct attribute *bch_stats_files[] = { }; static KTYPE(bch_stats); -static void scale_accounting(unsigned long data); - -void bch_cache_accounting_init(struct cache_accounting *acc, - struct closure *parent) -{ - kobject_init(&acc->total.kobj, &bch_stats_ktype); - kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); - kobject_init(&acc->hour.kobj, &bch_stats_ktype); - kobject_init(&acc->day.kobj, &bch_stats_ktype); - - closure_init(&acc->cl, parent); - init_timer(&acc->timer); - acc->timer.expires = jiffies + accounting_delay; - acc->timer.data = (unsigned long) acc; - acc->timer.function = scale_accounting; - add_timer(&acc->timer); -} - int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, struct kobject *parent) { @@ -244,3 +226,19 @@ void bch_mark_sectors_bypassed(struct search *s, int sectors) atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); } + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c8046bc4aa5..f88e2b653a3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -634,11 +634,10 @@ static int open_dev(struct block_device *b, fmode_t mode) return 0; } -static int release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b, fmode_t mode) { struct bcache_device *d = b->private_data; closure_put(&d->cl); - return 0; } static int ioctl_dev(struct block_device *b, fmode_t mode, @@ -732,8 +731,7 @@ static void bcache_device_free(struct bcache_device *d) if (d->c) bcache_device_detach(d); - - if (d->disk) + if (d->disk && d->disk->flags & GENHD_FL_UP) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); @@ -756,12 +754,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || - bio_split_pool_init(&d->bio_split_hook)) - - return -ENOMEM; - - d->disk = alloc_disk(1); - if (!d->disk) + bio_split_pool_init(&d->bio_split_hook) || + !(d->disk = alloc_disk(1)) || + !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); @@ -771,10 +766,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) - return -ENOMEM; - blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; @@ -999,14 +990,17 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); list_del(&dc->list); mutex_unlock(&bch_register_lock); if (!IS_ERR_OR_NULL(dc->bdev)) { - blk_sync_queue(bdev_get_queue(dc->bdev)); + if (dc->bdev->bd_disk) + blk_sync_queue(bdev_get_queue(dc->bdev)); + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } @@ -1028,73 +1022,67 @@ static void cached_dev_flush(struct closure *cl) static int cached_dev_init(struct cached_dev *dc, unsigned block_size) { - int err; + int ret; struct io *io; - - closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + struct request_queue *q = bdev_get_queue(dc->bdev); __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); - - bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - - err = bcache_device_init(&dc->disk, block_size); - if (err) - goto err; - - spin_lock_init(&dc->io_lock); - closure_init_unlocked(&dc->sb_write); INIT_WORK(&dc->detach, cached_dev_detach_finish); + closure_init_unlocked(&dc->sb_write); + INIT_LIST_HEAD(&dc->io_lru); + spin_lock_init(&dc->io_lock); + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; - INIT_LIST_HEAD(&dc->io_lru); - dc->sb_bio.bi_max_vecs = 1; - dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; - for (io = dc->io; io < dc->io + RECENT_IO; io++) { list_add(&io->lru, &dc->io_lru); hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - bch_writeback_init_cached_dev(dc); + ret = bcache_device_init(&dc->disk, block_size); + if (ret) + return ret; + + set_capacity(dc->disk.disk, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + dc->disk.disk->queue->backing_dev_info.ra_pages = + max(dc->disk.disk->queue->backing_dev_info.ra_pages, + q->backing_dev_info.ra_pages); + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); return 0; -err: - bcache_device_stop(&dc->disk); - return err; } /* Cached device - bcache superblock */ -static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, +static void register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - struct gendisk *g; struct cache_set *c; - if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) - return err; - memcpy(&dc->sb, sb, sizeof(struct cache_sb)); - dc->sb_bio.bi_io_vec[0].bv_page = sb_page; dc->bdev = bdev; dc->bdev->bd_holder = dc; - g = dc->disk.disk; - - set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - - g->queue->backing_dev_info.ra_pages = - max(g->queue->backing_dev_info.ra_pages, - bdev->bd_queue->backing_dev_info.ra_pages); + bio_init(&dc->sb_bio); + dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); - bch_cached_dev_request_init(dc); + if (cached_dev_init(dc, sb->block_size << 9)) + goto err; err = "error creating kobject"; if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, @@ -1103,6 +1091,8 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; + pr_info("registered backing device %s", bdevname(bdev, name)); + list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c); @@ -1111,15 +1101,10 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return NULL; + return; err: - kobject_put(&dc->disk.kobj); pr_notice("error opening %s: %s", bdevname(bdev, name), err); - /* - * Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; + bcache_device_stop(&dc->disk); } /* Flash only volumes */ @@ -1717,20 +1702,11 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) size_t free; struct bucket *b; - if (!ca) - return -ENOMEM; - __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - memcpy(&ca->sb, sb, sizeof(struct cache_sb)); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->sb_bio); - ca->sb_bio.bi_max_vecs = 1; - ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; @@ -1742,18 +1718,17 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || - !(ca->buckets = vmalloc(sizeof(struct bucket) * + !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->sb.nbuckets)) || !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) - goto err; + return -ENOMEM; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); - memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); for_each_bucket(b, ca) atomic_set(&b->pin, 0); @@ -1766,22 +1741,28 @@ err: return -ENOMEM; } -static const char *register_cache(struct cache_sb *sb, struct page *sb_page, +static void register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; - if (cache_alloc(sb, ca) != 0) - return err; - - ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); + if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); + if (cache_alloc(sb, ca) != 0) + goto err; + err = "error creating kobject"; if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) goto err; @@ -1791,15 +1772,10 @@ static const char *register_cache(struct cache_sb *sb, struct page *sb_page, goto err; pr_info("registered cache device %s", bdevname(bdev, name)); - - return NULL; + return; err: + pr_notice("error opening %s: %s", bdevname(bdev, name), err); kobject_put(&ca->kobj); - pr_info("error opening %s: %s", bdevname(bdev, name), err); - /* Return NULL instead of an error because kobject_put() cleans - * everything up - */ - return NULL; } /* Global interfaces/init */ @@ -1833,12 +1809,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; - - if (IS_ERR(bdev) || - set_blocksize(bdev, 4096)) + if (IS_ERR(bdev)) { + if (bdev == ERR_PTR(-EBUSY)) + err = "device busy"; goto err; + } + + err = "failed to set blocksize"; + if (set_blocksize(bdev, 4096)) + goto err_close; err = read_super(sb, bdev, &sb_page); if (err) @@ -1846,33 +1825,33 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) + goto err_close; - err = register_bdev(sb, sb_page, bdev, dc); + register_bdev(sb, sb_page, bdev, dc); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto err_close; - err = register_cache(sb, sb_page, bdev, ca); + register_cache(sb, sb_page, bdev, ca); } - - if (err) { - /* register_(bdev|cache) will only return an error if they - * didn't get far enough to create the kobject - if they did, - * the kobject destructor will do this cleanup. - */ +out: + if (sb_page) put_page(sb_page); -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); - ret = -EINVAL; - } - kfree(sb); kfree(path); mutex_unlock(&bch_register_lock); module_put(THIS_MODULE); return ret; + +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + goto out; } static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 93e7e31a4bd..2714ed3991d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -375,7 +375,7 @@ err: refill_dirty(cl); } -void bch_writeback_init_cached_dev(struct cached_dev *dc) +void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c6083132c4b..0387e05cdb9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -319,6 +319,9 @@ static void __cache_size_refresh(void) static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, enum data_mode *data_mode) { + unsigned noio_flag; + void *ptr; + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { *data_mode = DATA_MODE_SLAB; return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); @@ -332,7 +335,26 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, } *data_mode = DATA_MODE_VMALLOC; - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + + /* + * __vmalloc allocates the data pages and auxiliary structures with + * gfp_flags that were specified, but pagetables are always allocated + * with GFP_KERNEL, no matter what was specified as gfp_mask. + * + * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that + * all allocations done by this process (including pagetables) are done + * as if GFP_NOIO was specified. + */ + + if (gfp_mask & __GFP_NORETRY) + noio_flag = memalloc_noio_save(); + + ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + + if (gfp_mask & __GFP_NORETRY) + memalloc_noio_restore(noio_flag); + + return ptr; } /* diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 83e995fece8..1af7255bbff 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1044,7 +1044,7 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { down_read(&cmd->root_lock); - memcpy(stats, &cmd->stats, sizeof(*stats)); + *stats = cmd->stats; up_read(&cmd->root_lock); } @@ -1052,7 +1052,7 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { down_write(&cmd->root_lock); - memcpy(&cmd->stats, stats, sizeof(*stats)); + cmd->stats = *stats; up_write(&cmd->root_lock); } diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 558bdfdabf5..33369ca9614 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -130,8 +130,8 @@ struct dm_cache_policy { * * Must not block. * - * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK - * would be typical). + * Returns 0 if in cache, -ENOENT if not, < 0 for other errors + * (-EWOULDBLOCK would be typical). */ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 10744091e6c..df44b60e66f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -205,7 +205,7 @@ struct per_bio_data { /* * writethrough fields. These MUST remain at the end of this * structure and the 'cache' member must be the first as it - * is used to determine the offsetof the writethrough fields. + * is used to determine the offset of the writethrough fields. */ struct cache *cache; dm_cblock_t cblock; @@ -393,7 +393,7 @@ static int get_cell(struct cache *cache, return r; } - /*----------------------------------------------------------------*/ +/*----------------------------------------------------------------*/ static bool is_dirty(struct cache *cache, dm_cblock_t b) { @@ -419,6 +419,7 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl } /*----------------------------------------------------------------*/ + static bool block_size_is_power_of_two(struct cache *cache) { return cache->sectors_per_block_shift >= 0; @@ -667,7 +668,7 @@ static void writethrough_endio(struct bio *bio, int err) /* * We can't issue this bio directly, since we're in interrupt - * context. So it get's put on a bio list for processing by the + * context. So it gets put on a bio list for processing by the * worker thread. */ defer_writethrough_bio(pb->cache, bio); @@ -1445,6 +1446,7 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy); wake_worker(cache); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } @@ -1809,7 +1811,37 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv, static struct kmem_cache *migration_cache; -static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv) +#define NOT_CORE_OPTION 1 + +static int process_config_option(struct cache *cache, const char *key, const char *value) +{ + unsigned long tmp; + + if (!strcasecmp(key, "migration_threshold")) { + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + cache->migration_threshold = tmp; + return 0; + } + + return NOT_CORE_OPTION; +} + +static int set_config_value(struct cache *cache, const char *key, const char *value) +{ + int r = process_config_option(cache, key, value); + + if (r == NOT_CORE_OPTION) + r = policy_set_config_value(cache->policy, key, value); + + if (r) + DMWARN("bad config value for %s: %s", key, value); + + return r; +} + +static int set_config_values(struct cache *cache, int argc, const char **argv) { int r = 0; @@ -1819,12 +1851,9 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a } while (argc) { - r = policy_set_config_value(p, argv[0], argv[1]); - if (r) { - DMWARN("policy_set_config_value failed: key = '%s', value = '%s'", - argv[0], argv[1]); - return r; - } + r = set_config_value(cache, argv[0], argv[1]); + if (r) + break; argc -= 2; argv += 2; @@ -1836,8 +1865,6 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a static int create_cache_policy(struct cache *cache, struct cache_args *ca, char **error) { - int r; - cache->policy = dm_cache_policy_create(ca->policy_name, cache->cache_size, cache->origin_sectors, @@ -1847,14 +1874,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return -ENOMEM; } - r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) { - *error = "Error setting cache policy's config values"; - dm_cache_policy_destroy(cache->policy); - cache->policy = NULL; - } - - return r; + return 0; } /* @@ -1886,7 +1906,7 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, return discard_block_size; } -#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) +#define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) { @@ -1911,7 +1931,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; - memcpy(&cache->features, &ca->features, sizeof(cache->features)); + cache->features = ca->features; ti->per_bio_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; @@ -1948,7 +1968,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = create_cache_policy(cache, ca, error); if (r) goto bad; + cache->policy_nr_args = ca->policy_argc; + cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; + + r = set_config_values(cache, ca->policy_argc, ca->policy_argv); + if (r) { + *error = "Error setting cache policy's config values"; + goto bad; + } cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, ca->block_size, may_format, @@ -1967,10 +1995,10 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); - cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; atomic_set(&cache->nr_migrations, 0); init_waitqueue_head(&cache->migration_wait); + r = -ENOMEM; cache->nr_dirty = 0; cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); if (!cache->dirty_bitset) { @@ -2517,23 +2545,6 @@ err: DMEMIT("Error"); } -#define NOT_CORE_OPTION 1 - -static int process_config_option(struct cache *cache, char **argv) -{ - unsigned long tmp; - - if (!strcasecmp(argv[0], "migration_threshold")) { - if (kstrtoul(argv[1], 10, &tmp)) - return -EINVAL; - - cache->migration_threshold = tmp; - return 0; - } - - return NOT_CORE_OPTION; -} - /* * Supports <key> <value>. * @@ -2541,17 +2552,12 @@ static int process_config_option(struct cache *cache, char **argv) */ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) { - int r; struct cache *cache = ti->private; if (argc != 2) return -EINVAL; - r = process_config_option(cache, argv); - if (r == NOT_CORE_OPTION) - return policy_set_config_value(cache->policy, argv[0], argv[1]); - - return r; + return set_config_value(cache, argv[0], argv[1]); } static int cache_iterate_devices(struct dm_target *ti, @@ -2609,7 +2615,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 1, 0}, + .version = {1, 1, 1}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 51bb81676be..bdf26f5bd32 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -907,6 +907,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; return 0; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c0e07026a8d..c434e5aab2d 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1121,6 +1121,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); if (!s->pending_pool) { ti->error = "Could not allocate mempool for pending exceptions"; + r = -ENOMEM; goto bad_pending_pool; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index ea5e878a30b..d907ca6227c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -94,7 +94,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct stripe_c *sc; - sector_t width; + sector_t width, tmp_len; uint32_t stripes; uint32_t chunk_size; int r; @@ -116,15 +116,16 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } width = ti->len; - if (sector_div(width, chunk_size)) { + if (sector_div(width, stripes)) { ti->error = "Target length not divisible by " - "chunk size"; + "number of stripes"; return -EINVAL; } - if (sector_div(width, stripes)) { + tmp_len = width; + if (sector_div(tmp_len, chunk_size)) { ti->error = "Target length not divisible by " - "number of stripes"; + "chunk size"; return -EINVAL; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e50dad0c65f..1ff252ab7d4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1442,7 +1442,7 @@ static bool dm_table_supports_write_same(struct dm_table *t) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) + ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) return false; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 00cee02f8fc..60bce435f4f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1645,12 +1645,12 @@ int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, return r; } -static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count) { int r; dm_block_t old_count; - r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); + r = dm_sm_get_nr_blocks(sm, &old_count); if (r) return r; @@ -1658,11 +1658,11 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) return 0; if (new_count < old_count) { - DMERR("cannot reduce size of data device"); + DMERR("cannot reduce size of space map"); return -EINVAL; } - return dm_sm_extend(pmd->data_sm, new_count - old_count); + return dm_sm_extend(sm, new_count - old_count); } int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) @@ -1671,7 +1671,19 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) down_write(&pmd->root_lock); if (!pmd->fail_io) - r = __resize_data_dev(pmd, new_count); + r = __resize_space_map(pmd->data_sm, new_count); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __resize_space_map(pmd->metadata_sm, new_count); up_write(&pmd->root_lock); return r; @@ -1684,3 +1696,17 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) dm_bm_set_read_only(pmd->bm); up_write(&pmd->root_lock); } + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + int r; + + down_write(&pmd->root_lock); + r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); + up_write(&pmd->root_lock); + + return r; +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 0cecc370288..845ebbe589a 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -8,6 +8,7 @@ #define DM_THIN_METADATA_H #include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map.h" #define THIN_METADATA_BLOCK_SIZE 4096 @@ -185,6 +186,7 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); * blocks would be lost. */ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); /* * Flicks the underlying block manager into read only mode, so you know @@ -192,6 +194,11 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); */ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 004ad1652b7..88f2f802d52 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -922,7 +922,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) return r; if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { - DMWARN("%s: reached low water mark, sending event.", + DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = 1; @@ -1281,6 +1281,10 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +/* + * FIXME: should we also commit due to size of transaction, measured in + * metadata blocks? + */ static int need_commit_due_to_time(struct pool *pool) { return jiffies < pool->last_commit_jiffies || @@ -1909,6 +1913,56 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, return r; } +static void metadata_low_callback(void *context) +{ + struct pool *pool = context; + + DMWARN("%s: reached low water mark for metadata device: sending event.", + dm_device_name(pool->pool_md)); + + dm_table_event(pool->ti->table); +} + +static sector_t get_metadata_dev_size(struct block_device *bdev) +{ + sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + char buffer[BDEVNAME_SIZE]; + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); + metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; + } + + return metadata_dev_size; +} + +static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_metadata_dev_size(bdev); + + sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + + return metadata_dev_size; +} + +/* + * When a metadata threshold is crossed a dm event is triggered, and + * userland should respond by growing the metadata device. We could let + * userland set the threshold, like we do with the data threshold, but I'm + * not sure they know enough to do this well. + */ +static dm_block_t calc_metadata_threshold(struct pool_c *pt) +{ + /* + * 4M is ample for all ops with the possible exception of thin + * device deletion which is harmless if it fails (just retry the + * delete after you've grown the device). + */ + dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; + return min((dm_block_t)1024ULL /* 4M */, quarter); +} + /* * thin-pool <metadata dev> <data dev> * <data block size (sectors)> @@ -1931,8 +1985,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned long block_size; dm_block_t low_water_blocks; struct dm_dev *metadata_dev; - sector_t metadata_dev_size; - char b[BDEVNAME_SIZE]; + fmode_t metadata_mode; /* * FIXME Remove validation from scope of lock. @@ -1944,19 +1997,32 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto out_unlock; } + as.argc = argc; as.argv = argv; - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); + /* + * Set default pool features. + */ + pool_features_init(&pf); + + dm_consume_args(&as, 4); + r = parse_pool_features(&as, &pf, ti); + if (r) + goto out_unlock; + + metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); if (r) { ti->error = "Error opening metadata block device"; goto out_unlock; } - metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", - bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); + /* + * Run for the side-effect of possibly issuing a warning if the + * device is too big. + */ + (void) get_metadata_dev_size(metadata_dev->bdev); r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); if (r) { @@ -1979,16 +2045,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; } - /* - * Set default pool features. - */ - pool_features_init(&pf); - - dm_consume_args(&as, 4); - r = parse_pool_features(&as, &pf, ti); - if (r) - goto out; - pt = kzalloc(sizeof(*pt), GFP_KERNEL); if (!pt) { r = -ENOMEM; @@ -2040,6 +2096,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = pt; + r = dm_pool_register_metadata_threshold(pt->pool->pmd, + calc_metadata_threshold(pt), + metadata_low_callback, + pool); + if (r) + goto out_free_pt; + pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -2079,18 +2142,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio) return r; } -/* - * Retrieves the number of blocks of the data device from - * the superblock and compares it to the actual device size, - * thus resizing the data device in case it has grown. - * - * This both copes with opening preallocated data devices in the ctr - * being followed by a resume - * -and- - * calling the resume method individually after userspace has - * grown the data device in reaction to a table event. - */ -static int pool_preresume(struct dm_target *ti) +static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) { int r; struct pool_c *pt = ti->private; @@ -2098,12 +2150,7 @@ static int pool_preresume(struct dm_target *ti) sector_t data_size = ti->len; dm_block_t sb_data_size; - /* - * Take control of the pool object. - */ - r = bind_control_target(pool, ti); - if (r) - return r; + *need_commit = false; (void) sector_div(data_size, pool->sectors_per_block); @@ -2114,7 +2161,7 @@ static int pool_preresume(struct dm_target *ti) } if (data_size < sb_data_size) { - DMERR("pool target too small, is %llu blocks (expected %llu)", + DMERR("pool target (%llu blocks) too small: expected %llu", (unsigned long long)data_size, sb_data_size); return -EINVAL; @@ -2122,17 +2169,90 @@ static int pool_preresume(struct dm_target *ti) r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { DMERR("failed to resize data device"); - /* FIXME Stricter than necessary: Rollback transaction instead here */ set_pool_mode(pool, PM_READ_ONLY); return r; } - (void) commit_or_fallback(pool); + *need_commit = true; } return 0; } +static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + dm_block_t metadata_dev_size, sb_metadata_dev_size; + + *need_commit = false; + + metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); + + r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); + if (r) { + DMERR("failed to retrieve data device size"); + return r; + } + + if (metadata_dev_size < sb_metadata_dev_size) { + DMERR("metadata device (%llu blocks) too small: expected %llu", + metadata_dev_size, sb_metadata_dev_size); + return -EINVAL; + + } else if (metadata_dev_size > sb_metadata_dev_size) { + r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); + if (r) { + DMERR("failed to resize metadata device"); + return r; + } + + *need_commit = true; + } + + return 0; +} + +/* + * Retrieves the number of blocks of the data device from + * the superblock and compares it to the actual device size, + * thus resizing the data device in case it has grown. + * + * This both copes with opening preallocated data devices in the ctr + * being followed by a resume + * -and- + * calling the resume method individually after userspace has + * grown the data device in reaction to a table event. + */ +static int pool_preresume(struct dm_target *ti) +{ + int r; + bool need_commit1, need_commit2; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * Take control of the pool object. + */ + r = bind_control_target(pool, ti); + if (r) + return r; + + r = maybe_resize_data_dev(ti, &need_commit1); + if (r) + return r; + + r = maybe_resize_metadata_dev(ti, &need_commit2); + if (r) + return r; + + if (need_commit1 || need_commit2) + (void) commit_or_fallback(pool); + + return 0; +} + static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -2549,7 +2669,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, diff --git a/drivers/md/md.c b/drivers/md/md.c index 681d1099a2d..9b82377a833 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5268,8 +5268,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index f6d29e614ab..e735a6d5a79 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -248,7 +248,8 @@ static struct dm_space_map ops = { .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, - .copy_root = sm_disk_copy_root + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL }; struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 906cf3df71a..1c959684cae 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -17,6 +17,55 @@ /*----------------------------------------------------------------*/ /* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* * Space map interface. * * The low level disk format is written using the standard btree and @@ -54,6 +103,8 @@ struct sm_metadata { unsigned allocated_this_transaction; unsigned nr_uncommitted; struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; + + struct threshold threshold; }; static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) @@ -144,12 +195,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm) kfree(smm); } -static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - DMERR("doesn't support extend"); - return -EINVAL; -} - static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); @@ -335,9 +380,19 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) { + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + int r = sm_metadata_new_block_(sm, b); if (r) DMERR("unable to allocate new metadata block"); + + r = sm_metadata_get_nr_free(sm, &count); + if (r) + DMERR("couldn't get free block count"); + + check_threshold(&smm->threshold, count); + return r; } @@ -357,6 +412,18 @@ static int sm_metadata_commit(struct dm_space_map *sm) return 0; } +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) { *result = sizeof(struct disk_sm_root); @@ -382,6 +449,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t return 0; } +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + static struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, @@ -395,7 +464,8 @@ static struct dm_space_map ops = { .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, - .copy_root = sm_metadata_copy_root + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback }; /*----------------------------------------------------------------*/ @@ -410,7 +480,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm) static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - DMERR("boostrap doesn't support extend"); + DMERR("bootstrap doesn't support extend"); return -EINVAL; } @@ -450,7 +520,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { - DMERR("boostrap doesn't support set_count"); + DMERR("bootstrap doesn't support set_count"); return -EINVAL; } @@ -491,7 +561,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm) static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) { - DMERR("boostrap doesn't support root_size"); + DMERR("bootstrap doesn't support root_size"); return -EINVAL; } @@ -499,7 +569,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, size_t max) { - DMERR("boostrap doesn't support copy_root"); + DMERR("bootstrap doesn't support copy_root"); return -EINVAL; } @@ -517,11 +587,42 @@ static struct dm_space_map bootstrap_ops = { .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, - .copy_root = sm_bootstrap_copy_root + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL }; /*----------------------------------------------------------------*/ +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r, i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + + /* + * Switch back to normal behaviour. + */ + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + for (i = old_len; !r && i < smm->begin; i++) + r = sm_ll_inc(&smm->ll, i, &ev); + + return r; +} + +/*----------------------------------------------------------------*/ + struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; @@ -549,6 +650,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, smm->recursion_count = 0; smm->allocated_this_transaction = 0; smm->nr_uncommitted = 0; + threshold_init(&smm->threshold); memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); @@ -590,6 +692,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, smm->recursion_count = 0; smm->allocated_this_transaction = 0; smm->nr_uncommitted = 0; + threshold_init(&smm->threshold); memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); return 0; diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 1cbfc6b1638..3e6d1153b7c 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -9,6 +9,8 @@ #include "dm-block-manager.h" +typedef void (*dm_sm_threshold_fn)(void *context); + /* * struct dm_space_map keeps a record of how many times each block in a device * is referenced. It needs to be fixed on disk as part of the transaction. @@ -59,6 +61,15 @@ struct dm_space_map { */ int (*root_size)(struct dm_space_map *sm, size_t *result); int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. + */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); }; /*----------------------------------------------------------------*/ @@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le return sm->copy_root(sm, copy_to_here_le, len); } +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + #endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 55951182af7..6e17f8181c4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -417,7 +417,17 @@ static void raid1_end_write_request(struct bio *bio, int error) r1_bio->bios[mirror] = NULL; to_put = bio; - set_bit(R1BIO_Uptodate, &r1_bio->state); + /* + * Do not set R1BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(conf->mirrors[mirror].rdev, @@ -870,17 +880,17 @@ static void allow_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r1conf *conf) +static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -888,7 +898,7 @@ static void freeze_array(struct r1conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); @@ -1544,8 +1554,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. */ synchronize_sched(); - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -1595,11 +1605,11 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ struct md_rdev *repl = conf->mirrors[conf->raid_disks + number].rdev; - raise_barrier(conf); + freeze_array(conf, 0); clear_bit(Replacement, &repl->flags); p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; - lower_barrier(conf); + unfreeze_array(conf); clear_bit(WantReplacement, &rdev->flags); } else clear_bit(WantReplacement, &rdev->flags); @@ -2195,7 +2205,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) * frozen */ if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, r1_bio->read_disk, r1_bio->sector, r1_bio->sectors); unfreeze_array(conf); @@ -2780,8 +2790,8 @@ static int run(struct mddev *mddev) return PTR_ERR(conf); if (mddev->queue) - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; @@ -2963,7 +2973,7 @@ static int raid1_reshape(struct mddev *mddev) return -ENOMEM; } - raise_barrier(conf); + freeze_array(conf, 0); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; @@ -2994,7 +3004,7 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - lower_barrier(conf); + unfreeze_array(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 59d4daa5f4c..6ddae2501b9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -490,7 +490,17 @@ static void raid10_end_write_request(struct bio *bio, int error) sector_t first_bad; int bad_sectors; - set_bit(R10BIO_Uptodate, &r10_bio->state); + /* + * Do not set R10BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. */ if (is_badblock(rdev, @@ -1055,17 +1065,17 @@ static void allow_barrier(struct r10conf *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(struct r10conf *conf) +static void freeze_array(struct r10conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ @@ -1073,7 +1083,7 @@ static void freeze_array(struct r10conf *conf) conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, + conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); @@ -1837,8 +1847,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * we wait for all outstanding requests to complete. */ synchronize_sched(); - raise_barrier(conf, 0); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); @@ -2612,7 +2622,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) r10_bio->devs[slot].bio = NULL; if (mddev->ro == 0) { - freeze_array(conf); + freeze_array(conf, 1); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } else @@ -3609,8 +3619,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, - mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9359828ffe2..05e4a105b9c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -664,6 +664,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_FLUSH; + bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; @@ -701,6 +702,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_sector = (sh->sector + rrdev->data_offset); + rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; @@ -5464,7 +5466,7 @@ static int run(struct mddev *mddev) if (mddev->major_version == 0 && mddev->minor_version > 90) rdev->recovery_offset = reshape_offset; - + if (rdev->recovery_offset < reshape_offset) { /* We need to check old and new layout */ if (!only_parity(rdev->raid_disk, @@ -5587,6 +5589,8 @@ static int run(struct mddev *mddev) */ mddev->queue->limits.discard_zeroes_data = 0; + blk_queue_max_write_same_sectors(mddev->queue, 0); + rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); |