diff options
author | Sumit Semwal <sumit.semwal@linaro.org> | 2016-05-17 15:10:09 +0000 |
---|---|---|
committer | Sumit Semwal <sumit.semwal@linaro.org> | 2016-05-17 15:10:09 +0000 |
commit | 99e17f8a0421fd1958bdee92e7e66c22b89d9556 (patch) | |
tree | 15cacceadb018151ac80cd34c786cef5818d614d /drivers/md | |
parent | 914edc40eafa0dabf561bf29c8e768a4b636e7f5 (diff) | |
parent | 08e85b5fa9b0d262c8d69709088ec7fbbab2ad28 (diff) |
Merge branch 'android-hikey-linaro-4.4' of https://android.googlesource.com/kernel/hikey-linaro into clang-4.4android-hikey-linaro-4.4-clang
Diffstat (limited to 'drivers/md')
29 files changed, 1818 insertions, 428 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7913fdcfc849..d8b0ab6f3753 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + ---help--- + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + config DM_SWITCH tristate "Switch target support (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f34979cd141a..62a65764e8e0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o +dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid5-cache.o @@ -63,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 83392f856dfd..22b9e34ceb75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c) do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); + cond_resched(); if (ret && ret != -EAGAIN) pr_warn("gc failed!"); @@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) + b->seq != seq + 1) { + op->lock = b->level; goto out; + } } SET_KEY_PTRS(check_key, 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 679a093a3bf6..a296425a7270 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || sysfs_create_link(&c->kobj, &d->kobj, d->name), "Couldn't create device <-> cache set symlinks"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } static void bcache_device_detach(struct bcache_device *d) @@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) buf[SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); return; + } if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { @@ -1010,8 +1015,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) */ atomic_set(&dc->count, 1); - if (bch_cached_dev_writeback_start(dc)) + /* Block writeback thread, but spawn it */ + down_write(&dc->writeback_lock); + if (bch_cached_dev_writeback_start(dc)) { + up_write(&dc->writeback_lock); return -ENOMEM; + } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(dc); @@ -1023,6 +1032,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bch_cached_dev_run(dc); bcache_device_link(&dc->disk, c, "bdev"); + /* Allow the writeback thread to proceed */ + up_write(&dc->writeback_lock); + pr_info("Caching %s as %s on set %pU", bdevname(dc->bdev, buf), dc->disk.disk->disk_name, dc->disk.c->sb.set_uuid); @@ -1361,6 +1373,9 @@ static void cache_set_flush(struct closure *cl) struct btree *b; unsigned i; + if (!c) + closure_return(cl); + bch_cache_accounting_destroy(&c->accounting); kobject_put(&c->internal); @@ -1823,11 +1838,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) return 0; } -static void register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; - const char *err = "cannot allocate memory"; + const char *err = NULL; + int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; @@ -1842,27 +1858,35 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); - if (cache_alloc(sb, ca) != 0) + ret = cache_alloc(sb, ca); + if (ret != 0) goto err; - err = "error creating kobject"; - if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) - goto err; + if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) { + err = "error calling kobject_add"; + ret = -ENOMEM; + goto out; + } mutex_lock(&bch_register_lock); err = register_cache_set(ca); mutex_unlock(&bch_register_lock); - if (err) - goto err; + if (err) { + ret = -ENODEV; + goto out; + } pr_info("registered cache device %s", bdevname(bdev, name)); + out: kobject_put(&ca->kobj); - return; + err: - pr_notice("error opening %s: %s", bdevname(bdev, name), err); - goto out; + if (err) + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + + return ret; } /* Global interfaces/init */ @@ -1933,6 +1957,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto out; } goto err; } @@ -1958,7 +1984,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!ca) goto err_close; - register_cache(sb, sb_page, bdev, ca); + if (register_cache(sb, sb_page, bdev, ca) != 0) + goto err_close; } out: if (sb_page) @@ -1971,8 +1998,7 @@ out: err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); + pr_info("error opening %s: %s", path, err); ret = -EINVAL; goto out; } @@ -2066,8 +2092,10 @@ static int __init bcache_init(void) closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); - if (bcache_major < 0) + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); return bcache_major; + } if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b23f88d9f18c..b9346cd9cda1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { + struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + return KEY_DIRTY(k); } @@ -372,11 +376,24 @@ next: } } +/* + * Returns true if we scanned the entire disk + */ static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); - bool searched_from_start = false; + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; if (dc->partial_stripes_expensive) { refill_full_stripes(dc); @@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) return false; } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - + start_pos = buf->last_scanned; bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } static int bch_writeback_thread(void *arg) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 0a9dab187b79..073a042aed24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, static inline void bch_writeback_queue(struct cached_dev *dc) { - wake_up_process(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); } static inline void bch_writeback_add(struct cached_dev *dc) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index f6543f3a970f..3970cda10080 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -867,18 +867,55 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } -#define WRITE_LOCK(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ - return -EINVAL; \ - down_write(&cmd->root_lock) +static bool cmd_write_lock(struct dm_cache_metadata *cmd) +{ + down_write(&cmd->root_lock); + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { + up_write(&cmd->root_lock); + return false; + } + return true; +} + +#define WRITE_LOCK(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return -EINVAL; \ + } while(0) -#define WRITE_LOCK_VOID(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ - return; \ - down_write(&cmd->root_lock) +#define WRITE_LOCK_VOID(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return; \ + } while(0) #define WRITE_UNLOCK(cmd) \ - up_write(&cmd->root_lock) + up_write(&(cmd)->root_lock) + +static bool cmd_read_lock(struct dm_cache_metadata *cmd) +{ + down_read(&cmd->root_lock); + if (cmd->fail_io) { + up_read(&cmd->root_lock); + return false; + } + return true; +} + +#define READ_LOCK(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return -EINVAL; \ + } while(0) + +#define READ_LOCK_VOID(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return; \ + } while(0) + +#define READ_UNLOCK(cmd) \ + up_read(&(cmd)->root_lock) int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { @@ -1015,22 +1052,20 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_discards(cmd, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd) +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result) { - dm_cblock_t r; - - down_read(&cmd->root_lock); - r = cmd->cache_blocks; - up_read(&cmd->root_lock); + READ_LOCK(cmd); + *result = cmd->cache_blocks; + READ_UNLOCK(cmd); - return r; + return 0; } static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) @@ -1188,9 +1223,9 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_mappings(cmd, policy, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1215,18 +1250,18 @@ static int __dump_mappings(struct dm_cache_metadata *cmd) void dm_cache_dump(struct dm_cache_metadata *cmd) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); __dump_mappings(cmd); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = cmd->changed; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1276,9 +1311,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd, void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); *stats = cmd->stats; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, @@ -1312,9 +1347,9 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_free(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1324,9 +1359,9 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1417,7 +1452,13 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) { - return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + int r; + + READ_LOCK(cmd); + r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + READ_UNLOCK(cmd); + + return r; } void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) @@ -1440,10 +1481,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) struct dm_block *sblock; struct cache_disk_superblock *disk_super; - /* - * We ignore fail_io for this function. - */ - down_write(&cmd->root_lock); + WRITE_LOCK(cmd); set_bit(NEEDS_CHECK, &cmd->flags); r = superblock_lock(cmd, &sblock); @@ -1458,19 +1496,17 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) dm_bm_unlock(sblock); out: - up_write(&cmd->root_lock); + WRITE_UNLOCK(cmd); return r; } -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd) +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) { - bool needs_check; + READ_LOCK(cmd); + *result = !!test_bit(NEEDS_CHECK, &cmd->flags); + READ_UNLOCK(cmd); - down_read(&cmd->root_lock); - needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags); - up_read(&cmd->root_lock); - - return needs_check; + return 0; } int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 2ffee21f318d..8528744195e5 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -66,7 +66,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd); * origin blocks to map to. */ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result); int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, @@ -137,7 +137,7 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * */ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd); +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2fd4c8296144..bb9b92ebbf8e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct dm_hook_info { bio_end_io_t *bi_end_io; - void *bi_private; }; static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, bio_end_io_t *bi_end_io, void *bi_private) { h->bi_end_io = bio->bi_end_io; - h->bi_private = bio->bi_private; bio->bi_end_io = bi_end_io; bio->bi_private = bi_private; @@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; - bio->bi_private = h->bi_private; } /*----------------------------------------------------------------*/ @@ -987,9 +984,14 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) { - bool needs_check = dm_cache_metadata_needs_check(cache->cmd); + bool needs_check; enum cache_metadata_mode old_mode = get_cache_mode(cache); + if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { + DMERR("unable to read needs_check flag, setting failure mode"); + new_mode = CM_FAIL; + } + if (new_mode == CM_WRITE && needs_check) { DMERR("%s: unable to switch cache to write mode until repaired.", cache_device_name(cache)); @@ -3513,6 +3515,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, char buf[BDEVNAME_SIZE]; struct cache *cache = ti->private; dm_cblock_t residency; + bool needs_check; switch (type) { case STATUSTYPE_INFO: @@ -3586,7 +3589,9 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("rw "); - if (dm_cache_metadata_needs_check(cache->cmd)) + r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); + + if (r || needs_check) DMEMIT("needs_check "); else DMEMIT("- "); diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index fae34e7a0b1e..12b5216c2cfe 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -69,7 +69,7 @@ struct dm_exception_store_type { * Update the metadata with this exception. */ void (*commit_exception) (struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3164b8bce294..4d3909393f2c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -695,7 +695,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, } static void persistent_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context) { @@ -704,6 +704,9 @@ static void persistent_commit_exception(struct dm_exception_store *store, struct core_exception ce; struct commit_callback *cb; + if (!valid) + ps->valid = 0; + ce.old_chunk = e->old_chunk; ce.new_chunk = e->new_chunk; write_exception(ps, ps->current_committed++, &ce); diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 9b7c8c8049d6..4d50a12cf00c 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -52,12 +52,12 @@ static int transient_prepare_exception(struct dm_exception_store *store, } static void transient_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context) { /* Just succeed */ - callback(callback_context, 1); + callback(callback_context, valid); } static void transient_usage(struct dm_exception_store *store, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c06b74e91cd6..e4d1bafe78c1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -207,7 +207,6 @@ struct dm_snap_pending_exception { */ struct bio *full_bio; bio_end_io_t *full_bio_end_io; - void *full_bio_private; }; /* @@ -1106,6 +1105,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; + dev_t origin_dev, cow_dev; unsigned args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; @@ -1136,11 +1136,19 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot get origin device"; goto bad_origin; } + origin_dev = s->origin->bdev->bd_dev; cow_path = argv[0]; argv++; argc--; + cow_dev = dm_get_dev_t(cow_path); + if (cow_dev && cow_dev == origin_dev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_cow; + } + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; @@ -1438,8 +1446,9 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } -static void pending_complete(struct dm_snap_pending_exception *pe, int success) +static void pending_complete(void *context, int success) { + struct dm_snap_pending_exception *pe = context; struct dm_exception *e; struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; @@ -1485,10 +1494,8 @@ out: snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; - if (full_bio) { + if (full_bio) full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1509,24 +1516,13 @@ out: free_pending_exception(pe); } -static void commit_callback(void *context, int success) -{ - struct dm_snap_pending_exception *pe = context; - - pending_complete(pe, success); -} - static void complete_exception(struct dm_snap_pending_exception *pe) { struct dm_snapshot *s = pe->snap; - if (unlikely(pe->copy_error)) - pending_complete(pe, 0); - - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error, + pending_complete, pe); } /* @@ -1605,7 +1601,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..cb5d0daf53bb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -365,6 +365,26 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, } /* + * Convert the path to a device + */ +dev_t dm_get_dev_t(const char *path) +{ + dev_t uninitialized_var(dev); + struct block_device *bdev; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + dev = name_to_dev_t(path); + else { + dev = bdev->bd_dev; + bdput(bdev); + } + + return dev; +} +EXPORT_SYMBOL_GPL(dm_get_dev_t); + +/* * Add a device to the list, or just increment the usage count if * it's already present. */ @@ -372,23 +392,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result) { int r; - dev_t uninitialized_var(dev); + dev_t dev; struct dm_dev_internal *dd; struct dm_table *t = ti->table; - struct block_device *bdev; BUG_ON(!t); - /* convert the path to a device */ - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) { - dev = name_to_dev_t(path); - if (!dev) - return -ENODEV; - } else { - dev = bdev->bd_dev; - bdput(bdev); - } + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; dd = find_device(&t->devices, dev); if (!dd) { diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index c219a053c7f6..911ada643364 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1943,5 +1943,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) { - dm_tm_issue_prefetches(pmd->tm); + down_read(&pmd->root_lock); + if (!pmd->fail_io) + dm_tm_issue_prefetches(pmd->tm); + up_read(&pmd->root_lock); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 63903a5a5d9e..a1cc797fe88f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3453,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - cancel_delayed_work(&pool->waker); - cancel_delayed_work(&pool->no_space_timeout); + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000000..ad10d6d8ed28 --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,861 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen <samitolvanen@google.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "dm-verity-fec.h" +#include <linux/math64.h> +#include <linux/sysfs.h> + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf. + */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned *offset, struct dm_buffer **buf) +{ + u64 position, block; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = position >> v->data_dev_block_bits; + *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + + res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + if (unlikely(IS_ERR(res))) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)(v->fec->start + block), + PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned i, unsigned j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + dm_bufio_release(buf); + + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= 1 << v->data_dev_block_bits) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (unlikely(IS_ERR(par))) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) { + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + atomic_add_unless(&v->fec->corrected, 1, INT_MAX); + } + + return r; +} + +/* + * Locate data block erasures using verity hashes. + */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned block_offset, + int *neras) +{ + bool is_zero; + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[v->digest_size]; + unsigned n, k; + + if (neras) + *neras = 0; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (unlikely(IS_ERR(bbuf))) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + continue; + + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. + */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + if (!fio->rs) { + fio->rs = mempool_alloc(v->fec->rs_pool, 0); + if (unlikely(!fio->rs)) { + DMERR("failed to allocate RS"); + return -ENOMEM; + } + } + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) { + fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + + if (!fio->output) { + DMERR("failed to allocate FEC page"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block += v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + + res = offset; + div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + return r; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], f->extra_pool); + + mempool_free(fio->output, f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; +} + +/* + * Append feature arguments and values to the status table. + */ +unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct kobject *kobj = &f->kobj_holder.kobj; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_destroy(f->rs_pool); + mempool_destroy(f->prealloc_pool); + mempool_destroy(f->extra_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); + + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs(8, 0x11d, 0, 1, v->fec->roots); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec, + kobj_holder.kobj); + + return sprintf(buf, "%d\n", atomic_read(&f->corrected)); +} + +static struct kobj_attribute attr_corrected = __ATTR_RO(corrected); + +static struct attribute *fec_attrs[] = { + &attr_corrected.attr, + NULL +}; + +static struct kobj_type fec_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = fec_attrs +}; + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. + */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + int r; + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + struct mapped_device *md = dm_table_get_md(ti->table); + u64 hash_blocks; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* Create a kobject and sysfs attributes */ + init_completion(&f->kobj_holder.completion); + + r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", "fec"); + if (r) { + ti->error = "Cannot create kobject"; + return r; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. + */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + f->bufio = dm_bufio_client_create(f->dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + if (dm_bufio_get_device_size(f->bufio) < + ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (!f->rs_pool) { + ti->error = "Cannot allocate RS pool"; + return -ENOMEM; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (!f->prealloc_pool) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return -ENOMEM; + } + + f->extra_pool = mempool_create_slab_pool(0, f->cache); + if (!f->extra_pool) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return -ENOMEM; + } + + /* Preallocate an output buffer for each thread */ + f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), + 1 << v->data_dev_block_bits); + if (!f->output_pool) { + ti->error = "Cannot allocate FEC output pool"; + return -ENOMEM; + } + + /* Reserve space for our per-bio data */ + ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000000..8c4bee052a73 --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen <samitolvanen@google.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm.h" +#include "dm-verity.h" +#include <linux/rslib.h> + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t *rs_pool; /* mempool for fio->rs */ + mempool_t *prealloc_pool; /* mempool for preallocated buffers */ + mempool_t *extra_pool; /* mempool for extra buffers */ + mempool_t *output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ + atomic_t corrected; /* corrected errors */ + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned verity_fec_status_table(struct dm_verity *v, + unsigned sz, char *result, + unsigned maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c index ccf41886ebcf..5c5d30cb6ec5 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity-target.c @@ -14,12 +14,11 @@ * access behavior. */ -#include "dm-bufio.h" +#include "dm-verity.h" +#include "dm-verity-fec.h" #include <linux/module.h> -#include <linux/device-mapper.h> #include <linux/reboot.h> -#include <crypto/hash.h> #define DM_MSG_PREFIX "verity" @@ -28,83 +27,18 @@ #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 -#define DM_VERITY_MAX_LEVELS 63 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" + +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); -enum verity_mode { - DM_VERITY_MODE_EIO, - DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART -}; - -enum verity_block_type { - DM_VERITY_BLOCK_TYPE_DATA, - DM_VERITY_BLOCK_TYPE_METADATA -}; - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - enum verity_mode mode; /* mode for handling verification errors */ - unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. */ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - - /* original values of bio->bi_end_io and bio->bi_private */ - bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; - - sector_t block; - unsigned n_blocks; - - struct bvec_iter iter; - - struct work_struct work; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). - */ -}; - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -112,21 +46,6 @@ struct dm_verity_prefetch_work { unsigned n_blocks; }; -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - /* * Auxiliary structure appended to each dm-bufio buffer. If the value * hash_verified is nonzero, hash of the block has been verified. @@ -173,6 +92,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } +/* + * Wrapper for crypto_shash_init, which handles verity salting. + */ +static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +{ + int r; + + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + return 0; +} + +static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len) +{ + int r = crypto_shash_update(desc, data, len); + + if (unlikely(r < 0)) + DMERR("crypto_shash_update failed: %d", r); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, + u8 *digest) +{ + int r; + + if (unlikely(!v->version)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + r = crypto_shash_final(desc, digest); + + if (unlikely(r < 0)) + DMERR("crypto_shash_final failed: %d", r); + + return r; +} + +int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) +{ + int r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) + return r; + + r = verity_hash_update(v, desc, data, len); + if (unlikely(r < 0)) + return r; + + return verity_hash_final(v, desc, digest); +} + static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, sector_t *hash_block, unsigned *offset) { @@ -246,17 +243,17 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). + * against current value of verity_io_want_digest(v, io). */ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) { - struct dm_verity *v = io->v; struct dm_buffer *buf; struct buffer_aux *aux; u8 *data; @@ -273,72 +270,128 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, aux = dm_bufio_get_aux_data(buf); if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - if (skip_unverified) { r = 1; goto release_ret_r; } - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->hash_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) goto release_ret_r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) == 0)) + aux->hash_verified = 1; + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; goto release_ret_r; } + } - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } + data += offset; + memcpy(want_digest, data, v->digest_size); + r = 0; - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block)) { - r = -EIO; - goto release_ret_r; - } - } else - aux->hash_verified = 1; +release_ret_r: + dm_bufio_release(buf); + return r; +} + +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero) +{ + int r = 0, i; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + goto out; } - data += offset; + memcpy(digest, v->root_digest, v->digest_size); - memcpy(io_want_digest(v, io), data, v->digest_size); + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + goto out; + } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; + + return r; +} + +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. + */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + do { + int r; + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); - dm_bufio_release(buf); return 0; +} -release_ret_r: - dm_bufio_release(buf); +static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); +} - return r; +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; } /* @@ -346,99 +399,56 @@ release_ret_r: */ static int verity_verify_io(struct dm_verity_io *io) { + bool is_zero; struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, - v->ti->per_bio_data_size); + struct bvec_iter start; unsigned b; - int i; for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; int r; - unsigned todo; + struct shash_desc *desc = verity_io_hash_desc(v, io); + + r = verity_hash_for_block(v, io, io->block + b, + verity_io_want_digest(v, io), + &is_zero); + if (unlikely(r < 0)) + return r; - if (likely(v->levels)) { + if (is_zero) { /* - * First, we try to get the requested hash for - * the current block. If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. + * If we expect a zero block, don't validate, just + * return zeros. */ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) + r = verity_for_bv_block(v, io, &io->iter, + verity_bv_zero); + if (unlikely(r < 0)) return r; - } - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; + continue; } -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) return r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - todo = 1 << v->data_dev_block_bits; - do { - u8 *page; - unsigned len; - struct bio_vec bv = bio_iter_iovec(bio, io->iter); - - page = kmap_atomic(bv.bv_page); - len = bv.bv_len; - if (likely(len >= todo)) - len = todo; - r = crypto_shash_update(desc, page + bv.bv_offset, len); - kunmap_atomic(page); - - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - - bio_advance_iter(bio, &io->iter, len); - todo -= len; - } while (todo); - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } + start = io->iter; + r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + if (unlikely(r < 0)) + return r; - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); + r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); + if (unlikely(r < 0)) return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) - return -EIO; - } + + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) + continue; + else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b, NULL, &start) == 0) + continue; + else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } return 0; @@ -453,9 +463,10 @@ static void verity_finish_io(struct dm_verity_io *io, int error) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; bio->bi_error = error; + verity_fec_finish_io(io); + bio_endio(bio); } @@ -470,7 +481,7 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_error && !verity_fec_is_enabled(io->v)) { verity_finish_io(io, bio->bi_error); return; } @@ -566,7 +577,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, ti->per_bio_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; @@ -574,6 +584,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; + verity_fec_init_io(io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -588,6 +600,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; + unsigned args = 0; unsigned sz = 0; unsigned x; @@ -614,8 +627,17 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; + if (!args) + return; + DMEMIT(" %u", args); if (v->mode != DM_VERITY_MODE_EIO) { - DMEMIT(" 1 "); + DMEMIT(" "); switch (v->mode) { case DM_VERITY_MODE_LOGGING: DMEMIT(DM_VERITY_OPT_LOGGING); @@ -627,6 +649,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); + sz = verity_fec_status_table(v, sz, result, maxlen); break; } } @@ -677,6 +702,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->salt); kfree(v->root_digest); + kfree(v->zero_digest); if (v->tfm) crypto_free_shash(v->tfm); @@ -689,9 +715,94 @@ static void verity_dtr(struct dm_target *ti) if (v->data_dev) dm_put_device(ti, v->data_dev); + verity_fec_dtr(v); + kfree(v); } +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct shash_desc *desc; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + desc = kmalloc(v->shash_descsize, GFP_KERNEL); + + if (!desc) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest); + +out: + kfree(desc); + kfree(zero_data); + + return r; +} + +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +{ + int r; + unsigned argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { + v->mode = DM_VERITY_MODE_LOGGING; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { + v->mode = DM_VERITY_MODE_RESTART; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero digest"; + return r; + } + continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; + } + + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + /* * Target parameters: * <version> The current format is version 1. @@ -710,18 +821,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; - const char *opt_string; - unsigned int num, opt_params; + unsigned int num; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -730,6 +836,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = v; v->ti = ti; + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; @@ -866,29 +976,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (r) + r = verity_parse_opt_args(&as, v); + if (r < 0) goto bad; - - while (opt_params) { - opt_params--; - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - r = -EINVAL; - goto bad; - } - - if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) - v->mode = DM_VERITY_MODE_LOGGING; - else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) - v->mode = DM_VERITY_MODE_RESTART; - else { - ti->error = "Invalid feature arguments"; - r = -EINVAL; - goto bad; - } - } } v->hash_per_block_bits = @@ -938,8 +1028,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -948,6 +1036,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ti->per_bio_data_size = sizeof(struct dm_verity_io) + + v->shash_descsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_bio_data_size = roundup(ti->per_bio_data_size, + __alignof__(struct dm_verity_io)); + return 0; bad: @@ -958,7 +1056,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000000..fb419f422d73 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. + */ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include "dm-bufio.h" +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity_fec; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * and verity_io_want_digest(). + */ +}; + +static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +static inline u8 *verity_io_digest_end(struct dm_verity *v, + struct dm_verity_io *io) +{ + return verity_io_want_digest(v, io) + v->digest_size; +} + +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + +extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero); + +#endif /* DM_VERITY_H */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5df40480228b..c338aebb4ccd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1109,12 +1109,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (run_queue) { - if (md->queue->mq_ops) - blk_mq_run_hw_queues(md->queue, true); - else - blk_run_queue_async(md->queue); - } + if (!md->queue->mq_ops && run_queue) + blk_run_queue_async(md->queue); /* * dm_put() must be at the end of this function. See the comment above @@ -1191,6 +1187,8 @@ static void dm_unprep_request(struct request *rq) if (clone) free_rq_clone(clone); + else if (!tio->md->queue->mq_ops) + free_rq_tio(tio); } /* @@ -1212,9 +1210,9 @@ static void dm_requeue_original_request(struct mapped_device *md, { int rw = rq_data_dir(rq); + rq_end_stats(md, rq); dm_unprep_request(rq); - rq_end_stats(md, rq); if (!rq->q->mq_ops) old_requeue_request(rq); else { @@ -1334,7 +1332,10 @@ static void dm_complete_request(struct request *rq, int error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_complete_request(rq); + if (!rq->q->mq_ops) + blk_complete_request(rq); + else + blk_mq_complete_request(rq, error); } /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 61aacab424cf..c57fdf847b47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -293,6 +293,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) * go away inside make_request */ sectors = bio_sectors(bio); + /* bio could be mergeable after passing to underlayer */ + bio->bi_rw &= ~REQ_NOMERGE; mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); @@ -2017,28 +2019,32 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; + char name[BDEVNAME_SIZE]; if (!mddev->gendisk) - return; + return 0; bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - WARN_ON_ONCE(!mddev->suspended); - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); + return -ENXIO; + } + + return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca0b643fe3c1..dfa57b41541b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -657,7 +657,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7331a80d89f1..dd483bb2e111 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -129,7 +129,9 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) } multipath = conf->multipaths + mp_bh->path; - mp_bh->bio = *bio; + bio_init(&mp_bh->bio); + __bio_clone_fast(&mp_bh->bio, bio); + mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; @@ -257,6 +259,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -264,9 +269,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); break; } diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index fca6dbcf9a47..7e44005595c1 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -152,12 +152,9 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) static int brb_pop(struct bop_ring_buffer *brb) { - struct block_op *bop; - if (brb_empty(brb)) return -ENODATA; - bop = brb->bops + brb->begin; brb->begin = brb_next(brb, brb->begin); return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2169ff6e0f0..515554c7365b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); @@ -2274,6 +2274,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) if (fail) { spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; spin_unlock_irq(&conf->device_lock); md_wakeup_thread(conf->mddev->thread); } else { @@ -2391,8 +2392,10 @@ static void raid1d(struct md_thread *thread) LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { - list_add(&tmp, &conf->bio_end_io_list); - list_del_init(&conf->bio_end_io_list); + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } } spin_unlock_irqrestore(&conf->device_lock, flags); while (!list_empty(&tmp)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 96c1f6c3eac9..c67e11882278 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1787,6 +1787,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1828,9 +1831,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2753,6 +2753,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (fail) { spin_lock_irq(&conf->device_lock); list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; spin_unlock_irq(&conf->device_lock); md_wakeup_thread(conf->mddev->thread); } else { @@ -2780,8 +2781,10 @@ static void raid10d(struct md_thread *thread) LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { - list_add(&tmp, &conf->bio_end_io_list); - list_del_init(&conf->bio_end_io_list); + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } } spin_unlock_irqrestore(&conf->device_lock, flags); while (!list_empty(&tmp)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7fcfbf8..10ce885445f6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -340,8 +340,7 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - unsigned long do_wakeup = 0; - int i = 0; + bool do_wakeup = false; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -362,19 +361,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(&conf->empty_inactive_list_nr); list_splice_tail_init(list, conf->inactive_list + hash); - do_wakeup |= 1 << hash; + do_wakeup = true; spin_unlock_irqrestore(conf->hash_locks + hash, flags); } size--; hash--; } - for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { - if (do_wakeup & (1 << i)) - wake_up(&conf->wait_for_stripe[i]); - } - if (do_wakeup) { + wake_up(&conf->wait_for_stripe); if (atomic_read(&conf->active_stripes) == 0) wake_up(&conf->wait_for_quiescent); if (conf->retry_read_aligned) @@ -687,15 +682,14 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - wait_event_exclusive_cmd( - conf->wait_for_stripe[hash], + wait_event_lock_irq( + conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), - spin_unlock_irq(conf->hash_locks + hash), - spin_lock_irq(conf->hash_locks + hash)); + *(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); } else { @@ -720,9 +714,6 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); - if (!list_empty(conf->inactive_list + hash)) - wake_up(&conf->wait_for_stripe[hash]); - spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -2091,6 +2082,14 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) unsigned long cpu; int err = 0; + /* + * Never shrink. And mddev_suspend() could deadlock if this is called + * from raid5d. In that case, scribble_disks and scribble_sectors + * should equal to new_disks and new_sectors + */ + if (conf->scribble_disks >= new_disks && + conf->scribble_sectors >= new_sectors) + return 0; mddev_suspend(conf->mddev); get_online_cpus(); for_each_present_cpu(cpu) { @@ -2112,6 +2111,10 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) } put_online_cpus(); mddev_resume(conf->mddev); + if (!err) { + conf->scribble_disks = new_disks; + conf->scribble_sectors = new_sectors; + } return err; } @@ -2192,7 +2195,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_exclusive_cmd(conf->wait_for_stripe[hash], + wait_event_cmd(conf->wait_for_stripe, !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -4238,7 +4241,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | (1 << STRIPE_SYNCING) | (1 << STRIPE_REPLACED) | - (1 << STRIPE_PREREAD_ACTIVE) | (1 << STRIPE_DELAYED) | (1 << STRIPE_BIT_DELAY) | (1 << STRIPE_FULL_WRITE) | @@ -4253,6 +4255,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_REPLACED))); set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | + (1 << STRIPE_PREREAD_ACTIVE) | (1 << STRIPE_DEGRADED)), head_sh->state & (1 << STRIPE_INSYNC)); @@ -6414,6 +6417,12 @@ static int raid5_alloc_percpu(struct r5conf *conf) } put_online_cpus(); + if (!err) { + conf->scribble_disks = max(conf->raid_disks, + conf->previous_raid_disks); + conf->scribble_sectors = max(conf->chunk_sectors, + conf->prev_chunk_sectors); + } return err; } @@ -6504,9 +6513,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) seqcount_init(&conf->gen_lock); mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); - for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { - init_waitqueue_head(&conf->wait_for_stripe[i]); - } + init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->hold_list); @@ -7015,8 +7022,8 @@ static int run(struct mddev *mddev) } if (discard_supported && - mddev->queue->limits.max_discard_sectors >= stripe && - mddev->queue->limits.discard_granularity >= stripe) + mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && + mddev->queue->limits.discard_granularity >= stripe) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); else diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a415e1cd39b8..517d4b68a1be 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -510,6 +510,8 @@ struct r5conf { * conversions */ } __percpu *percpu; + int scribble_disks; + int scribble_sectors; #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif @@ -522,7 +524,7 @@ struct r5conf { atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; + wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, |