diff options
Diffstat (limited to 'drivers/md')
43 files changed, 714 insertions, 480 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index aebb7ef10e63..5a79bb3c272f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -275,7 +275,7 @@ struct bcache_device { int (*cache_miss)(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors); - int (*ioctl)(struct bcache_device *d, fmode_t mode, + int (*ioctl)(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg); }; @@ -1004,11 +1004,11 @@ extern struct workqueue_struct *bch_flush_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; -extern struct kobj_type bch_cached_dev_ktype; -extern struct kobj_type bch_flash_dev_ktype; -extern struct kobj_type bch_cache_set_ktype; -extern struct kobj_type bch_cache_set_internal_ktype; -extern struct kobj_type bch_cache_ktype; +extern const struct kobj_type bch_cached_dev_ktype; +extern const struct kobj_type bch_flash_dev_ktype; +extern const struct kobj_type bch_cache_set_ktype; +extern const struct kobj_type bch_cache_set_internal_ktype; +extern const struct kobj_type bch_cache_ktype; void bch_cached_dev_release(struct kobject *kobj); void bch_flash_dev_release(struct kobject *kobj); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 569f48958bde..fd121a61f17c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -906,7 +906,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -static void bch_cannibalize_unlock(struct cache_set *c) +void bch_cannibalize_unlock(struct cache_set *c) { spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { @@ -1111,10 +1111,12 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, struct btree *parent) { BKEY_PADDED(key) k; - struct btree *b = ERR_PTR(-EAGAIN); + struct btree *b; mutex_lock(&c->bucket_lock); retry: + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; @@ -1159,7 +1161,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); @@ -1361,7 +1363,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) + while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; @@ -1373,7 +1375,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = 0; i < nodes; i++) { new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); - if (IS_ERR_OR_NULL(new_nodes[i])) + if (IS_ERR(new_nodes[i])) goto out_nocoalesce; } @@ -1508,7 +1510,7 @@ out_nocoalesce: bch_keylist_free(&keylist); for (i = 0; i < nodes; i++) - if (!IS_ERR_OR_NULL(new_nodes[i])) { + if (!IS_ERR(new_nodes[i])) { btree_node_free(new_nodes[i]); rw_unlock(true, new_nodes[i]); } @@ -1690,7 +1692,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { bch_btree_node_write_sync(n); bch_btree_set_root(n); @@ -1989,6 +1991,15 @@ static int bch_btree_check_thread(void *arg) c->gc_stats.nodes++; bch_btree_op_init(&op, 0); ret = bcache_btree(check_recurse, p, c->root, &op); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op)->wait); if (ret) goto out; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 17b1d201cec0..45d64b54115a 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -282,6 +282,7 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +void bch_cannibalize_unlock(struct cache_set *c); static inline void wake_up_gc(struct cache_set *c) { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 67a2e29e0b40..a9b1f3896249 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1228,7 +1228,7 @@ void cached_dev_submit_bio(struct bio *bio) detached_dev_do_request(d, bio, orig_bdev, start_time); } -static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1318,7 +1318,7 @@ void flash_dev_submit_bio(struct bio *bio) continue_at(cl, search_free, NULL); } -static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index bd3afc856d53..21b445f8af15 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -18,7 +18,6 @@ struct cache_stats { unsigned long cache_misses; unsigned long cache_bypass_hits; unsigned long cache_bypass_misses; - unsigned long cache_readaheads; unsigned long cache_miss_collisions; unsigned long sectors_bypassed; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7e9d19fd21dd..e2a803683105 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -732,9 +732,9 @@ out: /* Bcache device */ -static int open_dev(struct block_device *b, fmode_t mode) +static int open_dev(struct gendisk *disk, blk_mode_t mode) { - struct bcache_device *d = b->bd_disk->private_data; + struct bcache_device *d = disk->private_data; if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; @@ -743,14 +743,14 @@ static int open_dev(struct block_device *b, fmode_t mode) return 0; } -static void release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b) { struct bcache_device *d = b->private_data; closure_put(&d->cl); } -static int ioctl_dev(struct block_device *b, fmode_t mode, +static int ioctl_dev(struct block_device *b, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct bcache_device *d = b->bd_disk->private_data; @@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl) put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(dc->bdev, bcache_kobj); wake_up(&unregister_wait); @@ -1723,7 +1723,7 @@ static void cache_set_flush(struct closure *cl) if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); - if (!IS_ERR_OR_NULL(c->root)) + if (!IS_ERR(c->root)) list_add(&c->root->list, &c->btree_cache); /* @@ -2087,7 +2087,7 @@ static int run_cache_set(struct cache_set *c) err = "cannot allocate new btree root"; c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; mutex_lock(&c->root->write_lock); @@ -2218,7 +2218,7 @@ void bch_cache_release(struct kobject *kobj) put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(ca->bdev, bcache_kobj); kfree(ca); module_put(THIS_MODULE); @@ -2359,7 +2359,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, * call blkdev_put() to bdev in bch_cache_release(). So we * explicitly call blkdev_put() here. */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, bcache_kobj); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) @@ -2461,7 +2461,7 @@ static void register_bdev_worker(struct work_struct *work) if (!dc) { fail = true; put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(args->bdev, bcache_kobj); goto out; } @@ -2491,7 +2491,7 @@ static void register_cache_worker(struct work_struct *work) if (!ca) { fail = true; put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(args->bdev, bcache_kobj); goto out; } @@ -2558,9 +2558,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), - FMODE_READ|FMODE_WRITE|FMODE_EXCL, - sb); + bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE, + bcache_kobj, NULL); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { dev_t dev; @@ -2648,7 +2647,7 @@ async_done: out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, register_bcache); out_free_sb: kfree(sb); out_free_path: diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6f677059214..0e2c1880f60b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1111,26 +1111,25 @@ SHOW(__bch_cache) vfree(p); - ret = scnprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Clean: %zu%%\n" - "Dirty: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - available * 100 / (size_t) ca->sb.nbuckets, - dirty * 100 / (size_t) ca->sb.nbuckets, - meta * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + ret = sysfs_emit(buf, + "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); + ret += sysfs_emit_at(buf, ret, "%u ", q[i]); ret--; - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + ret += sysfs_emit_at(buf, ret, "]\n"); return ret; } diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index a2ff6447b699..65b8bd975ab1 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -3,7 +3,7 @@ #define _BCACHE_SYSFS_H_ #define KTYPE(type) \ -struct kobj_type type ## _ktype = { \ +const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &((const struct sysfs_ops) { \ .show = type ## _show, \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d4a5fc0650bb..24c049067f61 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -890,6 +890,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, if (ret < 0) pr_warn("sectors dirty init failed, ret=%d!\n", ret); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + return ret; } diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 872896218550..911f73f7ebba 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2051,8 +2051,8 @@ static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->metadata_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); if (r) { *error = "Error opening metadata device"; return r; @@ -2074,8 +2074,8 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->cache_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); if (r) { *error = "Error opening cache device"; return r; @@ -2093,8 +2093,8 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->origin_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); if (r) { *error = "Error opening origin device"; return r; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index f467cdb5a022..94b2fc33f64b 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1683,8 +1683,8 @@ static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char * int r; sector_t metadata_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &clone->metadata_dev); + r = dm_get_device(clone->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->metadata_dev); if (r) { *error = "Error opening metadata device"; return r; @@ -1703,8 +1703,8 @@ static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **err int r; sector_t dest_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &clone->dest_dev); + r = dm_get_device(clone->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->dest_dev); if (r) { *error = "Error opening destination device"; return r; @@ -1725,7 +1725,7 @@ static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **e int r; sector_t source_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, + r = dm_get_device(clone->ti, dm_shift_arg(as), BLK_OPEN_READ, &clone->source_dev); if (r) { *error = "Error opening source device"; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index aecab0c0720f..ce913ad91a52 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -207,11 +207,10 @@ struct dm_table { unsigned integrity_added:1; /* - * Indicates the rw permissions for the new logical - * device. This should be a combination of FMODE_READ - * and FMODE_WRITE. + * Indicates the rw permissions for the new logical device. This + * should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE. */ - fmode_t mode; + blk_mode_t mode; /* a list of devices used by this table */ struct list_head devices; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8b47b913ee83..09e37ebf7cc8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1693,8 +1693,7 @@ retry: len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - bio_add_page(clone, page, len, 0); - + __bio_add_page(clone, page, len, 0); remaining_size -= len; } diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 0d70914217ee..6acfa5bf97a4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1482,14 +1482,16 @@ static int era_ctr(struct dm_target *ti, unsigned int argc, char **argv) era->ti = ti; - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); + r = dm_get_device(ti, argv[0], BLK_OPEN_READ | BLK_OPEN_WRITE, + &era->metadata_dev); if (r) { ti->error = "Error opening metadata device"; era_destroy(era); return -EINVAL; } - r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, + &era->origin_dev); if (r) { ti->error = "Error opening data device"; era_destroy(era); diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index d369457dbed0..2a71bcdba92d 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -293,8 +293,10 @@ static int __init dm_init_init(void) for (i = 0; i < ARRAY_SIZE(waitfor); i++) { if (waitfor[i]) { + dev_t dev; + DMINFO("waiting for device %s ...", waitfor[i]); - while (!dm_get_dev_t(waitfor[i])) + while (early_lookup_bdev(waitfor[i], &dev)) fsleep(5000); } } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index cc77cf3d4109..6d301019e5e3 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -861,7 +861,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) table = dm_get_inactive_table(md, &srcu_idx); if (table) { - if (!(dm_table_get_mode(table) & FMODE_WRITE)) + if (!(dm_table_get_mode(table) & BLK_OPEN_WRITE)) param->flags |= DM_READONLY_FLAG; param->target_count = table->num_targets; } @@ -1168,13 +1168,10 @@ static int do_resume(struct dm_ioctl *param) /* Do we need to load a new map ? */ if (new_map) { sector_t old_size, new_size; - int srcu_idx; /* Suspend if it isn't already suspended */ - old_map = dm_get_live_table(md, &srcu_idx); - if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map) + if (param->flags & DM_SKIP_LOCKFS_FLAG) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; - dm_put_live_table(md, srcu_idx); if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended_md(md)) @@ -1192,7 +1189,7 @@ static int do_resume(struct dm_ioctl *param) if (old_size && new_size && old_size != new_size) need_resize_uevent = true; - if (dm_table_get_mode(new_map) & FMODE_WRITE) + if (dm_table_get_mode(new_map) & BLK_OPEN_WRITE) set_disk_ro(dm_disk(md), 0); else set_disk_ro(dm_disk(md), 1); @@ -1381,12 +1378,12 @@ static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_ return 0; } -static inline fmode_t get_mode(struct dm_ioctl *param) +static inline blk_mode_t get_mode(struct dm_ioctl *param) { - fmode_t mode = FMODE_READ | FMODE_WRITE; + blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE; if (param->flags & DM_READONLY_FLAG) - mode = FMODE_READ; + mode = BLK_OPEN_READ; return mode; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c8821fcb8299..8846bf510a35 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, * canceling read-auto mode */ mddev->ro = 0; - if (!mddev->suspended && mddev->sync_thread) + if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!mddev->suspended && mddev->thread) + if (!mddev->suspended) md_wakeup_thread(mddev->thread); return 0; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9c49f53760d0..bf7a574499a3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1241,9 +1241,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; - dev_t origin_dev, cow_dev; unsigned int args_used, num_flush_bios = 1; - fmode_t origin_mode = FMODE_READ; + blk_mode_t origin_mode = BLK_OPEN_READ; if (argc < 4) { ti->error = "requires 4 or more arguments"; @@ -1253,7 +1252,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (dm_target_is_snapshot_merge(ti)) { num_flush_bios = 2; - origin_mode = FMODE_WRITE; + origin_mode = BLK_OPEN_WRITE; } s = kzalloc(sizeof(*s), GFP_KERNEL); @@ -1279,24 +1278,21 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot get origin device"; goto bad_origin; } - origin_dev = s->origin->bdev->bd_dev; cow_path = argv[0]; argv++; argc--; - cow_dev = dm_get_dev_t(cow_path); - if (cow_dev && cow_dev == origin_dev) { - ti->error = "COW device cannot be the same as origin device"; - r = -EINVAL; - goto bad_cow; - } - r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; goto bad_cow; } + if (s->cow->bdev && s->cow->bdev == s->origin->bdev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_store; + } r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); if (r) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1398f1d6e83e..7d208b2b1a19 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -126,7 +126,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) return 0; } -int dm_table_create(struct dm_table **result, fmode_t mode, +int dm_table_create(struct dm_table **result, blk_mode_t mode, unsigned int num_targets, struct mapped_device *md) { struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); @@ -304,7 +304,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * device and not to touch the existing bdev field in case * it is accessed concurrently. */ -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, +static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode, struct mapped_device *md) { int r; @@ -324,23 +324,13 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, } /* - * Convert the path to a device - */ -dev_t dm_get_dev_t(const char *path) -{ - dev_t dev; - - if (lookup_bdev(path, &dev)) - dev = name_to_dev_t(path); - return dev; -} -EXPORT_SYMBOL_GPL(dm_get_dev_t); - -/* * Add a device to the list, or just increment the usage count if * it's already present. + * + * Note: the __ref annotation is because this function can call the __init + * marked early_lookup_bdev when called during early boot code from dm-init.c. */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, +int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, struct dm_dev **result) { int r; @@ -358,9 +348,13 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, if (MAJOR(dev) != major || MINOR(dev) != minor) return -EOVERFLOW; } else { - dev = dm_get_dev_t(path); - if (!dev) - return -ENODEV; + r = lookup_bdev(path, &dev); +#ifndef MODULE + if (r && system_state < SYSTEM_RUNNING) + r = early_lookup_bdev(path, &dev); +#endif + if (r) + return r; } if (dev == disk_devt(t->md->disk)) return -EINVAL; @@ -668,7 +662,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->singleton = true; } - if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { + if (dm_target_always_writeable(ti->type) && + !(t->mode & BLK_OPEN_WRITE)) { ti->error = "target type may not be included in a read-only table"; goto bad; } @@ -2039,7 +2034,7 @@ struct list_head *dm_table_get_devices(struct dm_table *t) return &t->devices; } -fmode_t dm_table_get_mode(struct dm_table *t) +blk_mode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..9dd0409848ab 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1756,13 +1756,15 @@ int dm_thin_remove_range(struct dm_thin_device *td, int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) { - int r; + int r = -EINVAL; uint32_t ref_count; down_read(&pmd->root_lock); - r = dm_sm_get_count(pmd->data_sm, b, &ref_count); - if (!r) - *result = (ref_count > 1); + if (!pmd->fail_io) { + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count > 1); + } up_read(&pmd->root_lock); return r; @@ -1770,10 +1772,11 @@ int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *re int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) { - int r = 0; + int r = -EINVAL; pmd_write_lock(pmd); - r = dm_sm_inc_blocks(pmd->data_sm, b, e); + if (!pmd->fail_io) + r = dm_sm_inc_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; @@ -1781,10 +1784,11 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e) { - int r = 0; + int r = -EINVAL; pmd_write_lock(pmd); - r = dm_sm_dec_blocks(pmd->data_sm, b, e); + if (!pmd->fail_io) + r = dm_sm_dec_blocks(pmd->data_sm, b, e); pmd_write_unlock(pmd); return r; @@ -1887,7 +1891,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). - * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b13c949bd72..f1d0dcb9db22 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -401,8 +401,7 @@ static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t da sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOWAIT, - &op->bio); + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); } static void end_discard(struct discard_op *op, int r) @@ -3301,7 +3300,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned long block_size; dm_block_t low_water_blocks; struct dm_dev *metadata_dev; - fmode_t metadata_mode; + blk_mode_t metadata_mode; /* * FIXME Remove validation from scope of lock. @@ -3334,7 +3333,8 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto out_unlock; - metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + metadata_mode = BLK_OPEN_READ | + ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE); r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); if (r) { ti->error = "Error opening metadata block device"; @@ -3342,7 +3342,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) } warn_if_metadata_device_too_big(metadata_dev->bdev); - r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, &data_dev); if (r) { ti->error = "Error getting data device"; goto out_metadata; @@ -4223,7 +4223,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_origin_dev; } - r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev); if (r) { ti->error = "Error opening origin device"; goto bad_origin_dev; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a9ee2faa75a2..3ef9f018da60 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -607,7 +607,7 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { - r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e35c16e06d06..26adcfea0302 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1196,7 +1196,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + if ((dm_table_get_mode(ti->table) & ~BLK_OPEN_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; goto bad; @@ -1225,13 +1225,13 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->version = num; - r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ, &v->data_dev); if (r) { ti->error = "Data device lookup failed"; goto bad; } - r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &v->hash_dev); if (r) { ti->error = "Hash device lookup failed"; goto bad; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 8f0896a6990b..9d3cca8e3dc9 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -577,7 +577,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, bio->bi_iter.bi_sector = dmz_blk2sect(block); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); return mblk; @@ -728,7 +728,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, bio->bi_iter.bi_sector = dmz_blk2sect(block); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); return 0; @@ -752,7 +752,7 @@ static int dmz_rdwr_block(struct dmz_dev *dev, enum req_op op, bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO, GFP_NOIO); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3b694ba3a106..ae0a88defb41 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -310,13 +310,13 @@ int dm_deleting_md(struct mapped_device *md) return test_bit(DMF_DELETING, &md->flags); } -static int dm_blk_open(struct block_device *bdev, fmode_t mode) +static int dm_blk_open(struct gendisk *disk, blk_mode_t mode) { struct mapped_device *md; spin_lock(&_minor_lock); - md = bdev->bd_disk->private_data; + md = disk->private_data; if (!md) goto out; @@ -334,7 +334,7 @@ out: return md ? 0 : -ENXIO; } -static void dm_blk_close(struct gendisk *disk, fmode_t mode) +static void dm_blk_close(struct gendisk *disk) { struct mapped_device *md; @@ -448,7 +448,7 @@ static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) dm_put_live_table(md, srcu_idx); } -static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, +static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -734,7 +734,7 @@ static char *_dm_claim_ptr = "I belong to device-mapper"; * Open a table device so we can use it as a map destination. */ static struct table_device *open_table_device(struct mapped_device *md, - dev_t dev, fmode_t mode) + dev_t dev, blk_mode_t mode) { struct table_device *td; struct block_device *bdev; @@ -746,7 +746,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr); + bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL); if (IS_ERR(bdev)) { r = PTR_ERR(bdev); goto out_free_td; @@ -771,7 +771,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return td; out_blkdev_put: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, _dm_claim_ptr); out_free_td: kfree(td); return ERR_PTR(r); @@ -784,14 +784,14 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + blkdev_put(td->dm_dev.bdev, _dm_claim_ptr); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) + blk_mode_t mode) { struct table_device *td; @@ -802,7 +802,7 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, return NULL; } -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode, struct dm_dev **result) { struct table_device *td; @@ -1172,7 +1172,8 @@ static inline sector_t max_io_len_target_boundary(struct dm_target *ti, } static sector_t __max_io_len(struct dm_target *ti, sector_t sector, - unsigned int max_granularity) + unsigned int max_granularity, + unsigned int max_sectors) { sector_t target_offset = dm_target_offset(ti, sector); sector_t len = max_io_len_target_boundary(ti, target_offset); @@ -1186,13 +1187,13 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, if (!max_granularity) return len; return min_t(sector_t, len, - min(queue_max_sectors(ti->table->md->queue), + min(max_sectors ? : queue_max_sectors(ti->table->md->queue), blk_chunk_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) { - return __max_io_len(ti, sector, ti->max_io_len); + return __max_io_len(ti, sector, ti->max_io_len, 0); } int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) @@ -1581,12 +1582,13 @@ static void __send_empty_flush(struct clone_info *ci) static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, - unsigned int max_granularity) + unsigned int max_granularity, + unsigned int max_sectors) { unsigned int len, bios; len = min_t(sector_t, ci->sector_count, - __max_io_len(ti, ci->sector, max_granularity)); + __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, num_bios, &len); @@ -1623,23 +1625,27 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, { unsigned int num_bios = 0; unsigned int max_granularity = 0; + unsigned int max_sectors = 0; struct queue_limits *limits = dm_get_queue_limits(ti->table->md); switch (bio_op(ci->bio)) { case REQ_OP_DISCARD: num_bios = ti->num_discard_bios; + max_sectors = limits->max_discard_sectors; if (ti->max_discard_granularity) - max_granularity = limits->max_discard_sectors; + max_granularity = max_sectors; break; case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; + max_sectors = limits->max_secure_erase_sectors; if (ti->max_secure_erase_granularity) - max_granularity = limits->max_secure_erase_sectors; + max_granularity = max_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; + max_sectors = limits->max_write_zeroes_sectors; if (ti->max_write_zeroes_granularity) - max_granularity = limits->max_write_zeroes_sectors; + max_granularity = max_sectors; break; default: break; @@ -1654,7 +1660,8 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, if (unlikely(!num_bios)) return BLK_STS_NOTSUPP; - __send_changing_extent_only(ci, ti, num_bios, max_granularity); + __send_changing_extent_only(ci, ti, num_bios, + max_granularity, max_sectors); return BLK_STS_OK; } @@ -2808,6 +2815,10 @@ retry: } map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map) { + /* avoid deadlock with fs/namespace.c:do_mount() */ + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + } r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); if (r) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a856e0aee73b..63d9010d8e61 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -203,7 +203,7 @@ int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode, struct dm_dev **result); void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 91836e6de326..6eaa0eab40f9 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -147,7 +147,8 @@ static void __init md_setup_drive(struct md_setup_args *args) if (p) *p++ = 0; - dev = name_to_dev_t(devname); + if (early_lookup_bdev(devname, &dev)) + dev = 0; if (strncmp(devname, "/dev/", 5) == 0) devname += 5; snprintf(comp_name, 63, "/dev/%s", devname); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bc8d7565171d..1ff712889a3b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -54,14 +54,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -1023,7 +1016,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) return set; } - /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -1033,8 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!bitmap || !bitmap->storage.filemap || - test_bit(BITMAP_STALE, &bitmap->flags)) + if (!md_bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -1063,6 +1054,35 @@ void md_bitmap_unplug(struct bitmap *bitmap) } EXPORT_SYMBOL(md_bitmap_unplug); +struct bitmap_unplug_work { + struct work_struct work; + struct bitmap *bitmap; + struct completion *done; +}; + +static void md_bitmap_unplug_fn(struct work_struct *work) +{ + struct bitmap_unplug_work *unplug_work = + container_of(work, struct bitmap_unplug_work, work); + + md_bitmap_unplug(unplug_work->bitmap); + complete(unplug_work->done); +} + +void md_bitmap_unplug_async(struct bitmap *bitmap) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct bitmap_unplug_work unplug_work; + + INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn); + unplug_work.bitmap = bitmap; + unplug_work.done = &done; + + queue_work(md_bitmap_wq, &unplug_work.work); + wait_for_completion(&done); +} +EXPORT_SYMBOL(md_bitmap_unplug_async); + static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the @@ -1241,11 +1261,28 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); +static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, + bool force) +{ + struct md_thread *thread; + + rcu_read_lock(); + thread = rcu_dereference(mddev->thread); + + if (!thread) + goto out; + + if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) + thread->timeout = timeout; + +out: + rcu_read_unlock(); +} + /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ - void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; @@ -1269,7 +1306,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; @@ -1366,8 +1403,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1387,6 +1423,14 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || @@ -1820,8 +1864,7 @@ void md_bitmap_destroy(struct mddev *mddev) mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } @@ -1964,7 +2007,7 @@ int md_bitmap_load(struct mddev *mddev) /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); md_bitmap_update_sb(bitmap); @@ -2469,17 +2512,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { - mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } - } + mddev_set_timeout(mddev, timeout, false); + md_wakeup_thread(mddev->thread); + return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index cfd7395de8fd..8a3788c9bfef 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t new_lo, sector_t new_hi); void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_unplug_async(struct bitmap *bitmap); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, @@ -273,6 +274,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); + +static inline bool md_bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap && bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..3d9fd74233df 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -75,14 +75,14 @@ struct md_cluster_info { sector_t suspend_hi; int suspend_from; /* the slot which broadcast suspend_lo/hi */ - struct md_thread *recovery_thread; + struct md_thread __rcu *recovery_thread; unsigned long recovery_map; /* communication loc resources */ struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; struct dlm_lock_resource *no_new_dev_lockres; - struct md_thread *recv_thread; + struct md_thread __rcu *recv_thread; struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot) set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { - cinfo->recovery_thread = md_register_thread(recover_bitmaps, - mddev, "recover"); + rcu_assign_pointer(cinfo->recovery_thread, + md_register_thread(recover_bitmaps, mddev, "recover")); if (!cinfo->recovery_thread) { pr_warn("md-cluster: Could not create recovery thread\n"); return; @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { int got_lock = 0; + struct md_thread *thread; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - wait_event(mddev->thread->wqueue, + + /* daemaon thread must exist */ + thread = rcu_dereference_protected(mddev->thread, true); + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); md_reload_sb(mddev, mddev->good_device_nr); @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes) } /* Initiate the communication resources */ ret = -ENOMEM; - cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + rcu_assign_pointer(cinfo->recv_thread, + md_register_thread(recv_daemon, mddev, "cluster_recv")); if (!cinfo->recv_thread) { pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); goto err; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 66edf5e72bd6..92c45be203d7 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev) if (ret) goto out_free_conf; - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); + rcu_assign_pointer(mddev->thread, + md_register_thread(multipathd, mddev, "multipath")); if (!mddev->thread) goto out_free_conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e344b4b3444..cf3733c90c47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -70,11 +70,7 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ +/* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -87,23 +83,13 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; -static struct workqueue_struct *md_rdev_misc_wq; +struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); - -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -360,10 +346,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -static bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -457,13 +439,19 @@ static void md_submit_bio(struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); - lockdep_assert_held(&mddev->reconfig_mutex); + struct md_thread *thread = rcu_dereference_protected(mddev->thread, + lockdep_is_held(&mddev->reconfig_mutex)); + + WARN_ON_ONCE(thread && current == thread->tsk); if (mddev->suspended++) return; wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + + if (mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); @@ -655,9 +643,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -759,6 +749,24 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + mutex_lock(&mddev->delete_mutex); + if (list_empty(&mddev->deleting)) + goto out; + + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } +out: + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -800,13 +808,10 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); + md_free_rdev(mddev); + md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -938,7 +943,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -979,7 +984,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else bio.bi_iter.bi_sector = sector + rdev->data_offset; - bio_add_page(&bio, page, size, 0); + __bio_add_page(&bio, page, size, 0); submit_bio_wait(&bio); @@ -2440,16 +2445,12 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); -static void export_rdev(struct md_rdev *rdev) +/* just for claiming the bdev */ +static struct md_rdev claim_rdev; + +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -2457,13 +2458,15 @@ static void export_rdev(struct md_rdev *rdev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev); rdev->bdev = NULL; kobject_put(&rdev->kobj); } static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2477,15 +2480,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3553,6 +3558,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3560,6 +3566,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3568,6 +3578,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -3612,6 +3626,7 @@ int md_rdev_init(struct md_rdev *rdev) return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); + /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -3624,7 +3639,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init); */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { - static struct md_rdev claim_rdev; /* just for claiming the bdev */ struct md_rdev *rdev; sector_t size; int err; @@ -3640,9 +3654,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - rdev->bdev = blkdev_get_by_dev(newdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - super_format == -2 ? &claim_rdev : rdev); + rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, + super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); @@ -3679,7 +3692,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(rdev->bdev, super_format == -2 ? &claim_rdev : rdev); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: @@ -3794,8 +3807,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3807,7 +3821,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; @@ -4477,6 +4491,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } @@ -4491,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4532,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -4560,7 +4561,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); mddev_unlock(mddev); if (!err) md_new_event(); @@ -4804,11 +4805,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } @@ -5592,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -6269,10 +6279,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; @@ -6333,10 +6345,12 @@ static int do_md_stop(struct mddev *mddev, int mode, } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && @@ -6498,7 +6512,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev); + export_rdev(rdev, mddev); } autorun_array(mddev); mddev_unlock(mddev); @@ -6508,7 +6522,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev); + export_rdev(rdev, mddev); } mddev_put(mddev); } @@ -6696,13 +6710,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev); + export_rdev(rdev, mddev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } @@ -6733,7 +6747,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; @@ -6746,7 +6759,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev); + export_rdev(rdev, mddev); return -EINVAL; } @@ -6772,7 +6785,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev); + export_rdev(rdev, mddev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -6787,7 +6800,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } } @@ -6797,7 +6810,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -6860,7 +6873,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } } @@ -6985,7 +6998,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } @@ -7486,7 +7499,7 @@ static int __md_set_array_info(struct mddev *mddev, void __user *argp) return err; } -static int md_ioctl(struct block_device *bdev, fmode_t mode, +static int md_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; @@ -7555,9 +7568,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -7718,7 +7728,7 @@ out: return err; } #ifdef CONFIG_COMPAT -static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -7767,13 +7777,13 @@ out_unlock: return err; } -static int md_open(struct block_device *bdev, fmode_t mode) +static int md_open(struct gendisk *disk, blk_mode_t mode) { struct mddev *mddev; int err; spin_lock(&all_mddevs_lock); - mddev = mddev_get(bdev->bd_disk->private_data); + mddev = mddev_get(disk->private_data); spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; @@ -7789,7 +7799,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - bdev_check_media_change(bdev); + disk_check_media_change(disk); return 0; out_unlock: @@ -7799,7 +7809,7 @@ out: return err; } -static void md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk) { struct mddev *mddev = disk->private_data; @@ -7886,13 +7896,29 @@ static int md_thread(void *arg) return 0; } -void md_wakeup_thread(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); +} + +void md_wakeup_thread(struct md_thread __rcu *thread) +{ + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + wake_up(&t->wqueue); } + rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); @@ -7922,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct md_thread __rcu **threadp) { - struct md_thread *thread; + struct md_thread *thread = rcu_dereference_protected(*threadp, true); - /* - * Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - thread = *threadp; - if (!thread) { - spin_unlock(&pers_lock); + if (!thread) return; - } - *threadp = NULL; - spin_unlock(&pers_lock); + + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -9100,6 +9119,7 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); + wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } @@ -9202,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); @@ -9619,9 +9638,10 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + goto err_bitmap_wq; ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) @@ -9641,8 +9661,8 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: + destroy_workqueue(md_bitmap_wq); +err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9938,8 +9958,8 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); + destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index fd8f260ed5f8..bfd2306bc750 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,8 +122,6 @@ struct md_rdev { struct serial_in_rdev *serial; /* used for raid1 io serialization */ - struct work_struct del_work; /* used for delayed sysfs removal */ - struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ @@ -367,8 +365,8 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ + struct md_thread __rcu *thread; /* management thread */ + struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* 'last_sync_action' is initialized to "none". It is set when a * sync operation (i.e "data-check", "requested-resync", "resync", @@ -531,6 +529,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; @@ -555,6 +561,23 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -614,6 +637,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -734,8 +758,8 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); +extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); @@ -828,6 +852,7 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; +extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index e61f6cad4e08..169ebe296f2d 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -21,6 +21,7 @@ #define IO_MADE_GOOD ((struct bio *)2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) +#define MAX_PLUG_BIO 32 /* for managing resync I/O pages */ struct resync_pages { @@ -31,6 +32,7 @@ struct resync_pages { struct raid1_plug_cb { struct blk_plug_cb cb; struct bio_list pending; + unsigned int count; }; static void rbio_pool_free(void *rbio, void *data) @@ -101,11 +103,73 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, struct page *page = resync_fetch_page(rp, idx); int len = min_t(int, size, PAGE_SIZE); - /* - * won't fail because the vec table is big - * enough to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return; + } + size -= len; } while (idx++ < RESYNC_PAGES && size > 0); } + + +static inline void raid1_submit_write(struct bio *bio) +{ + struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + + bio->bi_next = NULL; + bio_set_dev(bio, rdev->bdev); + if (test_bit(Faulty, &rdev->flags)) + bio_io_error(bio); + else if (unlikely(bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev))) + /* Just ignore it */ + bio_endio(bio); + else + submit_bio_noacct(bio); +} + +static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, + blk_plug_cb_fn unplug, int copies) +{ + struct raid1_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + /* + * If bitmap is not enabled, it's safe to submit the io directly, and + * this can get optimal performance. + */ + if (!md_bitmap_enabled(mddev->bitmap)) { + raid1_submit_write(bio); + return true; + } + + cb = blk_check_plugged(unplug, mddev, sizeof(*plug)); + if (!cb) + return false; + + plug = container_of(cb, struct raid1_plug_cb, cb); + bio_list_add(&plug->pending, bio); + if (++plug->count / MAX_PLUG_BIO >= copies) { + list_del(&cb->list); + cb->callback(cb, false); + } + + + return true; +} + +/* + * current->bio_list will be set under submit_bio() context, in this case bitmap + * io will be added to the list and wait for current io submission to finish, + * while current io submission must wait for bitmap io to be done. In order to + * avoid such deadlock, submit bitmap io asynchronously. + */ +static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +{ + if (current->bio_list) + md_bitmap_unplug_async(bitmap); + else + md_bitmap_unplug(bitmap); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 68a9e2d9985b..dd25832eb045 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,22 +794,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; cond_resched(); } @@ -1147,7 +1138,10 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, if (unlikely(!page)) goto free_pages; - bio_add_page(behind_bio, page, len, 0); + if (!bio_add_page(behind_bio, page, len, 0)) { + put_page(page); + goto free_pages; + } size -= len; i++; @@ -1175,7 +1169,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r1conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); @@ -1343,8 +1337,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct md_rdev *blocked_rdev; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; bool write_behind = false; @@ -1573,15 +1565,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; - - cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -2914,7 +2898,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * won't fail because the vec table is big * enough to hold all these pages */ - bio_add_page(bio, page, len, 0); + __bio_add_page(bio, page, len, 0); } } nr_sectors += len>>9; @@ -3084,7 +3068,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, "raid1"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid1d, mddev, "raid1")); if (!conf->thread) goto abort; @@ -3177,8 +3162,8 @@ static int raid1_run(struct mddev *mddev) /* * Ok, everything is just fine now */ - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index ebb6788820e7..468f189da7a0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -130,7 +130,7 @@ struct r1conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* Keep track of cluster resync window to send to other * nodes. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4fcfcb350d2b..d0de8c9fb3cf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -779,8 +779,16 @@ static struct md_rdev *read_balance(struct r10conf *conf, disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + r10_bio->devs[slot].addr + sectors > + rdev->recovery_offset) { + /* + * Read replacement first to prevent reading both rdev + * and replacement as NULL during replacement replace + * rdev. + */ + smp_mb(); rdev = rcu_dereference(conf->mirrors[disk].rdev); + } if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -902,25 +910,15 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - /* flush any pending bitmap writes to disk - * before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; + cond_resched(); } blk_finish_plug(&plug); } else @@ -982,6 +980,7 @@ static void lower_barrier(struct r10conf *conf) static bool stop_waiting_barrier(struct r10conf *conf) { struct bio_list *bio_list = current->bio_list; + struct md_thread *thread; /* barrier is dropped */ if (!conf->barrier) @@ -997,12 +996,14 @@ static bool stop_waiting_barrier(struct r10conf *conf) (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) return true; + /* daemon thread must exist while handling io */ + thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done. */ - if (conf->mddev->thread->tsk == current) { + if (thread->tsk == current) { WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); return true; } @@ -1113,7 +1114,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); @@ -1125,23 +1126,15 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - md_bitmap_unplug(mddev->bitmap); + raid1_prepare_flush_writes(mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; + cond_resched(); } kfree(plug); } @@ -1282,8 +1275,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; unsigned long flags; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; struct r10conf *conf = mddev->private; struct md_rdev *rdev; int devnum = r10_bio->devs[n_copy].devnum; @@ -1323,14 +1314,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, atomic_inc(&r10_bio->remaining); - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1479,9 +1463,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[d].replacement); + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(conf->mirrors[d].replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == rrdev) rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) @@ -2148,9 +2138,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r10conf *conf = mddev->private; int err = -EEXIST; - int mirror; + int mirror, repl_slot = -1; int first = 0; int last = conf->geo.raid_disks - 1; + struct raid10_info *p; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -2173,23 +2164,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) else mirror = first; for ( ; mirror <= last ; mirror++) { - struct raid10_info *p = &conf->mirrors[mirror]; + p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { - if (!test_bit(WantReplacement, &p->rdev->flags) || - p->replacement != NULL) - continue; - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); - break; + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL && repl_slot < 0) + repl_slot = mirror; + continue; } if (mddev->gendisk) @@ -2206,6 +2188,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } + if (err && repl_slot >= 0) { + p = &conf->mirrors[repl_slot]; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + } + print_conf(conf); return err; } @@ -3303,6 +3298,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; + int error_disk = -1; /* * Allow skipping a full rebuild for incremental assembly @@ -3386,8 +3382,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return reshape_request(mddev, sector_nr, skipped); if (chunks_skipped >= conf->geo.raid_disks) { - /* if there has been nothing to do on any drive, - * then there is nothing to do at all.. + pr_err("md/raid10:%s: %s fails\n", mdname(mddev), + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); + if (error_disk >= 0 && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* + * recovery fails, set mirrors.recovery_disabled, + * device shouldn't be added to there. + */ + conf->mirrors[error_disk].recovery_disabled = + mddev->recovery_disabled; + return 0; + } + /* + * if there has been nothing to do on any drive, + * then there is nothing to do at all. */ *skipped = 1; return (max_sector - sector_nr) + sectors_skipped; @@ -3437,8 +3446,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; - int need_recover = 0; - int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3446,15 +3453,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if (mrdev != NULL && - !test_bit(Faulty, &mrdev->flags) && - !test_bit(In_sync, &mrdev->flags)) - need_recover = 1; - if (mreplace != NULL && - !test_bit(Faulty, &mreplace->flags)) - need_replace = 1; + if (mrdev && (test_bit(Faulty, &mrdev->flags) || + test_bit(In_sync, &mrdev->flags))) + mrdev = NULL; + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; - if (!need_recover && !need_replace) { + if (!mrdev && !mreplace) { rcu_read_unlock(); continue; } @@ -3470,8 +3475,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - if (mreplace && test_bit(Faulty, &mreplace->flags)) - mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3490,7 +3493,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - atomic_inc(&mrdev->nr_pending); + if (mrdev) + atomic_inc(&mrdev->nr_pending); if (mreplace) atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); @@ -3577,7 +3581,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (need_recover) { + if (mrdev) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3594,11 +3598,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if need_replace, then bio + /* Note: if replace is not NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. */ - if (!need_replace) + if (!mreplace) break; bio->bi_next = biolist; biolist = bio; @@ -3622,7 +3626,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (!test_bit(In_sync, + if (mrdev && !test_bit(In_sync, &mrdev->flags) && !rdev_set_badblocks( mrdev, @@ -3643,17 +3647,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mdname(mddev)); mirror->recovery_disabled = mddev->recovery_disabled; + } else { + error_disk = i; } put_buf(r10_bio); if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); break; } - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { @@ -3819,11 +3827,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio); page = resync_fetch_page(rp, page_idx); - /* - * won't fail because the vec table is big enough - * to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + goto giveup; + } } nr_sectors += len>>9; sector_nr += len>>9; @@ -4107,7 +4115,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) atomic_set(&conf->nr_pending, 0); err = -ENOMEM; - conf->thread = md_register_thread(raid10d, mddev, "raid10"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid10d, mddev, "raid10")); if (!conf->thread) goto out; @@ -4152,8 +4161,8 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -4296,8 +4305,8 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto out_free_conf; } @@ -4698,8 +4707,8 @@ out: set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { ret = -EAGAIN; goto abort; @@ -4997,11 +5006,11 @@ read_more: if (len > PAGE_SIZE) len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { - /* - * won't fail because the vec table is big enough - * to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return sectors_done; + } } sector_nr += len >> 9; nr_sectors += len >> 9; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 8c072ce0bc54..63e48b11b552 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -100,7 +100,7 @@ struct r10conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* * Keep track of cluster resync window to send to other nodes. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 46182b955aef..47ba7d9e81e1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -120,7 +120,7 @@ struct r5l_log { struct bio_set bs; mempool_t meta_pool; - struct md_thread *reclaim_thread; + struct md_thread __rcu *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be * reclaimed. if it's 0, reclaim spaces * used by io_units which are in @@ -792,7 +792,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) io->current_bio = r5l_bio_alloc(log); io->current_bio->bi_end_io = r5l_log_endio; io->current_bio->bi_private = io; - bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + __bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); r5_reserve_log_entry(log, io); @@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) void r5l_quiesce(struct r5l_log *log, int quiesce) { - struct mddev *mddev; + struct mddev *mddev = log->rdev->mddev; + struct md_thread *thread = rcu_dereference_protected( + log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ - mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - kthread_park(log->reclaim_thread->tsk); + kthread_park(thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); } else - kthread_unpark(log->reclaim_thread->tsk); + kthread_unpark(thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct r5l_log *log; + struct md_thread *thread; int ret; pr_debug("md/raid:%s: using device %pg as journal\n", @@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->tree_lock); INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - if (!log->reclaim_thread) + thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, + "reclaim"); + if (!thread) goto reclaim_thread; - log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + rcu_assign_pointer(log->reclaim_thread, thread); init_waitqueue_head(&log->iounit_wait); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e495939bb3e0..eaea57aee602 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -465,7 +465,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio->bi_end_io = ppl_log_endio; bio->bi_iter.bi_sector = log->next_io_sector; - bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + __bio_add_page(bio, io->header_page, PAGE_SIZE, 0); pr_debug("%s: log->current_io_sector: %llu\n", __func__, (unsigned long long)log->next_io_sector); @@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) prev->bi_opf, GFP_NOIO, &ppl_conf->bs); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); + __bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); bio_chain(bio, prev); ppl_submit_iounit_bio(io, prev); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4739ed891e75..6615abf54d3f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5516,7 +5516,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, &dd_idx, NULL); - end_sector = bio_end_sector(raid_bio); + end_sector = sector + bio_sectors(raid_bio); rcu_read_lock(); if (r5c_big_stripe_cached(conf, sector)) @@ -5966,6 +5966,19 @@ out: return ret; } +static bool reshape_inprogress(struct mddev *mddev) +{ + return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery); +} + +static bool reshape_disabled(struct mddev *mddev) +{ + return is_md_suspended(mddev) || !md_is_rdwr(mddev); +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5997,7 +6010,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6076,6 +6090,15 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && + reshape_disabled(mddev)) { + bi->bi_status = BLK_STS_IOERR; + ret = STRIPE_FAIL; + pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", + mdname(mddev)); + } + return ret; } @@ -7708,7 +7731,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) } sprintf(pers_name, "raid%d", mddev->new_level); - conf->thread = md_register_thread(raid5d, mddev, pers_name); + rcu_assign_pointer(conf->thread, + md_register_thread(raid5d, mddev, pers_name)); if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); @@ -7931,8 +7955,8 @@ static int raid5_run(struct mddev *mddev) } conf->min_offset_diff = min_offset_diff; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; @@ -8029,8 +8053,8 @@ static int raid5_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto abort; } @@ -8377,6 +8401,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->disks + disk; tmp = rdev_mdlock_deref(mddev, p->rdev); if (test_bit(WantReplacement, &tmp->flags) && + mddev->reshape_position == MaxSector && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); @@ -8500,6 +8525,7 @@ static int raid5_start_reshape(struct mddev *mddev) struct r5conf *conf = mddev->private; struct md_rdev *rdev; int spares = 0; + int i; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -8511,6 +8537,13 @@ static int raid5_start_reshape(struct mddev *mddev) if (has_failed(conf)) return -EINVAL; + /* raid5 can't handle concurrent reshape and recovery */ + if (mddev->recovery_cp < MaxSector) + return -EBUSY; + for (i = 0; i < conf->raid_disks; i++) + if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + return -EBUSY; + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -8607,8 +8640,8 @@ static int raid5_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); @@ -9043,6 +9076,22 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || + percpu_ref_is_zero(&mddev->active_io)); + if (percpu_ref_is_zero(&mddev->active_io)) + return; + + /* + * Reshape is not in progress, and array is suspended, io that is + * waiting for reshpape can never be done. + */ + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -9063,6 +9112,7 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9087,6 +9137,7 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9112,6 +9163,7 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e873938a6125..f19707189a7b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -679,7 +679,7 @@ struct r5conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; |