diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/alloc.c | 18 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 255 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 84 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/request.h | 3 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 11 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 164 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.h | 19 | ||||
-rw-r--r-- | drivers/md/dm-bio-record.h | 15 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 84 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-verity-target.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 16 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 10 | ||||
-rw-r--r-- | drivers/md/dm.c | 32 | ||||
-rw-r--r-- | drivers/md/md.c | 11 |
19 files changed, 552 insertions, 191 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8bc1faf71ff2..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -67,7 +67,6 @@ #include <linux/blkdev.h> #include <linux/kthread.h> #include <linux/random.h> -#include <linux/sched/signal.h> #include <trace/events/bcache.h> #define MAX_OPEN_BUCKETS 128 @@ -734,21 +733,8 @@ int bch_open_buckets_alloc(struct cache_set *c) int bch_cache_allocator_start(struct cache *ca) { - struct task_struct *k; - - /* - * In case previous btree check operation occupies too many - * system memory for bcache btree node cache, and the - * registering process is selected by OOM killer. Here just - * ignore the SIGKILL sent by OOM killer if there is, to - * avoid kthread_run() being failed by pending signals. The - * bcache registering process will exit after the registration - * done. - */ - if (signal_pending(current)) - flush_signals(current); - - k = kthread_run(bch_allocator_thread, ca, "bcache_allocator"); + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b12186c87f52..72856e5f23a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -34,7 +34,6 @@ #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> -#include <linux/sched/signal.h> #include <linux/rculist.h> #include <linux/delay.h> #include <trace/events/bcache.h> @@ -102,64 +101,6 @@ #define insert_lock(s, b) ((b)->level <= (s)->lock) -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ - _w, b); \ - if (!IS_ERR(_child)) { \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -EINTR) \ - schedule(); \ - } while (_r == -EINTR); \ - \ - finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ - _r; \ -}) static inline struct bset *write_block(struct btree *b) { @@ -1849,7 +1790,7 @@ static void bch_btree_gc(struct cache_set *c) /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { - ret = btree_root(gc_root, c, &op, &writes, &stats); + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); cond_resched(); @@ -1914,18 +1855,6 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { - /* - * In case previous btree check operation occupies too many - * system memory for bcache btree node cache, and the - * registering process is selected by OOM killer. Here just - * ignore the SIGKILL sent by OOM killer if there is, to - * avoid kthread_run() being failed by pending signals. The - * bcache registering process will exit after the registration - * done. - */ - if (signal_pending(current)) - flush_signals(current); - c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); return PTR_ERR_OR_ZERO(c->gc_thread); } @@ -1959,7 +1888,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) } if (p) - ret = btree(check_recurse, p, b, op); + ret = bcache_btree(check_recurse, p, b, op); p = k; } while (p && !ret); @@ -1968,13 +1897,176 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) return ret; } + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i, n; + + k = p = NULL; + i = n = 0; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + int bch_btree_check(struct cache_set *c) { - struct btree_op op; + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state *check_state; + char name[32]; - bch_btree_op_init(&op, SHRT_MAX); + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); + if (!check_state) + return -ENOMEM; + + check_state->c = c; + check_state->total_threads = bch_btree_chkthread_nr(); + check_state->key_idx = 0; + spin_lock_init(&check_state->idx_lock); + atomic_set(&check_state->started, 0); + atomic_set(&check_state->enough, 0); + init_waitqueue_head(&check_state->wait); - return btree_root(check_recurse, c, &op); + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state->enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state->total_threads; i++) { + /* fetch latest check_state->enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&check_state->enough)) + break; + + check_state->infos[i].result = 0; + check_state->infos[i].state = check_state; + snprintf(name, sizeof(name), "bch_btrchk[%u]", i); + atomic_inc(&check_state->started); + + check_state->infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state->infos[i], + name); + if (IS_ERR(check_state->infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(check_state->infos[i].thread); + ret = -ENOMEM; + goto out; + } + } + + wait_event_interruptible(check_state->wait, + atomic_read(&check_state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + for (i = 0; i < check_state->total_threads; i++) { + if (check_state->infos[i].result) { + ret = check_state->infos[i].result; + goto out; + } + } + +out: + kfree(check_state); + return ret; } void bch_initial_gc_finish(struct cache_set *c) @@ -2414,7 +2506,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { - ret = btree(map_nodes_recurse, k, b, + ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); from = NULL; @@ -2432,10 +2524,10 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - return btree_root(map_nodes_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); } -static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { @@ -2448,7 +2540,8 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) - : btree(map_keys_recurse, k, b, op, from, fn, flags); + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) @@ -2465,7 +2558,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - return btree_root(map_keys_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f4dcca449391..257969980c49 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -145,6 +145,9 @@ struct btree { struct bio *bio; }; + + + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ @@ -216,6 +219,25 @@ struct btree_op { unsigned int insert_collision:1; }; +struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 64 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); @@ -284,6 +306,65 @@ static inline void force_wake_up_gc(struct cache_set *c) wake_up_gc(c); } +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define bcache_btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define bcache_btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + #define MAP_DONE 0 #define MAP_CONTINUE 1 @@ -314,6 +395,9 @@ typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, struct bkey *k); int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 820d8402a1dc..71a90fbec314 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1161,8 +1161,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_disk->private_data; @@ -1266,7 +1265,6 @@ void bch_cached_dev_request_init(struct cached_dev *dc) { struct gendisk *g = dc->disk.disk; - g->queue->make_request_fn = cached_dev_make_request; g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; @@ -1301,8 +1299,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct closure *cl; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c64dbd7a91aa..bb005c93dd72 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,7 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio); + void bch_flash_dev_request_init(struct bcache_device *d); +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0c3c5419c52b..d98354fa28e3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors) + sector_t sectors, make_request_fn make_request_fn) { struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, @@ -866,11 +866,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); if (!q) return -ENOMEM; - blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; @@ -1339,7 +1338,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + cached_dev_make_request); if (ret) return ret; @@ -1451,7 +1451,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors)) + if (bcache_device_init(d, block_bytes(c), u->sectors, + flash_dev_make_request)) goto err; bcache_device_attach(d, c, u - c->uuids); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3470fae4eabc..323276994aab 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -154,7 +154,7 @@ static ssize_t bch_snprint_string_list(char *buf, size_t i; for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, + out += scnprintf(out, buf + size - out, i == selected ? "[%s] " : "%s ", list[i]); out[-1] = '\n'; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4a40f9eadeaf..3f7641fb28d5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -183,7 +183,7 @@ static void update_writeback_rate(struct work_struct *work) */ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -193,7 +193,7 @@ static void update_writeback_rate(struct work_struct *work) test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); return; } @@ -229,7 +229,7 @@ static void update_writeback_rate(struct work_struct *work) */ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); } static unsigned int writeback_delay(struct cached_dev *dc, @@ -785,7 +785,9 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct bcache_device *d) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { struct sectors_dirty_init op; int ret; @@ -796,8 +798,13 @@ void bch_sectors_dirty_init(struct bcache_device *d) op.start = KEY(op.inode, 0, 0); do { - ret = bch_btree_map_keys(&op.op, d->c, &op.start, - sectors_dirty_init_fn, 0); + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &op.start, + sectors_dirty_init_fn, + 0); if (ret == -EAGAIN) schedule_timeout_interruptible( msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); @@ -806,6 +813,151 @@ void bch_sectors_dirty_init(struct bcache_device *d) break; } } while (ret == -EAGAIN); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i; + + k = p = NULL; + i = 0; + cur_idx = prev_idx = 0; + + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state *state; + char name[32]; + + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); + return; + } + + state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); + if (!state) { + pr_warn("sectors dirty init failed: cannot allocate memory"); + return; + } + + state->c = c; + state->d = d; + state->total_threads = bch_btre_dirty_init_thread_nr(); + state->key_idx = 0; + spin_lock_init(&state->idx_lock); + atomic_set(&state->started, 0); + atomic_set(&state->enough, 0); + init_waitqueue_head(&state->wait); + + for (i = 0; i < state->total_threads; i++) { + /* Fetch latest state->enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state->enough)) + break; + + state->infos[i].state = state; + atomic_inc(&state->started); + snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); + + state->infos[i].thread = + kthread_run(bch_dirty_init_thread, + &state->infos[i], + name); + if (IS_ERR(state->infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(state->infos[i].thread); + goto out; + } + } + + wait_event_interruptible(state->wait, + atomic_read(&state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + +out: + kfree(state); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 4e4c6810dc3c..b029843ce5b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,7 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_DIRTY_INIT_THRD_MAX 64 /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -23,6 +24,24 @@ */ #define WRITEBACK_SHARE_SHIFT 14 +struct bch_dirty_init_state; +struct dirty_init_thrd_info { + struct bch_dirty_init_state *state; + struct task_struct *thread; +}; + +struct bch_dirty_init_state { + struct cache_set *c; + struct bcache_device *d; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; +}; + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index c82578af56a5..2ea0360108e1 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -20,8 +20,13 @@ struct dm_bio_details { struct gendisk *bi_disk; u8 bi_partno; + int __bi_remaining; unsigned long bi_flags; struct bvec_iter bi_iter; + bio_end_io_t *bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; +#endif }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) @@ -30,6 +35,11 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) bd->bi_partno = bio->bi_partno; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; + bd->__bi_remaining = atomic_read(&bio->__bi_remaining); + bd->bi_end_io = bio->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bd->bi_integrity = bio_integrity(bio); +#endif } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) @@ -38,6 +48,11 @@ static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) bio->bi_partno = bd->bi_partno; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; + atomic_set(&bio->__bi_remaining, bd->__bi_remaining); + bio->bi_end_io = bd->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bio->bi_integrity = bd->bi_integrity; +#endif } #endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2d32821b3a5b..d3bb355819a4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2846,8 +2846,8 @@ static void cache_postsuspend(struct dm_target *ti) prevent_background_work(cache); BUG_ON(atomic_read(&cache->nr_io_migrations)); - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); + cancel_delayed_work_sync(&cache->waker); + drain_workqueue(cache->wq); WARN_ON(cache->tracker.in_flight); /* @@ -3492,7 +3492,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 1, 0}, + .version = {2, 2, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b225b3e445fa..2f03fecd312d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -6,6 +6,8 @@ * This file is released under the GPL. */ +#include "dm-bio-record.h" + #include <linux/compiler.h> #include <linux/module.h> #include <linux/device-mapper.h> @@ -201,17 +203,19 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; - int suspending; int failed; struct crypto_shash *internal_hash; + struct dm_target *ti; + /* these variables are locked with endio_wait.lock */ struct rb_root in_progress; struct list_head wait_list; wait_queue_head_t endio_wait; struct workqueue_struct *wait_wq; + struct workqueue_struct *offload_wq; unsigned char commit_seq; commit_id_t commit_ids[N_COMMIT_IDS]; @@ -293,11 +297,7 @@ struct dm_integrity_io { struct completion *completion; - struct gendisk *orig_bi_disk; - u8 orig_bi_partno; - bio_end_io_t *orig_bi_end_io; - struct bio_integrity_payload *orig_bi_integrity; - struct bvec_iter orig_bi_iter; + struct dm_bio_details bio_details; }; struct journal_completion { @@ -1439,7 +1439,7 @@ static void dec_in_flight(struct dm_integrity_io *dio) dio->range.logical_sector += dio->range.n_sectors; bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); return; } do_endio_flush(ic, dio); @@ -1450,14 +1450,9 @@ static void integrity_end_io(struct bio *bio) { struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - bio->bi_iter = dio->orig_bi_iter; - bio->bi_disk = dio->orig_bi_disk; - bio->bi_partno = dio->orig_bi_partno; - if (dio->orig_bi_integrity) { - bio->bi_integrity = dio->orig_bi_integrity; + dm_bio_restore(&dio->bio_details, bio); + if (bio->bi_integrity) bio->bi_opf |= REQ_INTEGRITY; - } - bio->bi_end_io = dio->orig_bi_end_io; if (dio->completion) complete(dio->completion); @@ -1542,7 +1537,7 @@ static void integrity_metadata(struct work_struct *w) } } - __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos; char *mem, *checksums_ptr; @@ -1586,7 +1581,7 @@ again: if (likely(checksums != checksums_onstack)) kfree(checksums); } else { - struct bio_integrity_payload *bip = dio->orig_bi_integrity; + struct bio_integrity_payload *bip = dio->bio_details.bi_integrity; if (bip) { struct bio_vec biv; @@ -1865,7 +1860,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map if (need_sync_io && from_map) { INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->metadata_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); return; } @@ -2005,20 +2000,13 @@ offload_to_thread: } else dio->completion = NULL; - dio->orig_bi_iter = bio->bi_iter; - - dio->orig_bi_disk = bio->bi_disk; - dio->orig_bi_partno = bio->bi_partno; + dm_bio_record(&dio->bio_details, bio); bio_set_dev(bio, ic->dev->bdev); - - dio->orig_bi_integrity = bio_integrity(bio); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; - - dio->orig_bi_end_io = bio->bi_end_io; bio->bi_end_io = integrity_end_io; - bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + generic_make_request(bio); if (need_sync_io) { @@ -2315,7 +2303,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (READ_ONCE(ic->suspending) && !ic->meta_dev) + if (unlikely(dm_suspended(ic->ti)) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2376,7 +2364,7 @@ static void integrity_recalc(struct work_struct *w) next_chunk: - if (unlikely(READ_ONCE(ic->suspending))) + if (unlikely(dm_suspended(ic->ti))) goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); @@ -2501,7 +2489,7 @@ static void bitmap_block_work(struct work_struct *w) dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); } else { block_bitmap_op(ic, ic->journal, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); @@ -2524,7 +2512,7 @@ static void bitmap_block_work(struct work_struct *w) remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); } queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); @@ -2804,8 +2792,6 @@ static void dm_integrity_postsuspend(struct dm_target *ti) del_timer_sync(&ic->autocommit_timer); - WRITE_ONCE(ic->suspending, 1); - if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); @@ -2834,8 +2820,6 @@ static void dm_integrity_postsuspend(struct dm_target *ti) #endif } - WRITE_ONCE(ic->suspending, 0); - BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); ic->journal_uptodate = true; @@ -2888,17 +2872,24 @@ static void dm_integrity_resume(struct dm_target *ti) } else { replay_journal(ic); if (ic->mode == 'B') { - int mode; ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); - mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR; - block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); - block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); - block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) { + block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + } rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); } @@ -2967,7 +2958,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->recalculate_flag) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); @@ -3623,6 +3614,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = ic; ti->per_io_data_size = sizeof(struct dm_integrity_io); + ic->ti = ti; ic->in_progress = RB_ROOT; INIT_LIST_HEAD(&ic->wait_list); @@ -3836,6 +3828,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM, + METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->offload_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); if (!ic->commit_wq) { ti->error = "Cannot allocate workqueue"; @@ -4140,6 +4140,8 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->metadata_wq); if (ic->wait_wq) destroy_workqueue(ic->wait_wq); + if (ic->offload_wq) + destroy_workqueue(ic->offload_wq); if (ic->commit_wq) destroy_workqueue(ic->commit_wq); if (ic->writer_wq) @@ -4200,7 +4202,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2bc18c9c3abc..58fd137b6ae1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2053,7 +2053,7 @@ static int multipath_busy(struct dm_target *ti) *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index fc9947d6210c..76b6b323bf4b 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -960,9 +960,9 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) DMWARN("%s: __commit_transaction() failed, error = %d", __func__, r); } + pmd_write_unlock(pmd); if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); - pmd_write_unlock(pmd); kfree(pmd); return 0; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 0d61e9c67986..eec9f252e935 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1221,7 +1221,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index b9e27e37a943..a09bdc000e64 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -625,6 +625,12 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry wc->freelist_size++; } +static inline void writecache_verify_watermark(struct dm_writecache *wc) +{ + if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) + queue_work(wc->writeback_wq, &wc->writeback_work); +} + static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) { struct wc_entry *e; @@ -650,8 +656,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, s list_del(&e->lru); } wc->freelist_size--; - if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) - queue_work(wc->writeback_wq, &wc->writeback_work); + + writecache_verify_watermark(wc); return e; } @@ -842,7 +848,7 @@ static void writecache_suspend(struct dm_target *ti) } wc_unlock(wc); - flush_workqueue(wc->writeback_wq); + drain_workqueue(wc->writeback_wq); wc_lock(wc); if (flush_on_suspend) @@ -965,6 +971,8 @@ erase_this: writecache_commit_flushed(wc, false); } + writecache_verify_watermark(wc); + wc_unlock(wc); } @@ -2312,7 +2320,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 70a1063161c0..f4f83d39b3dc 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -533,8 +533,9 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) /* Get the BIO chunk work. If one is not active yet, create one */ cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk); - if (!cw) { - + if (cw) { + dmz_get_chunk_work(cw); + } else { /* Create a new chunk work */ cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); if (unlikely(!cw)) { @@ -543,7 +544,7 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) } INIT_WORK(&cw->work, dmz_chunk_work); - refcount_set(&cw->refcount, 0); + refcount_set(&cw->refcount, 1); cw->target = dmz; cw->chunk = chunk; bio_list_init(&cw->bio_list); @@ -556,7 +557,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) } bio_list_add(&cw->bio_list, bio); - dmz_get_chunk_work(cw); dmz_reclaim_bio_acc(dmz->reclaim); if (queue_work(dmz->chunk_wq, &cw->work)) @@ -967,7 +967,7 @@ static int dmz_iterate_devices(struct dm_target *ti, static struct target_type dmz_type = { .name = "zoned", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = dmz_ctr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b89f07ee2eff..753302e83910 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -25,6 +25,7 @@ #include <linux/wait.h> #include <linux/pr.h> #include <linux/refcount.h> +#include <linux/part_stat.h> #define DM_MSG_PREFIX "core" @@ -1788,7 +1789,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * With request-based DM we only need to check the * top-level queue for congestion. */ - r = md->queue->backing_dev_info->wb.state & bdi_bits; + struct backing_dev_info *bdi = md->queue->backing_dev_info; + r = bdi->wb.congested->state & bdi_bits; } else { map = dm_get_live_table_fast(md); if (map) @@ -1854,15 +1856,6 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); -static void dm_init_normal_md_queue(struct mapped_device *md) -{ - /* - * Initialize aspects of queue that aren't relevant for blk-mq - */ - md->queue->backing_dev_info->congested_data = md; - md->queue->backing_dev_info->congested_fn = dm_any_congested; -} - static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) @@ -1946,16 +1939,15 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); - if (!md->queue) - goto bad; - md->queue->queuedata = md; /* * default to bio-based required ->make_request_fn until DM * table is loaded and md->type established. If request-based * table is loaded: blk-mq will override accordingly. */ - blk_queue_make_request(md->queue, dm_make_request); + md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + if (!md->queue) + goto bad; + md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) @@ -2249,6 +2241,12 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); +static void dm_init_congested_fn(struct mapped_device *md) +{ + md->queue->backing_dev_info->congested_data = md; + md->queue->backing_dev_info->congested_fn = dm_any_congested; +} + /* * Setup the DM device's queue based on md's type */ @@ -2265,11 +2263,12 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } + dm_init_congested_fn(md); break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_NVME_BIO_BASED: - dm_init_normal_md_queue(md); + dm_init_congested_fn(md); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); @@ -2368,6 +2367,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); + set_bit(DMF_SUSPENDED, &md->flags); dm_table_postsuspend_targets(map); } /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..271e8a587354 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -58,8 +58,10 @@ #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> +#include <linux/raid/detect.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> +#include <linux/part_stat.h> #include <trace/events/block.h> #include "md.h" @@ -2491,12 +2493,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - pr_warn("md: could not open %s.\n", __bdevname(dev, b)); + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); return PTR_ERR(bdev); } rdev->bdev = bdev; @@ -5621,12 +5623,11 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); + mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); @@ -6184,7 +6185,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce) { + if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } |