From 97d26ae764a43bfaf870312761a0a0f9b49b6351 Mon Sep 17 00:00:00 2001 From: Li Lei Date: Tue, 20 Sep 2022 00:16:43 +0800 Subject: bcache: remove unnecessary flush_workqueue All pending work items will be drained by destroy_workqueue(); there is no need to call flush_workqueue() explicitly. Signed-off-by: Li Lei Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220919161647.81238-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3f0ff3aab6f2..647661005176 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -801,10 +801,9 @@ static int bch_writeback_thread(void *arg) } } - if (dc->writeback_write_wq) { - flush_workqueue(dc->writeback_write_wq); + if (dc->writeback_write_wq) destroy_workqueue(dc->writeback_write_wq); - } + cached_dev_put(dc); wait_for_kthread_stop(); -- cgit v1.2.3 From d86b4e6dc88826f2b5cfa90c4ebbccb19a88bc39 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 20 Sep 2022 00:16:44 +0800 Subject: bcache: remove unused bch_mark_cache_readahead function def in stats.h This is a cleanup for commit 1616a4c2ab1a ("bcache: remove bcache device self-defined readahead"); there has been no user of bch_mark_cache_readahead() since that commit. Signed-off-by: Lin Feng Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220919161647.81238-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/stats.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index ca4f435f7216..bd3afc856d53 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -54,7 +54,6 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc); void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, bool hit, bool bypass); -void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d); void bch_mark_sectors_bypassed(struct cache_set *c, -- cgit v1.2.3 From 11e529ccea33f24af6b54fe10bb3be9c1c48eddb Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Tue, 20 Sep 2022 00:16:45 +0800 Subject: bcache: bset: Fix comment typos Remove the redundant word `by`, correct the typo `creaated`. CC: Kent Overstreet CC: linux-bcache@vger.kernel.org Signed-off-by: Jules Maselbas Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220919161647.81238-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 94d38e8a59b3..2bba4d6aaaa2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1264,7 +1264,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * * Don't worry event 'out' is allocated from mempool, it can * still be swapped here. Because state->pool is a page mempool - * creaated by by mempool_init_page_pool(), which allocates + * created by mempool_init_page_pool(), which allocates * pages by alloc_pages() indeed. */ -- cgit v1.2.3 From 6dd3be6923eec2c49860e7292e4e2783c74a9dff Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Tue, 20 Sep 2022 00:16:46 +0800 Subject: bcache: fix repeated words in comments Delete the redundant word 'we'.
Signed-off-by: Jilin Yuan Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20220919161647.81238-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2acda9cea0f9..aebb7ef10e63 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -107,7 +107,7 @@ * * BTREE NODES: * - * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node -- cgit v1.2.3 From d2d05b88035d2d51a5bb6c5afec88a0880c73df4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 20 Sep 2022 00:16:47 +0800 Subject: bcache: fix set_at_max_writeback_rate() for multiple attached devices Inside set_at_max_writeback_rate() the calculation in the following if() check is wrong, if (atomic_inc_return(&c->idle_counter) < atomic_read(&c->attached_dev_nr) * 6) Because each attached backing device has its own writeback thread running and increasing c->idle_counter, the counter increases much faster than expected. The correct calculation should be, (counter / dev_nr) < dev_nr * 6 which is equivalent to, counter < dev_nr * dev_nr * 6 This patch fixes the above mistake with the correct calculation, and adds the helper routine idle_counter_exceeded() to make the code clearer. Reported-by: Mingzhe Zou Signed-off-by: Coly Li Acked-by: Mingzhe Zou Link: https://lore.kernel.org/r/20220919161647.81238-6-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 73 ++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 647661005176..0285b676e983 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -157,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_target = target; } +static bool idle_counter_exceeded(struct cache_set *c) +{ + int counter, dev_nr; + + /* + * If c->idle_counter is overflow (idel for really long time), + * reset as 0 and not set maximum rate this time for code + * simplicity. + */ + counter = atomic_inc_return(&c->idle_counter); + if (counter <= 0) { + atomic_set(&c->idle_counter, 0); + return false; + } + + dev_nr = atomic_read(&c->attached_dev_nr); + if (dev_nr == 0) + return false; + + /* + * c->idle_counter is increased by writeback thread of all + * attached backing devices, in order to represent a rough + * time period, counter should be divided by dev_nr. + * Otherwise the idle time cannot be larger with more backing + * device attached. + * The following calculation equals to checking + * (counter / dev_nr) < (dev_nr * 6) + */ + if (counter < (dev_nr * dev_nr * 6)) + return false; + + return true; +} + +/* + * Idle_counter is increased every time when update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then max wrteback rate set + * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached + * devices number, c->attached_dev_nr is used to calculate the idle + * throushold. It might be bigger if not all cached device are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). + */ static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { @@ -167,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c, /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; - /* - * Idle_counter is increased everytime when update_writeback_rate() is - * called. If all backing devices attached to the same cache set have - * identical dc->writeback_rate_update_seconds values, it is about 6 - * rounds of update_writeback_rate() on each backing device before - * c->at_max_writeback_rate is set to 1, and then max wrteback rate set - * to each dc->writeback_rate.rate. - * In order to avoid extra locking cost for counting exact dirty cached - * devices number, c->attached_dev_nr is used to calculate the idle - * throushold. It might be bigger if not all cached device are in write- - * back mode, but it still works well with limited extra rounds of - * update_writeback_rate(). - */ - if (atomic_inc_return(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) + + if (!idle_counter_exceeded(c)) return false; if (atomic_read(&c->at_max_writeback_rate) != 1) @@ -195,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c, dc->writeback_rate_change = 0; /* - * Check c->idle_counter and c->at_max_writeback_rate agagain in case - * new I/O arrives during before set_at_max_writeback_rate() returns. - * Then the writeback rate is set to 1, and its new value should be - * decided via __update_writeback_rate(). + * In case new I/O arrives during before + * set_at_max_writeback_rate() returns. */ - if ((atomic_read(&c->idle_counter) < - atomic_read(&c->attached_dev_nr) * 6) || + if (!idle_counter_exceeded(c) || !atomic_read(&c->at_max_writeback_rate)) return false; -- cgit v1.2.3 From 12ba6676b9254bc5a555f1c52f9c0516e86392b7 Mon Sep 17 00:00:00 2001 From: XU pengfei Date: Wed, 17 Aug 2022 17:21:40 +0800 Subject: md/raid5: Fix spelling mistakes in comments Fix spelling of 'waitting' in comments. Signed-off-by: XU pengfei Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index f4e1cc1ece43..058d82e7fa13 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1327,9 +1327,9 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * superblock is updated to new log tail. Updating superblock (either * directly call md_update_sb() or depend on md thread) must hold * reconfig mutex. On the other hand, raid5_quiesce is called with - * reconfig_mutex hold. The first step of raid5_quiesce() is waitting - * for all IO finish, hence waitting for reclaim thread, while reclaim - * thread is calling this function and waitting for reconfig mutex. So + * reconfig_mutex hold. The first step of raid5_quiesce() is waiting + * for all IO finish, hence waiting for reclaim thread, while reclaim + * thread is calling this function and waiting for reconfig mutex. So + * there is a deadlock. We workaround this issue with a trylock.
* FIXME: we could miss discard if we can't take reconfig mutex */ -- cgit v1.2.3 From 62bca04bb7dd7eaa5c2daf36b1ca9ab8a1fb71a2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 22 Aug 2022 15:45:39 +0800 Subject: md/raid10: fix compile warning With W=1, compiler complains. drivers/md/raid10.c:1983: warning: bad line: Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9117fcdee1be..077c7cdefcd4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1980,7 +1980,7 @@ static int enough(struct r10conf *conf, int ignore) * Otherwise, it must be degraded: * - recovery is interrupted. * - &mddev->degraded is bumped. - + * * @rdev is marked as &Faulty excluding case when array is failed and * &mddev->fail_last_dev is off. */ -- cgit v1.2.3 From 1727fd5015d8f93474148f94e34cda5aa6ad4a43 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Tue, 23 Aug 2022 11:51:04 -0700 Subject: md: Replace snprintf with scnprintf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current code produces a warning as shown below when total characters in the constituent block device names plus the slashes exceeds 200. snprintf() returns the number of characters generated from the given input, which could cause the expression “200 – len” to wrap around to a large positive number. Fix this by using scnprintf() instead, which returns the actual number of characters written into the buffer. [ 1513.267938] ------------[ cut here ]------------ [ 1513.267943] WARNING: CPU: 15 PID: 37247 at /lib/vsprintf.c:2509 vsnprintf+0x2c8/0x510 [ 1513.267944] Modules linked in: [ 1513.267969] CPU: 15 PID: 37247 Comm: mdadm Not tainted 5.4.0-1085-azure #90~18.04.1-Ubuntu [ 1513.267969] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 05/09/2022 [ 1513.267971] RIP: 0010:vsnprintf+0x2c8/0x510 <-snip-> [ 1513.267982] Call Trace: [ 1513.267986] snprintf+0x45/0x70 [ 1513.267990] ? disk_name+0x71/0xa0 [ 1513.267993] dump_zones+0x114/0x240 [raid0] [ 1513.267996] ? _cond_resched+0x19/0x40 [ 1513.267998] raid0_run+0x19e/0x270 [raid0] [ 1513.268000] md_run+0x5e0/0xc50 [ 1513.268003] ? security_capable+0x3f/0x60 [ 1513.268005] do_md_run+0x19/0x110 [ 1513.268006] md_ioctl+0x195e/0x1f90 [ 1513.268007] blkdev_ioctl+0x91f/0x9f0 [ 1513.268010] block_ioctl+0x3d/0x50 [ 1513.268012] do_vfs_ioctl+0xa9/0x640 [ 1513.268014] ? 
__fput+0x162/0x260 [ 1513.268016] ksys_ioctl+0x75/0x80 [ 1513.268017] __x64_sys_ioctl+0x1a/0x20 [ 1513.268019] do_syscall_64+0x5e/0x200 [ 1513.268021] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 766038846e875 ("md/raid0: replace printk() with pr_*()") Reviewed-by: Michael Kelley Acked-by: Guoqing Jiang Signed-off-by: Saurabh Sengar Signed-off-by: Song Liu --- drivers/md/raid0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 78addfe4a0c9..857c49399c28 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -47,7 +47,7 @@ static void dump_zones(struct mddev *mddev) int len = 0; for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - len += snprintf(line+len, 200-len, "%s%pg", k?"/":"", + len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"", conf->devlist[j * raid_disks + k]->bdev); pr_debug("md: zone%d=[%s]\n", j, line); -- cgit v1.2.3 From b6d56144fe902c2b7b9a3573aaf6aa7dc5366211 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 11 Aug 2022 11:14:14 -0600 Subject: md/raid5: Refactor raid5_get_active_stripe() Refactor raid5_get_active_stripe() to remove the gotos, using an explicit infinite loop and some additional nesting instead. Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 82 +++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31a0cbf63384..e7a7ca37ed1a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -811,54 +811,54 @@ static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, spin_lock_irq(conf->hash_locks + hash); -retry: - if (!noquiesce && conf->quiesce) { - /* - * Must release the reference to batch_last before waiting, - * on quiesce, otherwise the batch_last will hold a reference - * to a stripe and raid5_quiesce() will deadlock waiting for - * active_stripes to go to zero. - */ - if (ctx && ctx->batch_last) { - raid5_release_stripe(ctx->batch_last); - ctx->batch_last = NULL; - } - - wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, - *(conf->hash_locks + hash)); - } + for (;;) { + if (!noquiesce && conf->quiesce) { + /* + * Must release the reference to batch_last before + * waiting, on quiesce, otherwise the batch_last will + * hold a reference to a stripe and raid5_quiesce() + * will deadlock waiting for active_stripes to go to + * zero.
+ */ + if (ctx && ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; + } - sh = find_get_stripe(conf, sector, conf->generation - previous, hash); - if (sh) - goto out; + wait_event_lock_irq(conf->wait_for_quiescent, + !conf->quiesce, + *(conf->hash_locks + hash)); + } - if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) - goto wait_for_stripe; + sh = find_get_stripe(conf, sector, conf->generation - previous, + hash); + if (sh) + break; - sh = get_free_stripe(conf, hash); - if (sh) { - r5c_check_stripe_cache_usage(conf); - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - goto out; - } + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { + sh = get_free_stripe(conf, hash); + if (sh) { + r5c_check_stripe_cache_usage(conf); + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + break; + } - if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) - set_bit(R5_ALLOC_MORE, &conf->cache_state); + if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + } -wait_for_stripe: - if (noblock) - goto out; + if (noblock) + break; - set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq(conf->wait_for_stripe, - is_inactive_blocked(conf, hash), - *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - goto retry; + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, + is_inactive_blocked(conf, hash), + *(conf->hash_locks + hash)); + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + } -out: spin_unlock_irq(conf->hash_locks + hash); return sh; } -- cgit v1.2.3 From 9892fa993f8a8e716f39266b24d6218d8333ff89 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 11 Aug 2022 11:14:15 -0600 Subject: md/raid5: Drop extern on function declarations in raid5.h externs should not be used in function declarations, so clean those up. 
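(Aside, not part of the patch: in C, a function declaration at file or header scope has external linkage by default, so the extern keyword adds nothing there. Taking one declaration from the hunk below as an example, the two forms are equivalent:)

	extern int raid5_calc_degraded(struct r5conf *conf);	/* old style */
	int raid5_calc_degraded(struct r5conf *conf);		/* identical meaning */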
Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a5082bed83c8..4be2feb9e74a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -803,16 +803,14 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -extern void md_raid5_kick_device(struct r5conf *conf); -extern int raid5_set_cache_size(struct mddev *mddev, int size); -extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); -extern void raid5_release_stripe(struct stripe_head *sh); -extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh); -extern struct stripe_head * -raid5_get_active_stripe(struct r5conf *conf, sector_t sector, - bool previous, bool noblock, bool noquiesce); -extern int raid5_calc_degraded(struct r5conf *conf); -extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); +void md_raid5_kick_device(struct r5conf *conf); +int raid5_set_cache_size(struct mddev *mddev, int size); +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +void raid5_release_stripe(struct stripe_head *sh); +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, struct stripe_head *sh); +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, + sector_t sector, bool previous, bool noblock, bool noquiesce); +int raid5_calc_degraded(struct r5conf *conf); +int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif -- cgit v1.2.3 From 2f2d51efd83225c1eb0d7771ddfe9fddd5ccd378 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 11 Aug 2022 11:14:16 -0600 Subject: md/raid5: Cleanup prototype of raid5_get_active_stripe() Drop the three bools in the prototype of raid5_get_active_stripe() and replace them with a flags parameter. At the same time, drop the distinction with __raid5_get_active_stripe(). Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 3 ++- drivers/md/raid5.c | 49 +++++++++++++++++++++++++----------------------- drivers/md/raid5.h | 12 +++++++++++- 3 files changed, 39 insertions(+), 25 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 058d82e7fa13..8e3ca60b5a58 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1923,7 +1923,8 @@ r5c_recovery_alloc_stripe( { struct stripe_head *sh; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); + sh = raid5_get_active_stripe(conf, NULL, stripe_sect, + noblock ? 
R5_GAS_NOBLOCK : 0); if (!sh) return NULL; /* no more stripe available */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e7a7ca37ed1a..84f941843364 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -800,19 +800,20 @@ static bool is_inactive_blocked(struct r5conf *conf, int hash) return active < (conf->max_nr_stripes * 3 / 4); } -static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, +struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t sector, - bool previous, bool noblock, bool noquiesce) + unsigned int flags) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); + int previous = !!(flags & R5_GAS_PREVIOUS); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(conf->hash_locks + hash); for (;;) { - if (!noquiesce && conf->quiesce) { + if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { /* * Must release the reference to batch_last before * waiting, on quiesce, otherwise the batch_last will @@ -848,7 +849,7 @@ static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, set_bit(R5_ALLOC_MORE, &conf->cache_state); } - if (noblock) + if (flags & R5_GAS_NOBLOCK) break; set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); @@ -863,13 +864,6 @@ static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, return sh; } -struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, - sector_t sector, bool previous, bool noblock, bool noquiesce) -{ - return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock, - noquiesce); -} - static bool is_full_stripe_write(struct stripe_head *sh) { BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); @@ -4636,7 +4630,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); + sh2 = raid5_get_active_stripe(conf, NULL, s, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -5273,7 +5268,9 @@ static void handle_stripe(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src - = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); + = raid5_get_active_stripe(conf, NULL, sh->sector, + R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | + R5_GAS_NOQUIESCE); if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { /* sh cannot be written until sh_src has been read. 
* so arrange for sh to be delayed a little @@ -5823,7 +5820,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) DEFINE_WAIT(w); int d; again: - sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); + sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); @@ -5978,7 +5975,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, enum stripe_result ret; struct stripe_head *sh; sector_t new_sector; - int previous = 0; + int previous = 0, flags = 0; int seq, dd_idx; seq = read_seqcount_begin(&conf->gen_lock); @@ -6012,8 +6009,11 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, new_sector, logical_sector); - sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); + if (previous) + flags |= R5_GAS_PREVIOUS; + if (bi->bi_opf & REQ_RAHEAD) + flags |= R5_GAS_NOBLOCK; + sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); if (unlikely(!sh)) { /* cannot get stripe, just give-up */ bi->bi_status = BLK_STS_IOERR; @@ -6362,7 +6362,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; - sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, + R5_GAS_NOQUIESCE); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -6411,7 +6412,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); + sh = raid5_get_active_stripe(conf, NULL, first_sector, + R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); @@ -6531,9 +6533,10 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); - sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); + sh = raid5_get_active_stripe(conf, NULL, sector_nr, + R5_GAS_NOBLOCK); if (sh == NULL) { - sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); + sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -6596,8 +6599,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, /* already done this stripe */ continue; - sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); - + sh = raid5_get_active_stripe(conf, NULL, sector, + R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); if (!sh) { /* failed to get a stripe - must wait */ conf->retry_read_aligned = raid_bio; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4be2feb9e74a..e873938a6125 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -809,8 +809,18 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); void raid5_release_stripe(struct stripe_head *sh); sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, int previous, int *dd_idx, struct stripe_head *sh); + +struct stripe_request_ctx; +/* get stripe from previous generation (when reshaping) */ 
+#define R5_GAS_PREVIOUS (1 << 0) +/* do not block waiting for a free stripe */ +#define R5_GAS_NOBLOCK (1 << 1) +/* do not block waiting for quiesce to be released */ +#define R5_GAS_NOQUIESCE (1 << 2) struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, - sector_t sector, bool previous, bool noblock, bool noquiesce); + struct stripe_request_ctx *ctx, sector_t sector, + unsigned int flags); + int raid5_calc_degraded(struct r5conf *conf); int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif -- cgit v1.2.3 From f9287c3e93f00d3236c4c81bf76dae43afd903b9 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 11 Aug 2022 11:14:17 -0600 Subject: md/raid5: Don't read ->active_stripes if it's not needed The atomic_read() is not needed in many cases so only do the read after the first checks are done. Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 84f941843364..e0514bda7695 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -789,15 +789,14 @@ struct stripe_request_ctx { */ static bool is_inactive_blocked(struct r5conf *conf, int hash) { - int active = atomic_read(&conf->active_stripes); - if (list_empty(conf->inactive_list + hash)) return false; if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return true; - return active < (conf->max_nr_stripes * 3 / 4); + return (atomic_read(&conf->active_stripes) < + (conf->max_nr_stripes * 3 / 4)); } struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, -- cgit v1.2.3 From e2eed85bc75138a9eeb63863d20f8904ac42a577 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 25 Aug 2022 09:46:27 -0600 Subject: md/raid5: Ensure stripe_fill happens on non-read IO with journal When doing degrade/recover tests using the journal a kernel BUG is hit at drivers/md/raid5.c:4381 in handle_parity_checks5(): BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); This was found to occur because handle_stripe_fill() was skipped for stripes in the journal due to a condition in that function. Thus blocks were not fetched and R5_UPTODATE was not set when the code reached handle_parity_checks5(). To fix this, don't skip handle_stripe_fill() unless the stripe is for read. 
Fixes: 07e83364845e ("md/r5cache: shift complex rmw from read path to write path") Link: https://lore.kernel.org/linux-raid/e05c4239-41a9-d2f7-3cfa-4aa9d2cea8c1@deltatee.com/ Suggested-by: Song Liu Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e0514bda7695..4e6d865a6456 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4040,7 +4040,7 @@ static void handle_stripe_fill(struct stripe_head *sh, * back cache (prexor with orig_page, and then xor with * page) in the read path */ - if (s->injournal && s->failed) { + if (s->to_read && s->injournal && s->failed) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); goto out; -- cgit v1.2.3 From c66a6f41e09ad386fd2cce22b9cded837bbbc704 Mon Sep 17 00:00:00 2001 From: David Sloan Date: Thu, 8 Sep 2022 10:15:14 -0600 Subject: md/raid5: Remove unnecessary bio_put() in raid5_read_one_chunk() When running chunk-sized reads on disks with badblocks duplicate bio free/puts are observed: ============================================================================= BUG bio-200 (Not tainted): Object already free ----------------------------------------------------------------------------- Allocated in mempool_alloc_slab+0x17/0x20 age=3 cpu=2 pid=7504 __slab_alloc.constprop.0+0x5a/0xb0 kmem_cache_alloc+0x31e/0x330 mempool_alloc_slab+0x17/0x20 mempool_alloc+0x100/0x2b0 bio_alloc_bioset+0x181/0x460 do_mpage_readpage+0x776/0xd00 mpage_readahead+0x166/0x320 blkdev_readahead+0x15/0x20 read_pages+0x13f/0x5f0 page_cache_ra_unbounded+0x18d/0x220 force_page_cache_ra+0x181/0x1c0 page_cache_sync_ra+0x65/0xb0 filemap_get_pages+0x1df/0xaf0 filemap_read+0x1e1/0x700 blkdev_read_iter+0x1e5/0x330 vfs_read+0x42a/0x570 Freed in mempool_free_slab+0x17/0x20 age=3 cpu=2 pid=7504 kmem_cache_free+0x46d/0x490 mempool_free_slab+0x17/0x20 mempool_free+0x66/0x190 bio_free+0x78/0x90 bio_put+0x100/0x1a0 raid5_make_request+0x2259/0x2450 md_handle_request+0x402/0x600 md_submit_bio+0xd9/0x120 __submit_bio+0x11f/0x1b0 submit_bio_noacct_nocheck+0x204/0x480 submit_bio_noacct+0x32e/0xc70 submit_bio+0x98/0x1a0 mpage_readahead+0x250/0x320 blkdev_readahead+0x15/0x20 read_pages+0x13f/0x5f0 page_cache_ra_unbounded+0x18d/0x220 Slab 0xffffea000481b600 objects=21 used=0 fp=0xffff8881206d8940 flags=0x17ffffc0010201(locked|slab|head|node=0|zone=2|lastcpupid=0x1fffff) CPU: 0 PID: 34525 Comm: kworker/u24:2 Not tainted 6.0.0-rc2-localyes-265166-gf11c5343fa3f #143 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: raid5wq raid5_do_work Call Trace: dump_stack_lvl+0x5a/0x78 dump_stack+0x10/0x16 print_trailer+0x158/0x165 object_err+0x35/0x50 free_debug_processing.cold+0xb7/0xbe __slab_free+0x1ae/0x330 kmem_cache_free+0x46d/0x490 mempool_free_slab+0x17/0x20 mempool_free+0x66/0x190 bio_free+0x78/0x90 bio_put+0x100/0x1a0 mpage_end_io+0x36/0x150 bio_endio+0x2fd/0x360 md_end_io_acct+0x7e/0x90 bio_endio+0x2fd/0x360 handle_failed_stripe+0x960/0xb80 handle_stripe+0x1348/0x3760 handle_active_stripes.constprop.0+0x72a/0xaf0 raid5_do_work+0x177/0x330 process_one_work+0x616/0xb20 worker_thread+0x2bd/0x6f0 kthread+0x179/0x1b0 ret_from_fork+0x22/0x30 The double free is caused by an unnecessary bio_put() in the if(is_badblock(...)) error path in raid5_read_one_chunk(). 
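(A minimal sketch of the buggy pattern, simplified from the hunk below rather than quoted verbatim: the common I/O path still owns a reference to raid_bio, so the extra put sets up a second free when that path completes the bio.)

	if (is_badblock(rdev, sector, bio_sectors(raid_bio),
			&first_bad, &bad_sectors)) {
		bio_put(raid_bio);	/* BUG: drops a reference the common
					 * I/O path still owns */
		rdev_dec_pending(rdev, mddev);
		return 0;		/* the owner's endio puts raid_bio
					 * again -> double free */
	}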
The error path was moved ahead of bio_alloc_clone() in c82aa1b76787c ("md/raid5: move checking badblock before clone bio in raid5_read_one_chunk"). The previous code checked and freed align_bio, which required a bio_put(). After the move, that put is no longer needed, as raid_bio is returned to the control of the common I/O path, which performs its own endio; the leftover bio_put() thus results in a double free on bad device blocks. Fixes: c82aa1b76787c ("md/raid5: move checking badblock before clone bio in raid5_read_one_chunk") Signed-off-by: David Sloan Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4e6d865a6456..734f92e75f85 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5538,7 +5538,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, &bad_sectors)) { - bio_put(raid_bio); rdev_dec_pending(rdev, mddev); return 0; } -- cgit v1.2.3 From 3bfc3bcd787c48aa31e4fde4a6dfcef4cd7ee2c2 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 8 Sep 2022 10:15:15 -0600 Subject: md: Remove extra mddev_get() in md_seq_start() A regression is seen where mddev devices stay around permanently after they are stopped, due to an elevated reference count. This was tracked down to an extra mddev_get() in md_seq_start(). It only happened rarely, because most of the time md_seq_start() is called with a zero offset; the path with the extra mddev_get() is only taken when it starts with a non-zero offset. The commit noted below changed an mddev_get() to check its success but inadvertently left the original call in. Remove the extra call. Fixes: 12a6caf27324 ("md: only delete entries from all_mddevs when the disk is freed") Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index afaf36b2f6ab..9dc0175280b4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8154,7 +8154,6 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) list_for_each(tmp,&all_mddevs) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); - mddev_get(mddev); if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); -- cgit v1.2.3 From ed2e063f92c44c891ccd883e289dde6ca870edcc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 19:34:24 +0800 Subject: md/raid10: factor out code from wait_barrier() to stop_waiting_barrier() Currently, the nasty condition in wait_barrier() is hard to read. This patch factors out the condition into a function. There are no functional changes.
Signed-off-by: Yu Kuai Acked-by: Paul Menzel Reviewed-by: Logan Gunthorpe Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 077c7cdefcd4..310a6132304f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -957,41 +957,47 @@ static void lower_barrier(struct r10conf *conf) wake_up(&conf->wait_barrier); } +static bool stop_waiting_barrier(struct r10conf *conf) +{ + struct bio_list *bio_list = current->bio_list; + + /* barrier is dropped */ + if (!conf->barrier) + return true; + + /* + * If there are already pending requests (preventing the barrier from + * rising completely), and the pre-process bio queue isn't empty, then + * don't wait, as we need to empty that queue to get the nr_pending + * count down. + */ + if (atomic_read(&conf->nr_pending) && bio_list && + (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) + return true; + + /* move on if recovery thread is blocked by us */ + if (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) && + conf->nr_queued > 0) + return true; + + return false; +} + static bool wait_barrier(struct r10conf *conf, bool nowait) { bool ret = true; spin_lock_irq(&conf->resync_lock); if (conf->barrier) { - struct bio_list *bio_list = current->bio_list; conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * pre-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. - */ /* Return false when nowait flag is set */ if (nowait) { ret = false; } else { raid10_log(conf->mddev, "wait barrier"); wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (atomic_read(&conf->nr_pending) && - bio_list && - (!bio_list_empty(&bio_list[0]) || - !bio_list_empty(&bio_list[1]))) || - /* move on if recovery thread is - * blocked by us - */ - (conf->mddev->thread->tsk == current && - test_bit(MD_RECOVERY_RUNNING, - &conf->mddev->recovery) && - conf->nr_queued > 0), + stop_waiting_barrier(conf), conf->resync_lock); } conf->nr_waiting--; -- cgit v1.2.3 From 0de57e541bb4207c0602eca271c6531c305e9c5d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 19:34:25 +0800 Subject: md/raid10: don't modify 'nr_waitng' in wait_barrier() for the case nowait For the case nowait in wait_barrier(), there is no point to increase nr_waiting and then decrease it. 
Signed-off-by: Yu Kuai Reviewed-by: Logan Gunthorpe Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 310a6132304f..834a34274976 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -990,17 +990,17 @@ static bool wait_barrier(struct r10conf *conf, bool nowait) spin_lock_irq(&conf->resync_lock); if (conf->barrier) { - conf->nr_waiting++; /* Return false when nowait flag is set */ if (nowait) { ret = false; } else { + conf->nr_waiting++; raid10_log(conf->mddev, "wait barrier"); wait_event_lock_irq(conf->wait_barrier, stop_waiting_barrier(conf), conf->resync_lock); + conf->nr_waiting--; } - conf->nr_waiting--; if (!conf->nr_waiting) wake_up(&conf->wait_barrier); } -- cgit v1.2.3 From 0c0be98bbe67662a7d2bf8381106bfca0e31ed72 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 19:34:26 +0800 Subject: md/raid10: prevent unnecessary calls to wake_up() in fast path Currently, wake_up() is called unconditionally in the fast path, such as in raid10_make_request(), which causes lock contention under high concurrency: raid10_make_request wake_up __wake_up_common_lock spin_lock_irqsave Improve performance by only calling wake_up() if the waitqueue is not empty, in allow_barrier() and raid10_make_request(). Signed-off-by: Yu Kuai Reviewed-by: Logan Gunthorpe Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 834a34274976..461c8a79fb99 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -274,6 +274,12 @@ static void put_buf(struct r10bio *r10_bio) lower_barrier(conf); } +static void wake_up_barrier(struct r10conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void reschedule_retry(struct r10bio *r10_bio) { unsigned long flags; @@ -1015,7 +1021,7 @@ static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || (conf->array_freeze_pending)) - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void freeze_array(struct r10conf *conf, int extra) @@ -1891,7 +1897,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) __make_request(mddev, bio, sectors); /* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); return true; } -- cgit v1.2.3 From 4f350284a7306b3dff676caeafd3faf1b5c068d5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 19:34:27 +0800 Subject: md/raid10: fix improper BUG_ON() in raise_barrier() 'conf->barrier' is protected by 'conf->resync_lock'; reading 'conf->barrier' without holding the lock is wrong.
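(An illustrative sketch of the problem and the fix, with rationale comments added; the surrounding code is abridged:)

	/* Racy: conf->barrier may change between this read and the lock */
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Fixed: the invariant is evaluated with resync_lock held */
	spin_lock_irq(&conf->resync_lock);
	BUG_ON(force && !conf->barrier);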
Signed-off-by: Yu Kuai Reviewed-by: Logan Gunthorpe Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 461c8a79fb99..2970a73d9f5c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -936,8 +936,8 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { - BUG_ON(force && !conf->barrier); spin_lock_irq(&conf->resync_lock); + BUG_ON(force && !conf->barrier); /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, -- cgit v1.2.3 From b9b083f9044abf89f3391fbc196ddece68ac9dba Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 19:34:28 +0800 Subject: md/raid10: convert resync_lock to use seqlock Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier', and io can't be dispatched until 'barrier' is dropped. Since holding the 'barrier' is not common, convert 'resync_lock' to use seqlock so that holding lock can be avoided in fast path. Signed-off-by: Yu Kuai Reviewed-and-Tested-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid10.c | 87 +++++++++++++++++++++++++++++++++++------------------ drivers/md/raid10.h | 2 +- 2 files changed, 59 insertions(+), 30 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2970a73d9f5c..58c711912875 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -79,6 +79,21 @@ static void end_reshape(struct r10conf *conf); #include "raid1-10.c" +#define NULL_CMD +#define cmd_before(conf, cmd) \ + do { \ + write_sequnlock_irq(&(conf)->resync_lock); \ + cmd; \ + } while (0) +#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock) + +#define wait_event_barrier_cmd(conf, cond, cmd) \ + wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \ + cmd_after(conf)) + +#define wait_event_barrier(conf, cond) \ + wait_event_barrier_cmd(conf, cond, NULL_CMD) + /* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'. 
@@ -936,30 +951,29 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock); BUG_ON(force && !conf->barrier); /* Wait until no block IO is waiting (unless 'force') */ - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock); + wait_event_barrier(conf, force || !conf->nr_waiting); /* block any new IO from starting */ - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); /* Now wait for all pending IO to complete */ - wait_event_lock_irq(conf->wait_barrier, - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, - conf->resync_lock); + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) && + conf->barrier < RESYNC_DEPTH); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static void lower_barrier(struct r10conf *conf) { unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + + write_seqlock_irqsave(&conf->resync_lock, flags); + WRITE_ONCE(conf->barrier, conf->barrier - 1); + write_sequnlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -990,11 +1004,31 @@ static bool stop_waiting_barrier(struct r10conf *conf) return false; } +static bool wait_barrier_nolock(struct r10conf *conf) +{ + unsigned int seq = read_seqbegin(&conf->resync_lock); + + if (READ_ONCE(conf->barrier)) + return false; + + atomic_inc(&conf->nr_pending); + if (!read_seqretry(&conf->resync_lock, seq)) + return true; + + if (atomic_dec_and_test(&conf->nr_pending)) + wake_up_barrier(conf); + + return false; +} + static bool wait_barrier(struct r10conf *conf, bool nowait) { bool ret = true; - spin_lock_irq(&conf->resync_lock); + if (wait_barrier_nolock(conf)) + return true; + + write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { /* Return false when nowait flag is set */ if (nowait) { @@ -1002,9 +1036,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait) } else { conf->nr_waiting++; raid10_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - stop_waiting_barrier(conf), - conf->resync_lock); + wait_event_barrier(conf, stop_waiting_barrier(conf)); conf->nr_waiting--; } if (!conf->nr_waiting) @@ -1013,7 +1045,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait) /* Only increment nr_pending when we wait */ if (ret) atomic_inc(&conf->nr_pending); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); return ret; } @@ -1038,27 +1070,24 @@ static void freeze_array(struct r10conf *conf, int extra) * must match the number of pending IOs (nr_pending) before * we continue. 
*/ - spin_lock_irq(&conf->resync_lock); + write_seqlock_irq(&conf->resync_lock); conf->array_freeze_pending++; - conf->barrier++; + WRITE_ONCE(conf->barrier, conf->barrier + 1); conf->nr_waiting++; - wait_event_lock_irq_cmd(conf->wait_barrier, - atomic_read(&conf->nr_pending) == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); - + wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) == + conf->nr_queued + extra, flush_pending_writes(conf)); conf->array_freeze_pending--; - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r10conf *conf) { /* reverse the effect of the freeze */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; + write_seqlock_irq(&conf->resync_lock); + WRITE_ONCE(conf->barrier, conf->barrier - 1); conf->nr_waiting--; wake_up(&conf->wait_barrier); - spin_unlock_irq(&conf->resync_lock); + write_sequnlock_irq(&conf->resync_lock); } static sector_t choose_data_offset(struct r10bio *r10_bio, @@ -4045,7 +4074,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); - spin_lock_init(&conf->resync_lock); + seqlock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); atomic_set(&conf->nr_pending, 0); @@ -4364,7 +4393,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) rdev->new_raid_disk = rdev->raid_disk * 2; rdev->sectors = size; } - conf->barrier = 1; + WRITE_ONCE(conf->barrier, 1); } return conf; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 5c0804d8bb1f..8c072ce0bc54 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -76,7 +76,7 @@ struct r10conf { /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - spinlock_t resync_lock; + seqlock_t resync_lock; atomic_t nr_pending; int nr_waiting; int nr_queued; -- cgit v1.2.3 From 5e2cf333b7bd5d3e62595a44d598a254c697cd74 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 21 Sep 2022 10:28:37 -0600 Subject: md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d A complicated deadlock exists when using the journal and an elevated group_thread_cnt. It was found with loop devices, but it's not clear whether it can be seen with real disks. The deadlock can occur simply by writing data with an fio script.
When the deadlock occurs, multiple threads will hang in different ways: 1) The group threads will hang in the blk-wbt code with bios waiting to be submitted to the block layer: io_schedule+0x70/0xb0 rq_qos_wait+0x153/0x210 wbt_wait+0x115/0x1b0 io_schedule+0x70/0xb0 rq_qos_wait+0x153/0x210 wbt_wait+0x115/0x1b0 __rq_qos_throttle+0x38/0x60 blk_mq_submit_bio+0x589/0xcd0 wbt_wait+0x115/0x1b0 __rq_qos_throttle+0x38/0x60 blk_mq_submit_bio+0x589/0xcd0 __submit_bio+0xe6/0x100 submit_bio_noacct_nocheck+0x42e/0x470 submit_bio_noacct+0x4c2/0xbb0 ops_run_io+0x46b/0x1a30 handle_stripe+0xcd3/0x36b0 handle_active_stripes.constprop.0+0x6f6/0xa60 raid5_do_work+0x177/0x330 Or: io_schedule+0x70/0xb0 rq_qos_wait+0x153/0x210 wbt_wait+0x115/0x1b0 __rq_qos_throttle+0x38/0x60 blk_mq_submit_bio+0x589/0xcd0 __submit_bio+0xe6/0x100 submit_bio_noacct_nocheck+0x42e/0x470 submit_bio_noacct+0x4c2/0xbb0 flush_deferred_bios+0x136/0x170 raid5_do_work+0x262/0x330 2) The r5l_reclaim thread will hang in the same way, submitting a bio to the block layer: io_schedule+0x70/0xb0 rq_qos_wait+0x153/0x210 wbt_wait+0x115/0x1b0 __rq_qos_throttle+0x38/0x60 blk_mq_submit_bio+0x589/0xcd0 __submit_bio+0xe6/0x100 submit_bio_noacct_nocheck+0x42e/0x470 submit_bio_noacct+0x4c2/0xbb0 submit_bio+0x3f/0xf0 md_super_write+0x12f/0x1b0 md_update_sb.part.0+0x7c6/0xff0 md_update_sb+0x30/0x60 r5l_do_reclaim+0x4f9/0x5e0 r5l_reclaim_thread+0x69/0x30b However, before hanging, the MD_SB_CHANGE_PENDING flag will be set for sb_flags in r5l_write_super_and_discard_space(). This flag will never be cleared because the submit_bio() call never returns. 3) Due to the MD_SB_CHANGE_PENDING flag being set, handle_stripe() will do no processing on any pending stripes and re-set STRIPE_HANDLE. This will cause the raid5d thread to enter an infinite loop, constantly trying to handle the same stripes stuck in the queue. The raid5d thread has a blk_plug that holds a number of bios that are also stuck waiting seeing the thread is in a loop that never schedules. These bios have been accounted for by blk-wbt thus preventing the other threads above from continuing when they try to submit bios. --Deadlock. To fix this, add the same wait_event() that is used in raid5_do_work() to raid5d() such that if MD_SB_CHANGE_PENDING is set, the thread will schedule and wait until the flag is cleared. The schedule action will flush the plug which will allow the r5l_reclaim thread to continue, thus preventing the deadlock. However, md_check_recovery() calls can also clear MD_SB_CHANGE_PENDING from the same thread and can thus deadlock if the thread is put to sleep. So avoid waiting if md_check_recovery() is being called in the loop. 
It's not clear when the deadlock was introduced, but a similar wait_event() call in raid5_do_work() was added in 2017 by this commit: 16d997b78b15 ("md/raid5: simplfy delaying of writes while metadata is updated.") Link: https://lore.kernel.org/r/7f3b87b6-b52a-f737-51d7-a4eec5c44112@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 734f92e75f85..7b820b81d8c2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,6 +36,7 @@ */ #include +#include #include #include #include @@ -6782,7 +6783,18 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); -- cgit v1.2.3 From 65b94b527dfcb700b84d043c5bdf2924663724e7 Mon Sep 17 00:00:00 2001 From: Zhou nan Date: Mon, 19 Sep 2022 21:36:45 -0400 Subject: md: Fix spelling mistake in comments of r5l_log Fix spelling of dones't in comments. Signed-off-by: Zhou nan Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 8e3ca60b5a58..79c73330020b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -125,7 +125,7 @@ struct r5l_log { * reclaimed. if it's 0, reclaim spaces * used by io_units which are in * IO_UNIT_STRIPE_END state (eg, reclaim - * dones't wait for specific io_unit + * doesn't wait for specific io_unit * switching to IO_UNIT_STRIPE_END * state) */ wait_queue_head_t iounit_wait; -- cgit v1.2.3 From 568ec936bf1384fc15873908c96a9aeb62536edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 09:58:15 +0200 Subject: block: replace blk_queue_nowait with bdev_nowait Replace blk_queue_nowait with a bdev_nowait helper that takes the block_device, given that the I/O submission path should not have to look into the request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220927075815.269694-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/md/dm-table.c | 4 +--- drivers/md/md.c | 4 ++-- include/linux/blkdev.h | 6 +++++- io_uring/io_uring.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/block/blk-core.c b/block/blk-core.c index 052444c9b594..dc1fa454ae30 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -713,7 +713,7 @@ void submit_bio_noacct(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT.
*/ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 332f96b58252..d8034ff0cb24 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1856,9 +1856,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_nowait(q); + return !bdev_nowait(dev->bdev); } static bool dm_table_supports_nowait(struct dm_table *t) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9dc0175280b4..3e9a1a00776b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5844,7 +5844,7 @@ int md_run(struct mddev *mddev) } } sysfs_notify_dirent_safe(rdev->sysfs_state); - nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); + nowait = nowait && bdev_nowait(rdev->bdev); } if (!bioset_initialized(&mddev->bio_set)) { @@ -6980,7 +6980,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) * If the new disk does not support REQ_NOWAIT, * disable on the whole MD. */ - if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { + if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..4750772ef228 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) -#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); @@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev) return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_nowait(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ebfdb2212ec2..b2c80c2aa431 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1377,7 +1377,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) static bool io_bdev_nowait(struct block_device *bdev) { - return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); + return !bdev || bdev_nowait(bdev); } /* -- cgit v1.2.3