From 3bc5e683c67d94bd839a1da2e796c15847b51b69 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:44 +0200 Subject: bfq: Split shared queues on move between cgroups When bfqq is shared by multiple processes it can happen that one of the processes gets moved to a different cgroup (or just starts submitting IO for different cgroup). In case that happens we need to split the merged bfqq as otherwise we will have IO for multiple cgroups in one bfqq and we will just account IO time to wrong entities etc. Similarly if the bfqq is scheduled to merge with another bfqq but the merge didn't happen yet, cancel the merge as it need not be valid anymore. CC: stable@vger.kernel.org Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'block/bfq-iosched.h') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3b83e3d1c2e5..a56763045d19 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -979,6 +979,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); -- cgit v1.2.3 From 09f871868080c33992cd6a9b72a5ca49582578fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:48 +0200 Subject: bfq: Track whether bfq_group is still online Track whether bfq_group is still online. We cannot rely on blkcg_gq->online because that gets cleared only after all policies are offlined and we need something that gets updated already under bfqd->lock when we are cleaning up our bfq_group to be able to guarantee that when we see online bfq_group, it will stay online while we are holding bfqd->lock lock. CC: stable@vger.kernel.org Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-7-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 3 ++- block/bfq-iosched.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'block/bfq-iosched.h') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9352f3cc2377..879380c2bc7e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -557,6 +557,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -603,7 +604,6 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct bfq_entity *entity; bfqg = bfq_lookup_bfqg(bfqd, blkcg); - if (unlikely(!bfqg)) return NULL; @@ -979,6 +979,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) put_async_queues: bfq_put_async_queues(bfqd, bfqg); + bfqg->online = false; spin_unlock_irqrestore(&bfqd->lock, flags); /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a56763045d19..4664e2f3e828 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -928,6 +928,8 @@ struct bfq_group { /* reference counter (see comments in bfq_bic_update_cgroup) */ int ref; + /* Is bfq_group still online? */ + bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; -- cgit v1.2.3 From 4e54a2493e582361adc3bfbf06c7d50d19d18837 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:49 +0200 Subject: bfq: Get rid of __bio_blkcg() usage BFQ usage of __bio_blkcg() is a relict from the past. Furthermore if bio would not be associated with any blkcg, the usage of __bio_blkcg() in BFQ is prone to races with the task being migrated between cgroups as __bio_blkcg() calls at different places could return different blkcgs. Convert BFQ to the new situation where bio->bi_blkg is initialized in bio_set_dev() and thus practically always valid. This allows us to save blkcg_gq lookup and noticeably simplify the code. CC: stable@vger.kernel.org Fixes: 0fe061b9f03c ("blkcg: fix ref count issue with bio_blkcg() using task_css") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-8-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 63 +++++++++++++++++++---------------------------------- block/bfq-iosched.c | 11 +--------- block/bfq-iosched.h | 3 +-- 3 files changed, 25 insertions(+), 52 deletions(-) (limited to 'block/bfq-iosched.h') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 879380c2bc7e..32d2c2a47480 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -586,27 +586,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg, entity->sched_data = &parent->sched_data; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct bfq_group *bfqg, *parent; + struct bfq_group *parent; struct bfq_entity *entity; - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - if (unlikely(!bfqg)) - return NULL; - /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet @@ -623,8 +607,15 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, bfq_group_set_parent(curr_bfqg, parent); } } +} - return bfqg; +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (!blkg) + return bfqd->root_group; + return blkg_to_bfqg(blkg); } /** @@ -714,25 +705,15 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) +static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - if (async_bfqq) { entity = &async_bfqq->entity; @@ -784,20 +765,24 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg = NULL; + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; - rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. + */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following @@ -850,8 +835,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); } /** @@ -1469,7 +1452,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d7cf930b47bb..e47c75f1fa0f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5726,14 +5726,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq; struct bfq_group *bfqg; - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - + bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); @@ -5779,8 +5772,6 @@ out: if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); - - rcu_read_unlock(); return bfqq; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 4664e2f3e828..978ef5d6fe6a 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1009,8 +1009,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); -- cgit v1.2.3 From f4a6a61cb6d40d9ae63e47743d33200f3efe3fe7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:16 +0200 Subject: blktrace: cleanup the __trace_note_message interface Pass the cgroup_subsys_state instead of a the blkg so that blktrace doesn't need to poke into blk-cgroup internals, and give the name a blk prefix as the current name is way too generic for a public interface. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 4 ++-- block/blk-throttle.c | 2 +- include/linux/blktrace_api.h | 10 ++++------ kernel/trace/blktrace.c | 20 ++++++++++---------- 4 files changed, 17 insertions(+), 19 deletions(-) (limited to 'block/bfq-iosched.h') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 978ef5d6fe6a..b18d6c31c225 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1102,13 +1102,13 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); break; \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ + &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ } while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 469c483719be..447e1b8722f7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -227,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ - tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 22501a293fa5..623e22492afa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -27,12 +27,10 @@ struct blk_trace { atomic_t dropped; }; -struct blkcg; - extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(3, 4) -void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); +__printf(3, 4) void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -47,14 +45,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, css, fmt, ...) \ do { \ struct blk_trace *bt; \ \ rcu_read_lock(); \ bt = rcu_dereference((q)->blk_trace); \ if (unlikely(bt)) \ - __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ + __blk_trace_note_message(bt, css, fmt, ##__VA_ARGS__);\ rcu_read_unlock(); \ } while (0) #define blk_add_trace_msg(q, fmt, ...) \ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4d5629196d01..9ef349ac49c0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, - const char *fmt, ...) +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; + u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) @@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_id(blkcg->css.cgroup) : 1); -#else - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; #endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__trace_note_message); +EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) @@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, NULL, "%s", msg); + __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; -- cgit v1.2.3 From f950667356ce90a41b446b726d4595a10cb65415 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2022 12:52:29 +0200 Subject: bfq: Relax waker detection for shared queues Currently we look for waker only if current queue has no requests. This makes sense for bfq queues with a single process however for shared queues when there is a larger number of processes the condition that queue has no requests is difficult to meet because often at least one process has some request in flight although all the others are waiting for the waker to do the work and this harms throughput. Relax the "no queued request for bfq queue" condition to "the current task has no queued requests yet". For this, we also need to start tracking number of requests in flight for each task. This patch (together with the following one) restores the performance for dbench with 128 clients that regressed with commit c65e6fd460b4 ("bfq: Do not let waker requests skip proper accounting") because this commit makes requests of wakers properly enter BFQ queues and thus these queues become ineligible for the old waker detection logic. Dbench results: Vanilla 5.18-rc3 5.18-rc3 + revert 5.18-rc3 patched Mean 1237.36 ( 0.00%) 950.16 * 23.21%* 988.35 * 20.12%* Numbers are time to complete workload so lower is better. Fixes: c65e6fd460b4 ("bfq: Do not let waker requests skip proper accounting") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220519105235.31397-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 +++-- block/bfq-iosched.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'block/bfq-iosched.h') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 92f0a829a804..7545f589d8c3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2129,7 +2129,6 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - bfqq->dispatched > 0 || now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; @@ -2210,7 +2209,7 @@ static void bfq_add_request(struct request *rq) */ WRITE_ONCE(bfqd->queued, bfqd->queued + 1); - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) { bfq_check_waker(bfqd, bfqq, now_ns); /* @@ -6573,6 +6572,7 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); } bfq_finish_requeue_request_body(bfqq); + RQ_BIC(rq)->requests--; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -6806,6 +6806,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq_request_allocated(bfqq); bfqq->ref++; + bic->requests++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index b18d6c31c225..ca8177d7bf7c 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -468,6 +468,7 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ + unsigned int requests; /* Number of requests this process has in flight */ }; /** -- cgit v1.2.3