From e6c4438ba7cb615448492849970aaf0aaa1cc973 Mon Sep 17 00:00:00 2001
From: Jeff Moyer
Date: Fri, 8 May 2015 10:51:30 -0700
Subject: blk-mq: fix plugging in blk_sq_make_request

The following appears in blk_sq_make_request:

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */

We clearly don't have multiple hardware queues here! This comment was
introduced with commit 07068d5b8e (blk-mq: split make request handler
for multi and single queue):

    We want slightly different behavior from them:

    - On single queue devices, we currently use the per-process plug
      for deferred IO and for merging.

    - On multi queue devices, we don't use the per-process plug, but
      we want to go straight to hardware for SYNC IO.

The old code had this:

	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

and that was converted to:

	use_plug = !is_flush_fua && !is_sync;

which is not equivalent. For the single queue case, the second half of
the && expression is always true. So, what I think was actually intended
follows (and this more closely matches what is done in blk_queue_bio).

V2: delete the 'likely', which should not be a big deal

Signed-off-by: Jeff Moyer
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..a65acffde19a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1309,16 +1309,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
 
-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,7 +1321,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	if (use_plug && !blk_queue_nomerges(q) &&
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count))
 		return;
 
@@ -1345,21 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
 		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
--
cgit v1.2.3
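The non-equivalence argued in the commit message above is easy to check
mechanically. Below is a minimal standalone C sketch (a userspace
illustration, not kernel code; the variables simply mirror the names in
the message) that enumerates all inputs for the single-queue case:

	/*
	 * Enumerate the three plugging conditions from the commit message
	 * for nr_hw_queues == 1 and show where the converted form disagrees.
	 */
	#include <stdio.h>

	int main(void)
	{
		const int nr_hw_queues = 1;	/* blk_sq_make_request implies this */

		for (int is_flush_fua = 0; is_flush_fua <= 1; is_flush_fua++)
			for (int is_sync = 0; is_sync <= 1; is_sync++) {
				int old = !is_flush_fua &&
					  ((nr_hw_queues == 1) || !is_sync);
				int converted = !is_flush_fua && !is_sync;
				int fixed = !is_flush_fua;	/* what the patch uses */

				printf("flush_fua=%d sync=%d: old=%d converted=%d fixed=%d\n",
				       is_flush_fua, is_sync, old, converted, fixed);
			}
		return 0;
	}

For a sync, non-flush bio the converted expression yields 0 where the old
one yielded 1; with nr_hw_queues fixed at 1, the old expression reduces to
!is_flush_fua, which is exactly what the patch restores.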
From 239ad215f0d8388cbe6c09a0fab8ad8ff5dba420 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 8 May 2015 10:51:31 -0700
Subject: blk-mq: avoid re-initializing a request that failed direct dispatch

If we directly issue a request and it fails, we fall back to
blk_mq_merge_queue_io(). But the bio was already assigned to the request
in blk_mq_bio_to_request, so blk_mq_merge_queue_io shouldn't run
blk_mq_bio_to_request again.

Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a65acffde19a..f13d0de42f53 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1284,6 +1284,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 				blk_mq_end_request(rq, rq->errors);
 				goto done;
 			}
+			blk_mq_insert_request(rq, false, true, true);
+			return;
 		}
 	}
 
--
cgit v1.2.3

From f984df1f0f71ef96254411fc3576a10ae561be71 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 8 May 2015 10:51:32 -0700
Subject: blk-mq: do limited block plug for multiple queue case

Plugging is still helpful for workloads with IO merging, but it can be
harmful otherwise, especially with multiple hardware queues: there is
(supposedly) no lock contention in this case, and plugging can introduce
latency. For multiple queues, we do limited plugging, i.e. plug only if
there is a request merge. If a request can't be merged with the following
request, it will be dispatched immediately.

V2: check blk_queue_nomerges() as suggested by Jeff.

Cc: Jens Axboe
Cc: Christoph Hellwig
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 82 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 23 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f13d0de42f53..902c2eb9a0e7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1224,6 +1224,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
  * Multiple hardware queue variant. This will not use per-process plugs,
  * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1267,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1243,6 +1277,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1253,40 +1291,39 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
+		 * we do limited plugging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
+		if (plug) {
+			if (!list_empty(&plug->mq_list)) {
+				old_rq = list_first_entry(&plug->mq_list,
+					struct request, queuelist);
+				list_del_init(&old_rq->queuelist);
 			}
-			blk_mq_insert_request(rq, false, true, true);
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
 			return;
-		}
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1299,7 +1336,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }
 
--
cgit v1.2.3
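The invariant claimed in the comment above, that the plug list holds at
most one request in this path, can be seen in a small standalone C sketch
(a userspace analogue with invented stand-in types, not the kernel
implementation): queuing a new item immediately dispatches whatever was
already held.

	#include <stdio.h>

	struct plug {
		int has_pending;
		int pending;		/* stand-in for a struct request */
	};

	static void dispatch(int req)
	{
		printf("direct issue of request %d\n", req);
	}

	/* mirrors the plug branch above: swap in the new request, issue the old */
	static void limited_plug_add(struct plug *plug, int req)
	{
		int old = -1;

		if (plug->has_pending)
			old = plug->pending;	/* list_del_init() in the real code */
		plug->pending = req;		/* list_add_tail() in the real code */
		plug->has_pending = 1;
		if (old >= 0)
			dispatch(old);		/* blk_mq_direct_issue_request() */
	}

	int main(void)
	{
		struct plug plug = { 0, 0 };

		for (int req = 1; req <= 3; req++)
			limited_plug_add(&plug, req);	/* issues 1, then 2 */
		if (plug.has_pending)
			dispatch(plug.pending);		/* flush issues 3 */
		return 0;
	}

So merging still gets a window (the one held request), but nothing sits in
the plug list behind it, which keeps latency low on multiqueue hardware.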
From 5b3f341f098d60da2970758db6a05bd851eb6b39 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 8 May 2015 10:51:33 -0700
Subject: blk-mq: make plug work for multiple disks and queues

The last patch makes plugging work for the multiple queue case, but it
only works for a single disk, because it assumes there is only one
request in the plug list. If a task is accessing multiple disks, e.g.
MD/DM, that assumption is wrong. Let blk_attempt_plug_merge() record the
request from the same queue.

V2: use a NULL parameter in the !mq case. Fix a bug. Add comments in
blk_attempt_plug_merge to make it (hopefully) less confusing.

Cc: Jens Axboe
Cc: Christoph Hellwig
Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 15 ++++++++++++---
 block/blk-mq.c   | 14 +++++++++-----
 block/blk.h      |  3 ++-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9dcfb8ec554b..f0be754c7781 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1522,7 +1522,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count)
+			    unsigned int *request_count,
+			    struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -1542,8 +1543,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		int el_ret;
 
-		if (rq->q == q)
+		if (rq->q == q) {
 			(*request_count)++;
+			/*
+			 * Only blk-mq multiple hardware queues case checks the
+			 * rq in the same queue, there should be only one such
+			 * rq in a queue
+			 **/
+			if (same_queue_rq)
+				*same_queue_rq = rq;
+		}
 
 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
 			continue;
@@ -1608,7 +1617,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	spin_lock_irq(q->queue_lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 902c2eb9a0e7..31df47443699 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1269,6 +1269,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	struct request *rq;
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1278,7 +1279,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
@@ -1309,9 +1310,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	 * issued. So the plug list will have one request at most
 	 */
 	if (plug) {
-		if (!list_empty(&plug->mq_list)) {
-			old_rq = list_first_entry(&plug->mq_list,
-				struct request, queuelist);
+		/*
+		 * The plug list might get flushed before this. If that
+		 * happens, same_queue_rq is invalid and plug list is empty
+		 **/
+		if (same_queue_rq && !list_empty(&plug->mq_list)) {
+			old_rq = same_queue_rq;
 			list_del_init(&old_rq->queuelist);
 		}
 		list_add_tail(&rq->queuelist, &plug->mq_list);
@@ -1360,7 +1364,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
diff --git a/block/blk.h b/block/blk.h
index 4b48d55e588e..026d9594142b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count);
+			    unsigned int *request_count,
+			    struct request **same_queue_rq);
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
--
cgit v1.2.3
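To make the new same_queue_rq bookkeeping concrete, here is a standalone C
sketch (invented, simplified types; the real code walks a struct list_head
with list_for_each_entry_reverse): walk the plug list, count requests for
the queue at hand, and remember the one from the same queue.

	#include <stdio.h>

	struct request {
		int q;			/* stand-in for rq->q */
		struct request *next;
	};

	static struct request *find_same_queue_rq(struct request *plug_list, int q,
						  unsigned int *request_count)
	{
		struct request *same_queue_rq = NULL;

		for (struct request *rq = plug_list; rq; rq = rq->next) {
			if (rq->q == q) {
				(*request_count)++;
				/* in the limited-plug scheme there is at most one hit */
				same_queue_rq = rq;
			}
		}
		return same_queue_rq;
	}

	int main(void)
	{
		/* plug list holding requests for two disks (queues 1 and 2) */
		struct request r2 = { 2, NULL }, r1 = { 1, &r2 };
		unsigned int count = 0;
		struct request *rq = find_same_queue_rq(&r1, 2, &count);

		printf("queue 2: count=%u, same_queue_rq found=%s\n",
		       count, rq ? "yes" : "no");
		return 0;
	}

Taking the recorded same-queue request instead of list_first_entry() is
the whole fix: with MD/DM, the head of the plug list may belong to a
different disk's queue.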
From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 7 May 2015 09:38:13 +0200
Subject: block: use an atomic_t for mq_freeze_depth

lockdep gets unhappy about not disabling irqs when using the queue_lock
around it. Instead of trying to fix that up, just switch to an atomic_t
and get rid of the lock.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-mq.c         | 24 ++++++++++--------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 31df47443699..c382a34fe5ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2da818a48097..bc917956a6d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -444,7 +444,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
-	int			mq_freeze_depth;
+	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
--
cgit v1.2.3
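The lockless pattern above is the classic refcounted freeze: only the
0->1 transition freezes and only the 1->0 transition unfreezes, so no
lock is needed around the counter. A userspace analogue using C11 atomics
(the kernel uses its own atomic_t API; this is just the same idea):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int freeze_depth;

	static void freeze_start(void)
	{
		/* atomic_inc_return() analogue: fetch_add returns the old value */
		if (atomic_fetch_add(&freeze_depth, 1) + 1 == 1)
			printf("first freezer: kill percpu ref, run queues\n");
	}

	static void unfreeze(void)
	{
		int depth = atomic_fetch_sub(&freeze_depth, 1) - 1;

		if (depth < 0)
			printf("WARN: unbalanced unfreeze\n");
		else if (depth == 0)
			printf("last unfreezer: reinit percpu ref, wake waiters\n");
	}

	int main(void)
	{
		freeze_start();	/* depth 0->1: freezes */
		freeze_start();	/* nested, no side effect */
		unfreeze();	/* depth 2->1: still frozen */
		unfreeze();	/* depth 1->0: unfreezes */
		return 0;
	}

Because the return value of the atomic op decides the transition, two
concurrent freezers cannot both see the 0->1 edge, which is what the
spinlock used to guarantee.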
From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 1 Jun 2015 09:29:53 -0600
Subject: blk-mq: Shared tag enhancements

Storage controllers may expose multiple block devices that share
hardware resources managed by blk-mq. This patch enhances the shared
tags so a low-level driver can access the shared resources not tied to
the unshared h/w contexts. This way the LLD can dynamically add and
delete disks and request queues without having to track all the
request_queue hctx's to iterate outstanding tags.

Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 block/blk-mq-tag.c     | 38 ++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 12 ++++++++++--
 include/linux/blk-mq.h |  4 ++++
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644..9b6e28830b82 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308..75893a34237d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c382a34fe5ac..ef100fd2cb86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1525,7 +1525,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1821,6 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2187,6 +2187,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2248,8 +2254,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2056a99b92f8..37d1602c4f7a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
+typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 
 struct blk_mq_ops {
 	/*
@@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
--
cgit v1.2.3
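The bt_tags_for_each() added above is a straightforward walk of the set
bits in the tag bitmap, mapping each bit to a tag number via an offset.
A standalone C sketch of the same bit-walk, simplified to a single 64-bit
word instead of the kernel's blk_align_bitmap chunks and find_first_bit/
find_next_bit helpers:

	#include <stdint.h>
	#include <stdio.h>

	typedef void (busy_tag_iter_fn)(unsigned int tag, void *data);

	static void for_each_busy_tag(uint64_t word, unsigned int off,
				      busy_tag_iter_fn *fn, void *data)
	{
		while (word) {
			/* lowest set bit; GCC/Clang builtin stands in for find_*_bit */
			unsigned int bit = __builtin_ctzll(word);

			fn(off + bit, data);
			word &= word - 1;	/* clear it, like find_next_bit(bit + 1) */
		}
	}

	static void print_tag(unsigned int tag, void *data)
	{
		printf("busy tag %u\n", tag);
	}

	int main(void)
	{
		/* bits 0, 3 and 7 in flight; reserved tags occupy offsets 0..1 */
		for_each_busy_tag(0x89, 2, print_tag, NULL);	/* prints 2, 5, 9 */
		return 0;
	}

The offset argument is why the exported blk_mq_all_tag_busy_iter() passes
nr_reserved_tags when walking the normal bitmap: reserved tags come first
in the shared tag space, so normal tags are numbered after them.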