From e6c4438ba7cb615448492849970aaf0aaa1cc973 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 8 May 2015 10:51:30 -0700 Subject: blk-mq: fix plugging in blk_sq_make_request The following appears in blk_sq_make_request: /* * If we have multiple hardware queues, just go directly to * one of those for sync IO. */ We clearly don't have multiple hardware queues, here! This comment was introduced with this commit 07068d5b8e (blk-mq: split make request handler for multi and single queue): We want slightly different behavior from them: - On single queue devices, we currently use the per-process plug for deferred IO and for merging. - On multi queue devices, we don't use the per-process plug, but we want to go straight to hardware for SYNC IO. The old code had this: use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); and that was converted to: use_plug = !is_flush_fua && !is_sync; which is not equivalent. For the single queue case, that second half of the && expression is always true. So, what I think was actually intended follows (and this more closely matches what is done in blk_queue_bio). V2: delete the 'likely', which should not be a big deal Signed-off-by: Jeff Moyer Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index ade8a2d1b0aa..a65acffde19a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1309,16 +1309,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO.
- */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1326,7 +1321,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count)) return; @@ -1345,21 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { -- cgit v1.2.3 From 239ad215f0d8388cbe6c09a0fab8ad8ff5dba420 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:31 -0700 Subject: blk-mq: avoid re-initialize request which is failed in direct dispatch If we directly issue a request and it fails, we use blk_mq_merge_queue_io(). But we already assigned bio to a request in blk_mq_bio_to_request. blk_mq_merge_queue_io shouldn't run blk_mq_bio_to_request again. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a65acffde19a..f13d0de42f53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1284,6 +1284,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_end_request(rq, rq->errors); goto done; } + blk_mq_insert_request(rq, false, true, true); + return; } } -- cgit v1.2.3 From f984df1f0f71ef96254411fc3576a10ae561be71 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:32 -0700 Subject: blk-mq: do limited block plug for multiple queue case plug is still helpful for workload with IO merge, but it can be harmful otherwise especially with multiple hardware queues, as there is (supposed) no lock contention in this case and plug can introduce latency. For multiple queues, we do limited plug, e.g. plug only if there is request merge. If a request doesn't have merge with following request, the request will be dispatched immediately. V2: check blk_queue_nomerges() as suggested by Jeff. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 82 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 23 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index f13d0de42f53..902c2eb9a0e7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1224,6 +1224,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it.
Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1235,6 +1267,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; blk_queue_bounce(q, &bio); @@ -1243,6 +1277,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1253,40 +1291,39 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. + * Otherwise the existing request in the plug list will be + * issued. 
So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + if (!list_empty(&plug->mq_list)) { + old_rq = list_first_entry(&plug->mq_list, + struct request, queuelist); + list_del_init(&old_rq->queuelist); } - blk_mq_insert_request(rq, false, true, true); + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) return; - } + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1299,7 +1336,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } -- cgit v1.2.3 From 5b3f341f098d60da2970758db6a05bd851eb6b39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:33 -0700 Subject: blk-mq: make plug work for multiple disks and queues Last patch makes plug work for multiple queue case. However it only works for single disk case, because it assumes only one request in the plug list. If a task is accessing multiple disks, e.g. MD/DM, the assumption is wrong. Let blk_attempt_plug_merge() record request from the same queue. V2: use NULL parameter in !mq case. Fix a bug. Add comments in blk_attempt_plug_merge to make it (hopefully) less confusing.
Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 15 ++++++++++++--- block/blk-mq.c | 14 +++++++++----- block/blk.h | 3 ++- 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index 9dcfb8ec554b..f0be754c7781 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1522,7 +1522,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1542,8 +1543,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1608,7 +1617,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. 
*/ if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 902c2eb9a0e7..31df47443699 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1269,6 +1269,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1278,7 +1279,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1309,9 +1310,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * issued. So the plug list will have one request at most */ if (plug) { - if (!list_empty(&plug->mq_list)) { - old_rq = list_first_entry(&plug->mq_list, - struct request, queuelist); + /* + * The plug list might get flushed before this. 
If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); } list_add_tail(&rq->queuelist, &plug->mq_list); @@ -1360,7 +1364,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); diff --git a/block/blk.h b/block/blk.h index 4b48d55e588e..026d9594142b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); -- cgit v1.2.3 From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2015 09:38:13 +0200 Subject: block: use an atomic_t for mq_freeze_depth lockdep gets unhappy about the not disabling irqs when using the queue_lock around it. Instead of trying to fix that up just switch to an atomic_t and get rid of the lock. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 ++++++++++-------------- include/linux/blkdev.h | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 31df47443699..c382a34fe5ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2da818a48097..bc917956a6d0 100644 
--- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -444,7 +444,7 @@ struct request_queue { struct mutex sysfs_lock; int bypass_depth; - int mq_freeze_depth; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; -- cgit v1.2.3