From abfc426d1b2fb2176df59851a64223b58ddae7e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:09 +0100 Subject: block: pass a block_device to bio_clone_fast Pass a block_device to bio_clone_fast and __bio_clone_fast and give the functions more suitable names. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1adfe4824ef5..4b868e792ba4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2975,10 +2975,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); + bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, + bs); if (!bio) goto free_and_out; - bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; -- cgit v1.2.3 From d5869fdc189f0f12a954a48d58a48104a2f5d044 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Feb 2022 14:52:22 -0800 Subject: block: introduce block_rq_error tracepoint Currently, rasdaemon uses the existing tracepoint block_rq_complete and filters out non-error cases in order to capture block disk errors. But there are a few problems with this approach: 1. Even kernel trace filter could do the filtering work, there is still some overhead after we enable this tracepoint. 2. The filter is merely based on errno, which does not align with kernel logic to check the errors for print_req_error(). 3. block_rq_complete only provides dev major and minor to identify the block device, it is not convenient to use in user-space. So introduce a new tracepoint block_rq_error just for the error case. With this patch, rasdaemon could switch to block_rq_error. Since the new tracepoint has the similar implementation with block_rq_complete, so move the existing code from TRACE_EVENT block_rq_complete() into new event class block_rq_completion(). Then add event for block_rq_complete and block_rq_err respectively from the newly created event class per the suggestion from Chaitanya Kulkarni. Cc: Jens Axboe Cc: Christoph Hellwig Reviewed-by: Steven Rostedt Signed-off-by: Cong Wang Signed-off-by: Chaitanya Kulkarni Signed-off-by: Yang Shi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220210225222.260069-1-shy828301@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- include/trace/events/block.h | 49 ++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 14 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b868e792ba4..6c59ffe765fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -789,8 +789,10 @@ bool blk_update_request(struct request *req, blk_status_t error, #endif if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) + !(req->rq_flags & RQF_QUIET))) { blk_print_req_error(req, error); + trace_block_rq_error(req, error, nr_bytes); + } blk_account_io_completion(req, nr_bytes); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 27170e40e8c9..7f4dfbdf12a6 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -100,19 +100,7 @@ TRACE_EVENT(block_rq_requeue, __entry->nr_sector, 0) ); -/** - * block_rq_complete - block IO operation completed by device driver - * @rq: block operations request - * @error: status code - * @nr_bytes: number of completed bytes - * - * The block_rq_complete tracepoint event indicates that some portion - * of operation request has been completed by the device driver. If - * the @rq->bio is %NULL, then there is absolutely no additional work to - * do for the request. If @rq->bio is non-NULL then there is - * additional work required to complete the request. - */ -TRACE_EVENT(block_rq_complete, +DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), @@ -144,6 +132,41 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector, __entry->error) ); +/** + * block_rq_complete - block IO operation completed by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_complete tracepoint event indicates that some portion + * of operation request has been completed by the device driver. If + * the @rq->bio is %NULL, then there is absolutely no additional work to + * do for the request. If @rq->bio is non-NULL then there is + * additional work required to complete the request. + */ +DEFINE_EVENT(block_rq_completion, block_rq_complete, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + +/** + * block_rq_error - block IO operation error reported by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_error tracepoint event indicates that some portion + * of operation request has failed as reported by the device driver. + */ +DEFINE_EVENT(block_rq_completion, block_rq_error, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), -- cgit v1.2.3 From 248c793359daacd826a7507a258ffe41653efef7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:36 +0100 Subject: blk-mq: make the blk-mq stacking code optional The code to stack blk-mq drivers is only used by dm-multipath, and will preferably stay that way. Make it optional and only selected by device mapper, so that the buildbots more easily catch abuses like the one that slipped in in the ufs driver in the last merged window. Another positive side effects is that kernel builds without device mapper shrink a little bit as well. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 3 +++ block/blk-mq.c | 2 ++ drivers/md/Kconfig | 1 + 3 files changed, 6 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/Kconfig b/block/Kconfig index 205f8d01c695..168b873eb666 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -230,6 +230,9 @@ config BLK_PM config BLOCK_HOLDER_DEPRECATED bool +config BLK_MQ_STACKING + bool + source "block/Kconfig.iosched" endif # BLOCK diff --git a/block/blk-mq.c b/block/blk-mq.c index 6c59ffe765fd..db62d34afb63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2840,6 +2840,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_try_issue_directly(rq->mq_hctx, rq)); } +#ifdef CONFIG_BLK_MQ_STACKING /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits @@ -3017,6 +3018,7 @@ free_and_out: return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); +#endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b5ea378e66cb..998a5cfdbc4e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -204,6 +204,7 @@ config BLK_DEV_DM tristate "Device mapper support" select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN + select BLK_MQ_STACKING depends on DAX || DAX=n help Device-mapper is a low level volume manager. It works by allowing -- cgit v1.2.3 From a5efda3c46a1db9a579d953667906933d5037bf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:37 +0100 Subject: blk-mq: fold blk_cloned_rq_check_limits into blk_insert_cloned_request Fold blk_cloned_rq_check_limits into its only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index db62d34afb63..fc132933397f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2842,26 +2842,14 @@ void blk_mq_submit_bio(struct bio *bio) #ifdef CONFIG_BLK_MQ_STACKING /** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2893,22 +2881,6 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, return BLK_STS_IOERR; } - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - if (rq->q->disk && should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; -- cgit v1.2.3 From 28db4711bf48303814dcfd8d41a41106e90bc374 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:38 +0100 Subject: blk-mq: remove the request_queue argument to blk_insert_cloned_request The request must be submitted to the queue it was allocated for, so remove the extra request_queue argument. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++----- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index fc132933397f..886836a54064 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2843,11 +2843,11 @@ void blk_mq_submit_bio(struct bio *bio) #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request * @rq: the request being queued */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t blk_insert_cloned_request(struct request *rq) { + struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); blk_status_t ret; @@ -2881,8 +2881,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return BLK_STS_IOERR; } - if (rq->q->disk && - should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -2895,7 +2894,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_run_dispatch_ops(rq->q, + blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, ktime_get_ns()); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 579ab6183d4d..2fcc9b7f391b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -311,7 +311,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ clone->rq_flags |= RQF_IO_STAT; clone->start_time_ns = ktime_get_ns(); - r = blk_insert_cloned_request(clone->q, clone); + r = blk_insert_cloned_request(clone); if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d319ffa59354..3a41d50b85d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -952,8 +952,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); -blk_status_t blk_insert_cloned_request(struct request_queue *q, - struct request *rq); +blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; -- cgit v1.2.3 From 7f36b7d02a287ed18d02ae821868aa07b0235521 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:08 +0800 Subject: block: move blk_crypto_bio_prep() out of blk-mq.c blk_crypto_bio_prep() is called for both bio based and blk-mq drivers, so move it out of blk-mq.c, then we can unify this kind of handling. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 21 ++++++++------------- block/blk-mq.c | 3 --- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-core.c b/block/blk-core.c index d4a023667ac1..f03fff1fa391 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -783,24 +783,19 @@ end_io: return false; } -static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) -{ - if (blk_crypto_bio_prep(&bio)) { - if (likely(bio_queue_enter(bio) == 0)) { - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); - } - } -} - static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - if (!disk->fops->submit_bio) + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + + if (!disk->fops->submit_bio) { blk_mq_submit_bio(bio); - else - __submit_bio_fops(disk, bio); + } else if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 886836a54064..7ca0b47246a6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2788,9 +2788,6 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); -- cgit v1.2.3 From 8f5fea65b06de1cc51d4fc23fb4d378d1abd6ed7 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 31 Jan 2022 15:33:37 -0500 Subject: blk-mq: avoid extending delays of active hctx from blk_mq_delay_run_hw_queues When blk_mq_delay_run_hw_queues sets an hctx to run in the future, it can reset the delay length for an already pending delayed work run_work. This creates a scenario where multiple hctx may have their queues set to run, but if one runs first and finds nothing to do, it can reset the delay of another hctx and stall the other hctx's ability to run requests. To avoid this I/O stall when an hctx's run_work is already pending, leave it untouched to run at its current designated time rather than extending its delay. The work will still run which keeps closed the race calling blk_mq_delay_run_hw_queues is needed for while also avoiding the I/O stall. Signed-off-by: David Jeffery Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220131203337.GA17666@redhat Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ca0b47246a6..a05ce7725031 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2179,6 +2179,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; + /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the -- cgit v1.2.3