diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-iosched.c | 107 |
-rw-r--r-- | block/blk-core.c | 14 |
-rw-r--r-- | block/blk-merge.c | 29 |
-rw-r--r-- | block/blk-mq-sched.c | 2 |
-rw-r--r-- | block/blk-mq.c | 20 |
-rw-r--r-- | block/blk-wbt.c | 10 |
6 files changed, 148 insertions, 34 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 47e6ec7427c4..aeca22d91101 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) } /* - * We exploit the bfq_finish_request hook to decrement - * rq_in_driver, but bfq_finish_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, - * rq_in_driver is deceptively lower than it should be - * while this request is in service. This may cause - * bfq_schedule_dispatch to be invoked uselessly. + * We exploit the bfq_finish_requeue_request hook to + * decrement rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start + * this request, without incrementing rq_in_driver. As + * a negative consequence, rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. * * As for implementing an exact solution, the - * bfq_finish_request hook, if defined, is probably - * invoked also on this request. So, by exploiting - * this hook, we could 1) increment rq_in_driver here, - * and 2) decrement it in bfq_finish_request. Such a - * solution would let the value of the counter be - * always accurate, but it would entail using an extra - * interface function. This cost seems higher than the - * benefit, being the frequency of non-elevator-private + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment + * rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface + * function. 
This cost seems higher than the benefit, + * being the frequency of non-elevator-private * requests very low. */ goto start_rq; @@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif +static void bfq_prepare_request(struct request *rq, struct bio *bio); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { + if (WARN_ON_ONCE(!bfqq)) { + /* + * This should never happen. Most likely rq is + * a requeued regular request, being + * re-inserted without being first + * re-prepared. Do a prepare, to avoid + * failure. + */ + bfq_prepare_request(rq, rq->bio); + bfqq = RQ_BFQQ(rq); + } + idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_request_body(struct bfq_queue *bfqq) +static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { bfqq->allocated--; bfq_put_queue(bfqq); } -static void bfq_finish_request(struct request *rq) +/* + * Handle either a requeue or a finish for rq. The things to do are + * the same in both cases: all references to rq are to be dropped. In + * particular, rq is considered completed from the point of view of + * the scheduler. + */ +static void bfq_finish_requeue_request(struct request *rq) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; - if (!rq->elv.icq) + /* + * Requeue and finish hooks are invoked in blk-mq without + * checking whether the involved request is actually still + * referenced in the scheduler. 
To handle this fact, the + * following two checks make this function exit in case of + * spurious invocations, for which there is nothing to do. + * + * First, check whether rq has nothing to do with an elevator. + */ + if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) + return; + + /* + * rq either is not associated with any icq, or is an already + * requeued request that has not (yet) been re-inserted into + * a bfq_queue. + */ + if (!rq->elv.icq || !bfqq) return; - bfqq = RQ_BFQQ(rq); bfqd = bfqq->bfqd; if (rq->rq_flags & RQF_STARTED) @@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq) spin_lock_irqsave(&bfqd->lock, flags); bfq_completed_request(bfqq, bfqd); - bfq_finish_request_body(bfqq); + bfq_finish_requeue_request_body(bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } else { /* * Request rq may be still/already in the scheduler, - * in which case we need to remove it. And we cannot + * in which case we need to remove it (this should + * never happen in case of requeue). And we cannot * defer such a check and removal, to avoid * inconsistencies in the time interval from the end * of this function to the start of the deferred work. @@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } - bfq_finish_request_body(bfqq); + bfq_finish_requeue_request_body(bfqq); } + /* + * Reset private fields. In case of a requeue, this allows + * this function to correctly do nothing if it is spuriously + * invoked again on this same request (see the check at the + * beginning of the function). Probably, a better general + * design would be to prevent blk-mq from invoking the requeue + * or finish hooks of an elevator, for a request that is not + * referred by that elevator. + * + * Resetting the following fields would break the + * request-insertion logic if rq is re-inserted into a bfq + * internal queue, without a re-preparation. 
Here we assume + * that re-insertions of requeued requests, without + * re-preparation, can happen only for pass_through or at_head + * requests (which are not re-inserted into bfq internal + * queues). + */ rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; } @@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = { .ops.mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, + .requeue_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_requeue_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/blk-core.c b/block/blk-core.c index a2005a485335..2d1a7bbe0634 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #include <linux/debugfs.h> +#include <linux/bpf.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -145,6 +146,7 @@ static const struct { [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ @@ -2082,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) return false; } +static noinline int should_fail_bio(struct bio *bio) +{ + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + return -EIO; + return 0; +} +ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); + /* * Remap block n of partition p to block n+start(p) of the disk. 
*/ @@ -2173,7 +2183,7 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_bio(bio)) goto end_io; if (!bio->bi_partno) { @@ -3282,6 +3292,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, { if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); + else if (bio_op(bio) == REQ_OP_DISCARD) + rq->nr_phys_segments = 1; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; diff --git a/block/blk-merge.c b/block/blk-merge.c index 8452fc7164cc..782940c65d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -550,6 +550,24 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } +static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, + struct request *next) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(next->bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -683,9 +701,13 @@ static struct request *attempt_merge(struct request_queue *q, * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector - * counts here. + * counts here. Handle DISCARDs separately, as they + * have separate settings. 
*/ - if (!ll_merge_requests_fn(q, req, next)) + if (req_op(req) == REQ_OP_DISCARD) { + if (!req_attempt_discard_merge(q, req, next)) + return NULL; + } else if (!ll_merge_requests_fn(q, req, next)) return NULL; /* @@ -715,7 +737,8 @@ static struct request *attempt_merge(struct request_queue *q, req->__data_len += blk_rq_bytes(next); - elv_merge_requests(q, req, next); + if (req_op(req) != REQ_OP_DISCARD) + elv_merge_requests(q, req, next); /* * 'next' is going away, so update stats accordingly diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55c0a745b427..25c14c58385c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -259,6 +259,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); default: return false; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 01f271d40825..df93102e2149 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1162,6 +1162,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, return true; } +#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ + bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { @@ -1169,6 +1171,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, struct request *rq, *nxt; bool no_tag = false; int errors, queued; + blk_status_t ret = BLK_STS_OK; if (list_empty(list)) return false; @@ -1181,7 +1184,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, errors = queued = 0; do { struct blk_mq_queue_data bd; - blk_status_t ret; rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { @@ -1226,7 +1228,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE) { + if (ret 
== BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { /* * If an I/O scheduler has been configured and we got a * driver tag for the next request already, free it @@ -1257,6 +1259,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { + bool needs_restart; + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1280,10 +1284,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. + * + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART + * bit is set, run queue after a delay to avoid IO stalls + * that could otherwise occur if the queue is idle. */ - if (!blk_mq_sched_needs_restart(hctx) || + needs_restart = blk_mq_sched_needs_restart(hctx); + if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); + else if (needs_restart && (ret == BLK_STS_RESOURCE)) + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); } return (queued + errors) != 0; @@ -1764,6 +1775,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, *cookie = new_cookie; break; case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: __blk_mq_requeue_request(rq); break; default: @@ -1826,7 +1838,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); - if (ret == BLK_STS_RESOURCE) + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_sched_insert_request(rq, false, true, false); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index ae8de9780085..f92fc84b5e2c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -697,7 +697,15 @@ u64 
wbt_default_latency_nsec(struct request_queue *q) static int wbt_data_dir(const struct request *rq) { - return rq_data_dir(rq); + const int op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH) + return WRITE; + + /* don't account */ + return -1; } int wbt_init(struct request_queue *q) |