diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 20:23:25 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 20:23:25 +0300 |
commit | 73ba2fb33c492916853dfe63e3b3163da0be661d (patch) | |
tree | c2fda8ca1273744d2e884d24189a15ac1a7d63c2 /block/blk-mq.c | |
parent | 958f338e96f874a0d29442396d6adf9c1e17aa2d (diff) | |
parent | b86d865cb1cae1e61527ea0b8977078bbf694328 (diff) | |
download | linux-73ba2fb33c492916853dfe63e3b3163da0be661d.tar.xz |
Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"First pull request for this merge window, there will also be a
followup request with some stragglers.
This pull request contains:
- Fix for a thundering heard issue in the wbt block code (Anchal
Agarwal)
- A few NVMe pull requests:
* Improved tracepoints (Keith)
* Larger inline data support for RDMA (Steve Wise)
* RDMA setup/teardown fixes (Sagi)
* Effects log suppor for NVMe target (Chaitanya Kulkarni)
* Buffered IO suppor for NVMe target (Chaitanya Kulkarni)
* TP4004 (ANA) support (Christoph)
* Various NVMe fixes
- Block io-latency controller support. Much needed support for
properly containing block devices. (Josef)
- Series improving how we handle sense information on the stack
(Kees)
- Lightnvm fixes and updates/improvements (Mathias/Javier et al)
- Zoned device support for null_blk (Matias)
- AIX partition fixes (Mauricio Faria de Oliveira)
- DIF checksum code made generic (Max Gurtovoy)
- Add support for discard in iostats (Michael Callahan / Tejun)
- Set of updates for BFQ (Paolo)
- Removal of async write support for bsg (Christoph)
- Bio page dirtying and clone fixups (Christoph)
- Set of bcache fix/changes (via Coly)
- Series improving blk-mq queue setup/teardown speed (Ming)
- Series improving merging performance on blk-mq (Ming)
- Lots of other fixes and cleanups from a slew of folks"
* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
blkcg: Make blkg_root_lookup() work for queues in bypass mode
bcache: fix error setting writeback_rate through sysfs interface
null_blk: add lock drop/acquire annotation
Blk-throttle: reduce tail io latency when iops limit is enforced
block: paride: pd: mark expected switch fall-throughs
block: Ensure that a request queue is dissociated from the cgroup controller
block: Introduce blk_exit_queue()
blkcg: Introduce blkg_root_lookup()
block: Remove two superfluous #include directives
blk-mq: count the hctx as active before allocating tag
block: bvec_nr_vecs() returns value for wrong slab
bcache: trivial - remove tailing backslash in macro BTREE_FLAG
bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
bcache: set max writeback rate when I/O request is idle
bcache: add code comments for bset.c
bcache: fix mistaken comments in request.c
bcache: fix mistaken code comments in bcache.h
bcache: add a comment in super.c
bcache: avoid unncessary cache prefetch bch_btree_node_get()
bcache: display rate debug parameters to 0 when writeback is not running
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 173 |
1 files changed, 113 insertions, 60 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 654b0dc7e001..72a0033ccee9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,8 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-stat.h" -#include "blk-wbt.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->tag = -1; rq->internal_tag = tag; } else { - if (blk_mq_tag_busy(data->hctx)) { + if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, if (!op_is_flush(op) && e->type->ops.mq.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.mq.limit_depth(op, data); + } else { + blk_mq_tag_busy(data->hctx); } tag = blk_mq_get_tag(data); @@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); - wbt_done(q->rq_wb, rq); + rq_qos_done(q, rq); if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); @@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_account_io_done(rq, now); if (rq->end_io) { - wbt_done(rq->q->rq_wb, rq); + rq_qos_done(rq->q, rq); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq) rq->throtl_size = blk_rq_sectors(rq); #endif rq->rq_flags |= RQF_STATS; - wbt_issue(q->rq_wb, rq); + rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); @@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), - .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + .flags = BLK_MQ_REQ_NOWAIT, }; - - might_sleep_if(wait); + bool shared; if (rq->tag != -1) goto done; @@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; + shared = blk_mq_tag_busy(data.hctx); rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { - if (blk_mq_tag_busy(data.hctx)) { + if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; atomic_inc(&data.hctx->nr_active); } @@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, } done: - if (hctx) - *hctx = data.hctx; return rq->tag != -1; } @@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + spin_lock(&hctx->dispatch_wait_lock); list_del_init(&wait->entry); + spin_unlock(&hctx->dispatch_wait_lock); + blk_mq_run_hw_queue(hctx, true); return 1; } @@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, * restart. For both cases, take care to check the condition again after * marking us as waiting. */ -static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *this_hctx = *hctx; - struct sbq_wait_state *ws; + struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * It's possible that a tag was freed in the window between the @@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ - return blk_mq_get_driver_tag(rq, hctx, false); + return blk_mq_get_driver_tag(rq); } - wait = &this_hctx->dispatch_wait; + wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&this_hctx->lock); + wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + + spin_lock_irq(&wq->lock); + spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq, wait); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ - ret = blk_mq_get_driver_tag(rq, hctx, false); + ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } @@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ - spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return true; } +#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 +#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 +/* + * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): + * - EWMA is one simple way to compute running average value + * - weight(7/8 and 1/8) is applied so that it can decrease exponentially + * - take 4 as factor for avoiding to get too small(0) result, and this + * factor doesn't matter because EWMA decreases exponentially + */ +static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) +{ + unsigned int ewma; + + if (hctx->queue->elevator) + return; + + ewma = hctx->dispatch_busy; + + if (!ewma && !busy) + return; + + ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; + if (busy) + ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; + ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; + + hctx->dispatch_busy = ewma; +} + #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ /* @@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The @@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_mark_tag_wait(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(hctx, rq)) { blk_mq_put_dispatch_budget(hctx); /* * For non-shared tags, the RESTART check @@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bd.last = true; else { nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + bd.last = !blk_mq_get_driver_tag(nxt); } ret = q->mq_ops->queue_rq(hctx, &bd); @@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, else if (needs_restart && (ret == BLK_STS_RESOURCE)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); + blk_mq_update_dispatch_busy(hctx, true); return false; - } + } else + blk_mq_update_dispatch_busy(hctx, false); /* * If the host/device is unable to accept more work, inform the @@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list) { + struct request *rq; + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ - spin_lock(&ctx->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); + list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - list_del_init(&rq->queuelist); - __blk_mq_insert_req_list(hctx, rq, false); + trace_block_rq_insert(hctx->queue, rq); } + + spin_lock(&ctx->lock); + list_splice_tail_init(list, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } @@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: + blk_mq_update_dispatch_busy(hctx, false); *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: + blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: + blk_mq_update_dispatch_busy(hctx, false); *cookie = BLK_QC_T_NONE; break; } @@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_dispatch_budget(hctx)) goto insert; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); goto insert; } @@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) return ret; } +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list) +{ + while (!list_empty(list)) { + blk_status_t ret; + struct request *rq = list_first_entry(list, struct request, + queuelist); + + list_del_init(&rq->queuelist); + ret = blk_mq_request_issue_directly(rq); + if (ret != BLK_STS_OK) { + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + list_add(&rq->queuelist, list); + break; + } + blk_mq_end_request(rq, ret); + } + } +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); @@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; - unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; } - wbt_track(rq, wb_acct); + rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); @@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if (q->nr_hw_queues > 1 && is_sync) { + } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && + !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); @@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); @@ -2331,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); + if (shared) hctx->flags |= BLK_MQ_F_TAG_SHARED; - } else { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); + else hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } } } @@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); INIT_LIST_HEAD(&q->tag_set_list); } @@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { if (set->ops->map_queues) { - int cpu; /* * transport .map_queues is usually done in the following * way: @@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return set->ops->map_queues(set); } else @@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the - * requested depth down, if if it too large. In that case, the set + * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) |