diff options
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 315 |
1 files changed, 217 insertions, 98 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b2c8e940f59..270cfd9fc6b0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -92,7 +92,7 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv) { struct mq_inflight *mi = priv; - if (rq->part && blk_do_io_stat(rq) && + if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; @@ -120,9 +120,59 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, inflight[1] = mi.inflight[1]; } -void blk_freeze_queue_start(struct request_queue *q) +#ifdef CONFIG_LOCKDEP +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + if (!owner) + return false; + + if (!q->mq_freeze_depth) { + q->mq_freeze_owner = owner; + q->mq_freeze_owner_depth = 1; + return true; + } + + if (owner == q->mq_freeze_owner) + q->mq_freeze_owner_depth += 1; + return false; +} + +/* verify the last unfreeze in owner context */ +static bool blk_unfreeze_check_owner(struct request_queue *q) { + if (!q->mq_freeze_owner) + return false; + if (q->mq_freeze_owner != current) + return false; + if (--q->mq_freeze_owner_depth == 0) { + q->mq_freeze_owner = NULL; + return true; + } + return false; +} + +#else + +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + return false; +} + +static bool blk_unfreeze_check_owner(struct request_queue *q) +{ + return false; +} +#endif + +bool __blk_freeze_queue_start(struct request_queue *q, + struct task_struct *owner) +{ + bool freeze; + mutex_lock(&q->mq_freeze_lock); + freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); @@ -131,6 +181,14 @@ void blk_freeze_queue_start(struct request_queue *q) } else { mutex_unlock(&q->mq_freeze_lock); } + + return freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + if (__blk_freeze_queue_start(q, current)) + blk_freeze_acquire_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -149,35 +207,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); -/* - * Guarantee no request is in use, so we can change any data structure of - * the queue afterward. - */ -void blk_freeze_queue(struct request_queue *q) +void blk_mq_freeze_queue(struct request_queue *q) { - /* - * In the !blk_mq case we are only calling this to kill the - * q_usage_counter, otherwise this increases the freeze depth - * and waits for it to return to zero. For this reason there is - * no blk_unfreeze_queue(), and blk_freeze_queue() is not - * exported to drivers as the only user for unfreeze is blk_mq. - */ blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } - -void blk_mq_freeze_queue(struct request_queue *q) -{ - /* - * ...just an alias to keep freeze and unfreeze actions balanced - * in the blk_mq_* namespace - */ - blk_freeze_queue(q); -} EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { + bool unfreeze; + mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; @@ -187,16 +227,40 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); + + return unfreeze; } void blk_mq_unfreeze_queue(struct request_queue *q) { - __blk_mq_unfreeze_queue(q, false); + if (__blk_mq_unfreeze_queue(q, false)) + blk_unfreeze_release_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* + * non_owner variant of blk_freeze_queue_start + * + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen + * by the same task. This is fragile and should not be used if at all + * possible. + */ +void blk_freeze_queue_start_non_owner(struct request_queue *q) +{ + __blk_freeze_queue_start(q, NULL); +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); + +/* non_owner variant of blk_mq_unfreeze_queue */ +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); + +/* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ @@ -283,8 +347,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } - blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); + + blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); @@ -331,14 +396,9 @@ EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = blk_time_get_ns(); - else - rq->start_time_ns = 0; - #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) - rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif @@ -359,8 +419,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; - if (blk_queue_io_stat(q)) - data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { @@ -420,7 +478,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add(data->cached_rq, rq); + rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) @@ -429,7 +487,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; - return rq_list_pop(data->cached_rq); + return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) @@ -526,7 +584,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, .flags = flags, .cmd_flags = opf, .nr_tags = plug->nr_ios, - .cached_rq = &plug->cached_rq, + .cached_rqs = &plug->cached_rqs, }; struct request *rq; @@ -551,14 +609,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; - if (rq_list_empty(plug->cached_rq)) { + if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; @@ -567,8 +625,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; - plug->cached_rq = rq_list_next(rq); - blk_mq_rq_time_init(rq, 0); + rq_list_pop(&plug->cached_rqs); + blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; @@ -744,7 +802,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } @@ -764,7 +822,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { - if (req->part && blk_do_io_stat(req)) { + if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); @@ -784,7 +842,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* @@ -982,8 +1040,7 @@ static inline void blk_account_io_done(struct request *req, u64 now) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) { + if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); @@ -996,28 +1053,63 @@ static inline void blk_account_io_done(struct request *req, u64 now) } } +static inline bool blk_rq_passthrough_stats(struct request *req) +{ + struct bio *bio = req->bio; + + if (!blk_queue_passthrough_stat(req->q)) + return false; + + /* Requests without a bio do not transfer data. */ + if (!bio) + return false; + + /* + * Stats are accumulated in the bdev, so must have one attached to a + * bio to track stats. Most drivers do not set the bdev for passthrough + * requests, but nvme is one that will set it. + */ + if (!bio->bi_bdev) + return false; + + /* + * We don't know what a passthrough command does, but we know the + * payload size and data direction. Ensuring the size is aligned to the + * block size filters out most commands with payloads that don't + * represent sector access. + */ + if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) + return false; + return true; +} + static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); - if (blk_do_io_stat(req)) { - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. - */ - if (req->bio) - req->part = req->bio->bi_bdev; - else - req->part = req->q->disk->part0; + if (!blk_queue_io_stat(req->q)) + return; + if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) + return; - part_stat_lock(); - update_io_ticks(req->part, jiffies, false); - part_stat_local_inc(req->part, - in_flight[op_is_write(req_op(req))]); - part_stat_unlock(); - } + req->rq_flags |= RQF_IO_STAT; + req->start_time_ns = blk_time_get_ns(); + + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); + part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) @@ -1300,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); + rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } @@ -1698,7 +1789,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } -EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; @@ -2200,6 +2290,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); +static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) +{ + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); + return need_run; +} + /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. @@ -2220,20 +2328,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); - /* - * When queue is quiesced, we may be switching io scheduler, or - * updating nr_hw_queues, or other things, and we can't run queue - * any more, even __blk_mq_hctx_has_pending() can't be called safely. - * - * And queue will be rerun in blk_mq_unquiesce_queue() if it is - * quiesced. - */ - __blk_mq_run_dispatch_ops(hctx->queue, false, - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx)); + need_run = blk_mq_hw_queue_need_run(hctx); + if (!need_run) { + unsigned long flags; - if (!need_run) - return; + /* + * Synchronize with blk_mq_unquiesce_queue(), because we check + * if hw queue is quiesced locklessly above, we need the use + * ->queue_lock to make sure we see the up-to-date status to + * not miss rerunning the hw queue. + */ + spin_lock_irqsave(&hctx->queue->queue_lock, flags); + need_run = blk_mq_hw_queue_need_run(hctx); + spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); + + if (!need_run) + return; + } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); @@ -2390,6 +2501,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + /* + * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the + * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch + * list in the subsequent routine. + */ + smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); @@ -2542,7 +2659,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; - rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, @@ -2620,6 +2736,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return; } @@ -2650,6 +2767,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } @@ -2666,7 +2784,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug) blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); + bool last = rq_list_empty(&plug->mq_list); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2709,8 +2827,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); @@ -2724,12 +2841,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { - rq_list_add_tail(&requeue_lastp, rq); + rq_list_add_tail(&requeue_list, rq); continue; } - list_add(&rq->queuelist, &list); + list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); @@ -2784,19 +2901,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_plug_list(plug, from_schedule); - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2861,7 +2978,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; + data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); @@ -2884,7 +3001,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && @@ -2898,17 +3015,17 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (rq_list_pop(&plug->cached_rqs) != rq) + WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ - plug->cached_rq = rq_list_next(rq); rq_qos_throttle(rq->q, bio); - blk_mq_rq_time_init(rq, 0); + blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } @@ -3187,8 +3304,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - rq->write_hint = rq_src->write_hint; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -4310,6 +4425,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + /* + * ->tag_set has to be setup before initialize hctx, which cpuphp + * handler needs it for checking queue mapping + */ + q->tag_set = set; + if (blk_mq_alloc_ctxs(q)) goto err_exit; @@ -4328,8 +4449,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->tag_set = set; - q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); |