diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-06-03 20:14:48 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-06-03 20:14:48 +0300 |
commit | 34845d92bca527b5c2cf8b2293b71b9c746c79ca (patch) | |
tree | a1fd77581f6275843d0305a966d42d5b1b4748cf | |
parent | 5ac8bdb9ad47334a9590e29daf7e4149b0a34729 (diff) | |
parent | 41e46b3c2aa24f755b2ae9ec4ce931ba5f0d8532 (diff) | |
download | linux-34845d92bca527b5c2cf8b2293b71b9c746c79ca.tar.xz |
Merge tag 'for-5.19/block-2022-06-02' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
"Just a collection of fixes that have been queued up since the initial
merge window pull request, the majority of which are targeted for
stable as well.
One bio_set fix that fixes an issue with the dm adoption of cached bio
structs that got introduced in this merge window"
* tag 'for-5.19/block-2022-06-02' of git://git.kernel.dk/linux-block:
block: Fix potential deadlock in blk_ia_range_sysfs_show()
block: fix bio_clone_blkg_association() to associate with proper blkcg_gq
block: remove useless BUG_ON() in blk_mq_put_tag()
blk-mq: do not update io_ticks with passthrough requests
block: make bioset_exit() fully resilient against being called twice
block: use bio_queue_enter instead of blk_queue_enter in bio_poll
block: document BLK_STS_AGAIN usage
block: take destination bvec offsets into account in bio_copy_data_iter
blk-iolatency: Fix inflight count imbalances and IO hangs on offline
blk-mq: don't touch ->tagset in blk_mq_get_sq_hctx
-rw-r--r-- | block/bio.c | 9 | ||||
-rw-r--r-- | block/blk-cgroup.c | 8 | ||||
-rw-r--r-- | block/blk-core.c | 2 | ||||
-rw-r--r-- | block/blk-ia-ranges.c | 7 | ||||
-rw-r--r-- | block/blk-iolatency.c | 122 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 1 | ||||
-rw-r--r-- | block/blk-mq.c | 10 | ||||
-rw-r--r-- | include/linux/blk_types.h | 4 |
8 files changed, 83 insertions, 80 deletions
diff --git a/block/bio.c b/block/bio.c index a3893d80dccc..f92d0223247b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -722,6 +722,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs) bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); + bs->cache = NULL; } /** @@ -1366,10 +1367,12 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); - void *src_buf; + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = bvec_kmap_local(&dst_bv); - src_buf = bvec_kmap_local(&src_bv); - memcpy_to_bvec(&dst_bv, src_buf); + memcpy(dst_buf, src_buf, bytes); + + kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 40161a3f68d0..764e740b0c0f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1974,12 +1974,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_blkg) { - if (dst->bi_blkg) - blkg_put(dst->bi_blkg); - blkg_get(src->bi_blkg); - dst->bi_blkg = src->bi_blkg; - } + if (src->bi_blkg) + bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); diff --git a/block/blk-core.c b/block/blk-core.c index 80fa73c419a9..06ff5bbfe8f6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -939,7 +939,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) blk_flush_plug(current->plug, false); - if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) + if (bio_queue_enter(bio)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 18c68d8b9138..56ed48d2954e 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -54,13 +54,8 @@ static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj, container_of(attr, struct blk_ia_range_sysfs_entry, attr); struct blk_independent_access_range *iar = container_of(kobj, struct blk_independent_access_range, kobj); - ssize_t ret; - mutex_lock(&iar->queue->sysfs_lock); - ret = entry->show(iar, buf); - mutex_unlock(&iar->queue->sysfs_lock); - - return ret; + return entry->show(iar, buf); } static const struct sysfs_ops blk_ia_range_sysfs_ops = { diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 5b676c7cf2b6..9568bf8dfe82 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -87,7 +87,17 @@ struct iolatency_grp; struct blk_iolatency { struct rq_qos rqos; struct timer_list timer; - atomic_t enabled; + + /* + * ->enabled is the master enable switch gating the throttling logic and + * inflight tracking. The number of cgroups which have iolat enabled is + * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly + * from ->enable_work with the request_queue frozen. For details, See + * blkiolatency_enable_work_fn(). + */ + bool enabled; + atomic_t enable_cnt; + struct work_struct enable_work; }; static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) @@ -95,11 +105,6 @@ static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) return container_of(rqos, struct blk_iolatency, rqos); } -static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) -{ - return atomic_read(&blkiolat->enabled) > 0; -} - struct child_latency_info { spinlock_t lock; @@ -464,7 +469,7 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); - if (!blk_iolatency_enabled(blkiolat)) + if (!blkiolat->enabled) return; while (blkg && blkg->parent) { @@ -594,7 +599,6 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) u64 window_start; u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); - bool enabled = false; int inflight = 0; blkg = bio->bi_blkg; @@ -605,8 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat) return; - enabled = blk_iolatency_enabled(iolat->blkiolat); - if (!enabled) + if (!iolat->blkiolat->enabled) return; now = ktime_to_ns(ktime_get()); @@ -645,6 +648,7 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); del_timer_sync(&blkiolat->timer); + flush_work(&blkiolat->enable_work); blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); kfree(blkiolat); } @@ -716,6 +720,44 @@ next: rcu_read_unlock(); } +/** + * blkiolatency_enable_work_fn - Enable or disable iolatency on the device + * @work: enable_work of the blk_iolatency of interest + * + * iolatency needs to keep track of the number of in-flight IOs per cgroup. This + * is relatively expensive as it involves walking up the hierarchy twice for + * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we + * want to disable the in-flight tracking. + * + * We have to make sure that the counting is balanced - we don't want to leak + * the in-flight counts by disabling accounting in the completion path while IOs + * are in flight. This is achieved by ensuring that no IO is in flight by + * freezing the queue while flipping ->enabled. As this requires a sleepable + * context, ->enabled flipping is punted to this work function. + */ +static void blkiolatency_enable_work_fn(struct work_struct *work) +{ + struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, + enable_work); + bool enabled; + + /* + * There can only be one instance of this function running for @blkiolat + * and it's guaranteed to be executed at least once after the latest + * ->enabled_cnt modification. Acting on the latest ->enable_cnt is + * sufficient. + * + * Also, we know @blkiolat is safe to access as ->enable_work is flushed + * in blkcg_iolatency_exit(). + */ + enabled = atomic_read(&blkiolat->enable_cnt); + if (enabled != blkiolat->enabled) { + blk_mq_freeze_queue(blkiolat->rqos.q); + blkiolat->enabled = enabled; + blk_mq_unfreeze_queue(blkiolat->rqos.q); + } +} + int blk_iolatency_init(struct request_queue *q) { struct blk_iolatency *blkiolat; @@ -741,17 +783,15 @@ int blk_iolatency_init(struct request_queue *q) } timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); return 0; } -/* - * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise - * return 0. - */ -static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) { struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; u64 oldval = iolat->min_lat_nsec; iolat->min_lat_nsec = val; @@ -759,13 +799,15 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, BLKIOLATENCY_MAX_WIN_SIZE); - if (!oldval && val) - return 1; + if (!oldval && val) { + if (atomic_inc_return(&blkiolat->enable_cnt) == 1) + schedule_work(&blkiolat->enable_work); + } if (oldval && !val) { blkcg_clear_delay(blkg); - return -1; + if (atomic_dec_return(&blkiolat->enable_cnt) == 0) + schedule_work(&blkiolat->enable_work); } - return 0; } static void iolatency_clear_scaling(struct blkcg_gq *blkg) @@ -797,7 +839,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, u64 lat_val = 0; u64 oldval; int ret; - int enable = 0; ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); if (ret) @@ -832,41 +873,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg = ctx.blkg; oldval = iolat->min_lat_nsec; - enable = iolatency_set_min_lat_nsec(blkg, lat_val); - if (enable) { - if (!blk_get_queue(blkg->q)) { - ret = -ENODEV; - goto out; - } - - blkg_get(blkg); - } - - if (oldval != iolat->min_lat_nsec) { + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) iolatency_clear_scaling(blkg); - } - ret = 0; out: blkg_conf_finish(&ctx); - if (ret == 0 && enable) { - struct iolatency_grp *tmp = blkg_to_lat(blkg); - struct blk_iolatency *blkiolat = tmp->blkiolat; - - blk_mq_freeze_queue(blkg->q); - - if (enable == 1) - atomic_inc(&blkiolat->enabled); - else if (enable == -1) - atomic_dec(&blkiolat->enabled); - else - WARN_ON_ONCE(1); - - blk_mq_unfreeze_queue(blkg->q); - - blkg_put(blkg); - blk_put_queue(blkg->q); - } return ret ?: nbytes; } @@ -1005,14 +1017,8 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct blk_iolatency *blkiolat = iolat->blkiolat; - int ret; - ret = iolatency_set_min_lat_nsec(blkg, 0); - if (ret == 1) - atomic_inc(&blkiolat->enabled); - if (ret == -1) - atomic_dec(&blkiolat->enabled); + iolatency_set_min_lat_nsec(blkg, 0); iolatency_clear_scaling(blkg); } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 68ac23d0b640..2dcd738c6952 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -228,7 +228,6 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { - BUG_ON(tag >= tags->nr_reserved_tags); sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } diff --git a/block/blk-mq.c b/block/blk-mq.c index ae116b755648..30e4bdcd8d7f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -133,7 +133,8 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv, { struct mq_inflight *mi = priv; - if ((!mi->part->bd_partno || rq->part == mi->part) && + if (rq->part && blk_do_io_stat(rq) && + (!mi->part->bd_partno || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; @@ -2174,8 +2175,7 @@ static bool blk_mq_has_sqsched(struct request_queue *q) */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and @@ -2183,8 +2183,8 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) * just causes lock contention inside the scheduler and pointless cache * bouncing. */ - hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, - raw_smp_processor_id()); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx); + if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c007d58d2703..a24d4078fb21 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -105,6 +105,10 @@ typedef u16 blk_short_t; /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) +/* + * BLK_STS_AGAIN should only be returned if RQF_NOWAIT is set + * and the bio would block (cf bio_wouldblock_error()) + */ #define BLK_STS_AGAIN ((__force blk_status_t)12) /* |