diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-22 02:48:55 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-22 02:48:55 +0300 |
commit | 616355cc818c6ddadc393fdfd4491f94458cb715 (patch) | |
tree | a96907b179ccbeb84cd1df5515cd10c90eb5bd59 /block | |
parent | b080cee72ef355669cbc52ff55dc513d37433600 (diff) | |
parent | 8f9e7b65f833cb9a4b2e2f54a049d74df394d906 (diff) | |
download | linux-616355cc818c6ddadc393fdfd4491f94458cb715.tar.xz |
Merge tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- BFQ cleanups and fixes (Yu, Zhang, Yahu, Paolo)
- blk-rq-qos completion fix (Tejun)
- blk-cgroup merge fix (Tejun)
- Add offline error return value to distinguish it from an IO error on
the device (Song)
- IO stats fixes (Zhang, Christoph)
- blkcg refcount fixes (Ming, Yu)
- Fix for indefinite dispatch loop softlockup (Shin'ichiro)
- blk-mq hardware queue management improvements (Ming)
- sbitmap dead code removal (Ming, John)
- Plugging merge improvements (me)
- Show blk-crypto capabilities in sysfs (Eric)
- Multiple delayed queue run improvement (David)
- Block throttling fixes (Ming)
- Start deprecating auto module loading based on dev_t (Christoph)
- bio allocation improvements (Christoph, Chaitanya)
- Get rid of bio_devname (Christoph)
- bio clone improvements (Christoph)
- Block plugging improvements (Christoph)
- Get rid of genhd.h header (Christoph)
- Ensure drivers use appropriate flush helpers (Christoph)
- Refcounting improvements (Christoph)
- Queue initialization and teardown improvements (Ming, Christoph)
- Misc fixes/improvements (Barry, Chaitanya, Colin, Dan, Jiapeng,
Lukas, Nian, Yang, Eric, Chengming)
* tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block: (127 commits)
block: cancel all throttled bios in del_gendisk()
block: let blkcg_gq grab request queue's refcnt
block: avoid use-after-free on throttle data
block: limit request dispatch loop duration
block/bfq-iosched: Fix spelling mistake "tenative" -> "tentative"
sr: simplify the local variable initialization in sr_block_open()
block: don't merge across cgroup boundaries if blkcg is enabled
block: fix rq-qos breakage from skipping rq_qos_done_bio()
block: flush plug based on hardware and software queue order
block: ensure plug merging checks the correct queue at least once
block: move rq_qos_exit() into disk_release()
block: do more work in elevator_exit
block: move blk_exit_queue into disk_release
block: move q_usage_counter release into blk_queue_release
block: don't remove hctx debugfs dir from blk_mq_exit_queue
block: move blkcg initialization/destroy into disk allocation/release handler
sr: implement ->free_disk to simplify refcounting
sd: implement ->free_disk to simplify refcounting
sd: delay calling free_opal_dev
sd: call sd_zbc_release_disk before releasing the scsi_device reference
...
Diffstat (limited to 'block')
47 files changed, 1430 insertions, 659 deletions
diff --git a/block/Kconfig b/block/Kconfig index d5d4197b7ed2..7eb5d6d53b3f 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,16 @@ menuconfig BLOCK if BLOCK +config BLOCK_LEGACY_AUTOLOAD + bool "Legacy autoloading support" + default y + help + Enable loading modules and creating block device instances based on + accesses through their device special file. This is a historic Linux + feature and makes no sense in a udev world where device files are + created on demand, but scripts that manually create device nodes and + then call losetup might rely on this behavior. + config BLK_RQ_ALLOC_TIME bool @@ -218,6 +228,9 @@ config BLK_PM config BLOCK_HOLDER_DEPRECATED bool +config BLK_MQ_STACKING + bool + source "block/Kconfig.iosched" endif # BLOCK diff --git a/block/Makefile b/block/Makefile index f38eaa612929..3950ecbc5c26 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c index 102837a37051..ce8de42a89be 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; - return 0;; + return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) @@ -733,12 +733,15 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct inode *inode; inode = ilookup(blockdev_superblock, dev); - if (!inode) { + if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; + if (inode) + pr_warn_ratelimited( +"block device autoloading is deprecated and will be removed.\n"); } + if (!inode) + return NULL; /* switch from the inode reference to a device mode one: */ bdev = &BDEV_I(inode)->bdev; diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 24a5c5329bcd..420eda2589c0 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; + struct bfq_group *old_parent = bfqq_group(bfqq); /* + * No point to move bfqq to the same group, which can happen when + * root group is offlined + */ + if (old_parent == bfqg) + return; + + /* + * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group + * until elevator exit. + */ + if (bfqq == &bfqd->oom_bfqq) + return; + /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ @@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_and_blkg_put(bfqq_group(bfqq)); + bfqg_and_blkg_put(old_parent); if (entity->parent && entity->parent->last_bfqq_created == bfqq) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 36a66e97e3c2..29d2635a2a63 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { @@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->waker_detection_started = now_ns; bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, MAX_BFQQ_NAME_LENGTH); - bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); + bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; @@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, struct bfq_queue *bfqq, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Check delayed stable merge for rotational or non-queueing * devs. For this branch to be executed, bfqq must not be @@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; @@ -5181,7 +5191,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled; + bool waiting_rq, idle_timer_disabled = false; spin_lock_irq(&bfqd->lock); @@ -5189,14 +5199,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } spin_unlock_irq(&bfqd->lock); - - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, - idle_timer_disabled); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, + idle_timer_disabled); return rq; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 07288b9da389..3b83e3d1c2e5 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -8,7 +8,6 @@ #include <linux/blktrace_api.h> #include <linux/hrtimer.h> -#include <linux/blk-cgroup.h> #include "blk-cgroup-rwstat.h" @@ -1051,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq; for (parent = NULL; entity ; entity = parent) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index b74cc0da118e..f8eb340381cf 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, #ifdef CONFIG_BFQ_GROUP_IOSCHED -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - /* * Returns true if this budget changes may let next_in_service->parent * become the next_in_service entity for its parent entity. @@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) #else /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - return bfqq->bfqd->root_group; -} - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { return false; @@ -519,7 +504,7 @@ unsigned short bfq_ioprio_to_weight(int ioprio) static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0827b19820c5..6996e7bd66e9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -420,7 +420,6 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } -EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { diff --git a/block/bio.c b/block/bio.c index 4312a8085396..33979f306e9e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -15,7 +15,6 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> -#include <linux/blk-cgroup.h> #include <linux/highmem.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> @@ -24,6 +23,7 @@ #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" struct bio_alloc_cache { struct bio *free_list; @@ -249,12 +249,12 @@ static void bio_free(struct bio *bio) * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ -void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs) +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, unsigned int opf) { bio->bi_next = NULL; - bio->bi_bdev = NULL; - bio->bi_opf = 0; + bio->bi_bdev = bdev; + bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; @@ -268,6 +268,8 @@ void bio_init(struct bio *bio, struct bio_vec *table, #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->bi_issue.value = 0; + if (bdev) + bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif @@ -293,6 +295,8 @@ EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset + * @bdev: block device to use the bio for + * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly @@ -300,11 +304,15 @@ EXPORT_SYMBOL(bio_init); * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ -void bio_reset(struct bio *bio) +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_bdev = bdev; + if (bio->bi_bdev) + bio_associate_blkg(bio); + bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); @@ -344,6 +352,20 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, unsigned int opf, gfp_t gfp) +{ + struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); + + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + + return new; +} +EXPORT_SYMBOL_GPL(blk_next_bio); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -400,8 +422,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O + * @bdev: block device to allocate the bio for (can be %NULL) + * @nr_vecs: number of bvecs to pre-allocate + * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -430,15 +454,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + unsigned int opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + /* should not use nobvec bioset for nr_vecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; /* @@ -475,23 +500,23 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, return NULL; bio = p + bs->front_pad; - if (nr_iovecs > BIO_INLINE_VECS) { + if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, nr_iovecs); - } else if (nr_iovecs) { - bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + bio_init(bio, bdev, bvl, nr_vecs, opf); + } else if (nr_vecs) { + bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); } else { - bio_init(bio, NULL, 0); + bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; @@ -522,7 +547,8 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); if (unlikely(!bio)) return NULL; - bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, + 0); bio->bi_pool = NULL; return bio; } @@ -702,80 +728,84 @@ void bio_put(struct bio *bio) } EXPORT_SYMBOL(bio_put); -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - * - * Caller must ensure that @bio_src is not freed before @bio. - */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { - WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio_flagged(bio_src, BIO_REMAPPED)) + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); + + if (bio_crypt_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + return 0; } -EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from + * bio_alloc_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio_src: bio to clone from + * @gfp: allocation priority + * @bs: bio_set to allocate from + * + * Allocate a new bio that is a clone of @bio_src. The caller owns the returned + * bio, but not the actual data it points to. * - * Like __bio_clone_fast, only also allocates the returned bio + * The caller must ensure that the return bio is not freed before @bio_src. */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) { - struct bio *b; + struct bio *bio; - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) return NULL; - __bio_clone_fast(b, bio); - - if (bio_crypt_clone(b, bio, gfp_mask) < 0) - goto err_put; - - if (bio_integrity(bio) && - bio_integrity_clone(b, bio, gfp_mask) < 0) - goto err_put; - - return b; + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); + return NULL; + } + bio->bi_io_vec = bio_src->bi_io_vec; -err_put: - bio_put(b); - return NULL; + return bio; } -EXPORT_SYMBOL(bio_clone_fast); +EXPORT_SYMBOL(bio_alloc_clone); -const char *bio_devname(struct bio *bio, char *buf) +/** + * bio_init_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio: bio to clone into + * @bio_src: bio to clone from + * @gfp: allocation priority + * + * Initialize a new bio in caller provided memory that is a clone of @bio_src. + * The caller owns the returned bio, but not the actual data it points to. + * + * The caller must ensure that @bio_src is not freed before @bio. + */ +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp) { - return bdevname(bio->bi_bdev, buf); + int ret; + + bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); + ret = __bio_clone(bio, bio_src, gfp); + if (ret) + bio_uninit(bio); + return ret; } -EXPORT_SYMBOL(bio_devname); +EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full @@ -1054,7 +1084,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { if (len > UINT_MAX || off > UINT_MAX) - return 0; + return false; return bio_add_page(bio, &folio->page, len, off) > 0; } @@ -1486,8 +1516,7 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); @@ -1541,7 +1570,7 @@ struct bio *bio_split(struct bio *bio, int sectors, if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return NULL; - split = bio_clone_fast(bio, gfp, bs); + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return NULL; @@ -1636,9 +1665,9 @@ EXPORT_SYMBOL(bioset_exit); * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated - * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). - * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to - * dispatch queued requests when the mempool runs out of space. + * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used + * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, @@ -1708,7 +1737,9 @@ EXPORT_SYMBOL(bioset_init_from_src); /** * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb * @kiocb: kiocb describing the IO + * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of iovecs to pre-allocate + * @opf: operation and flags for bio * @bs: bio_set to allocate from * * Description: @@ -1719,14 +1750,14 @@ EXPORT_SYMBOL(bioset_init_from_src); * MUST be done from process context, not hard/soft IRQ. * */ -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, - struct bio_set *bs) +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); if (cache->free_list) { @@ -1734,13 +1765,14 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); + bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, + nr_vecs, opf); bio->bi_pool = bs; bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; } put_cpu(); - bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); + bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); bio_set_flag(bio, BIO_PERCPU_CACHE); return bio; } diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index ee746919c41f..9f2723b34b75 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -6,7 +6,7 @@ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H -#include <linux/blk-cgroup.h> +#include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 650f7e27989f..d53b0d69dd73 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,15 +23,14 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> -#include <linux/blk-cgroup.h> #include <linux/tracehook.h> #include <linux/psi.h> #include <linux/part_stat.h> #include "blk.h" +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" @@ -83,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->q) + blk_put_queue(blkg->q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); @@ -168,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; + if (!blk_get_queue(q)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); @@ -857,11 +861,11 @@ static void blkcg_fill_root_iostats(void) blk_queue_root_blkg(bdev_get_queue(bdev)); struct blkg_iostat tmp; int cpu; + unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - unsigned long flags; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += @@ -877,11 +881,11 @@ static void blkcg_fill_root_iostats(void) cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - - flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); - blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } } @@ -1176,6 +1180,8 @@ int blkcg_init_queue(struct request_queue *q) bool preloaded; int ret; + INIT_LIST_HEAD(&q->blkg_list); + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 000000000000..47e1e38390c9 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,494 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BLK_CGROUP_PRIVATE_H +#define _BLK_CGROUP_PRIVATE_H +/* + * block cgroup private header + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> + * + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> + * Paolo Valente <paolo.valente@unimore.it> + * + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> + * Nauman Rafique <nauman@google.com> + */ + +#include <linux/blk-cgroup.h> +#include <linux/blk-mq.h> + +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + +#ifdef CONFIG_BLK_CGROUP + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; +}; + +/* + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. + */ +struct blkcg_policy_data { + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; + int plid; +}; + +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, + struct request_queue *q, struct blkcg *blkcg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + +struct blkcg_policy { + int plid; + /* cgroup files for the policy */ + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; + + /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; + blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_free_pd_fn *pd_free_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; +}; + +extern struct blkcg blkcg_root; +extern bool blkcg_debug_stats; + +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); +int blkcg_init_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +const char *blkg_dev_name(struct blkcg_gq *blkg); +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + +struct blkg_conf_ctx { + struct block_device *bdev; + struct blkcg_gq *blkg; + char *body; +}; + +struct block_device *blkcg_conf_open_bdev(char **inputp); +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + +/** + * __bio_blkcg - internal, inconsistent version to get blkcg + * + * DO NOT USE. + * This function is inconsistent and consequently is dangerous to use. The + * first part of the function returns a blkcg where a reference is owned by the + * bio. This means it does not need to be rcu protected as it cannot go away + * with the bio owning a reference to it. However, the latter potentially gets + * it from task_css(). This can race against task migration and the cgroup + * dying. It is also semantically different as it must be called rcu protected + * and is susceptible to failure when trying to get a reference to it. + * Therefore, it is not ok to assume that *_get() will always succeed on the + * blkcg returned here. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return css_to_blkcg(blkcg_css()); +} + +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; +} + +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __blkg_lookup(blkcg, q, false); +} + +/** + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). + */ +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ + return q->root_blkg; +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, + struct blkcg_policy *pol) +{ + return blkcg ? blkcg->cpd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + percpu_ref_get(&blkg->refcnt); +} + +/** + * blkg_tryget - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline bool blkg_tryget(struct blkcg_gq *blkg) +{ + return blkg && percpu_ref_tryget(&blkg->refcnt); +} + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + percpu_ref_put(&blkg->refcnt); +} + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +bool __blkcg_punt_bio_submit(struct bio *bio); + +static inline bool blkcg_punt_bio_submit(struct bio *bio) +{ + if (bio->bi_opf & REQ_CGROUP_PUNT) + return __blkcg_punt_bio_submit(bio); + else + return false; +} + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (WARN_ON_ONCE(old < 0)) + return 0; + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person clearing the congestion count for this blkg. */ + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); +} + +/** + * blk_cgroup_mergeable - Determine whether to allow or disallow merges + * @rq: request to merge into + * @bio: bio to merge + * + * @bio and @rq should belong to the same cgroup and their issue_as_root should + * match. The latter is necessary as we don't want to throttle e.g. a metadata + * update because it happens to be next to a regular IO. + */ +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) +{ + return rq->bio->bi_blkg == bio->bi_blkg && + bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); +} + +void blk_cgroup_bio_start(struct bio *bio); +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +#else /* CONFIG_BLK_CGROUP */ + +struct blkg_policy_data { +}; + +struct blkcg_policy_data { +}; + +struct blkcg_policy { +}; + +#ifdef CONFIG_BLOCK + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } +static inline void blkcg_bio_issue_init(struct bio *bio) { } +static inline void blk_cgroup_bio_start(struct bio *bio) { } +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 779b4a1f66ac..9c14deab3af4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,7 +34,6 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> -#include <linux/blk-cgroup.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> @@ -49,6 +48,7 @@ #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" +#include "blk-cgroup.h" #include "blk-throttle.h" #include "blk-rq-qos.h" @@ -165,6 +165,7 @@ static const struct { [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, @@ -339,8 +340,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); - percpu_ref_exit(&q->q_usage_counter); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } @@ -473,9 +472,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); -#ifdef CONFIG_BLK_CGROUP - INIT_LIST_HEAD(&q->blkg_list); -#endif kobject_init(&q->kobj, &blk_queue_ktype); @@ -496,17 +492,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_stats; - if (blkcg_init_queue(q)) - goto fail_ref; - blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; -fail_ref: - percpu_ref_exit(&q->q_usage_counter); fail_stats: blk_free_queue_stats(q->stats); fail_split: @@ -540,17 +531,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static void handle_bad_sector(struct bio *bio, sector_t maxsector) -{ - char b[BDEVNAME_SIZE]; - - pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%s: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio_devname(bio, b), bio->bi_opf, - bio_end_sector(bio), maxsector); -} - #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); @@ -580,14 +560,10 @@ late_initcall(fail_make_request_debugfs); static inline bool bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { - char b[BDEVNAME_SIZE]; - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; - - WARN_ONCE(1, - "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), bio->bi_bdev->bd_partno); + pr_warn("Trying to write to read-only block-device %pg\n", + bio->bi_bdev); /* Older lvm-tools actually trigger this */ return false; } @@ -616,7 +592,11 @@ static inline int bio_check_eod(struct bio *bio) if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { - handle_bad_sector(bio, maxsector); + pr_info_ratelimited("%s: attempt to access beyond end of device\n" + "%pg: rw=%d, want=%llu, limit=%llu\n", + current->comm, + bio->bi_bdev, bio->bi_opf, + bio_end_sector(bio), maxsector); return -EIO; } return 0; @@ -676,7 +656,123 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -noinline_for_stack bool submit_bio_checks(struct bio *bio) +static void __submit_bio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + + if (!disk->fops->submit_bio) { + blk_mq_submit_bio(bio); + } else if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } +} + +/* + * The loop in this function may be a bit non-obvious, and so deserves some + * explanation: + * + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure + * that), so we have a list with a single bio. + * - We pretend that we have just taken it off a longer list, so we assign + * bio_list to a pointer to the bio_list_on_stack, thus initialising the + * bio_list of new bios to be added. ->submit_bio() may indeed add some more + * bios through a recursive call to submit_bio_noacct. If it did, we find a + * non-NULL value in bio_list and re-enter the loop from the top. + * - In this case we really did just take the bio of the top of the list (no + * pretending) and so remove it from bio_list, and call into ->submit_bio() + * again. + * + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the current + * ->submit_bio_bio, but that haven't been processed yet. + */ +static void __submit_bio_noacct(struct bio *bio) +{ + struct bio_list bio_list_on_stack[2]; + + BUG_ON(bio->bi_next); + + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; + + do { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct bio_list lower, same; + + /* + * Create a fresh bio_list for all subordinate requests. + */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); + + __submit_bio(bio); + + /* + * Sort new bios into those for a lower level and those for the + * same level. + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bdev_get_queue(bio->bi_bdev)) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + + /* + * Now assemble so we handle the lowest level first. + */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + + current->bio_list = NULL; +} + +static void __submit_bio_noacct_mq(struct bio *bio) +{ + struct bio_list bio_list[2] = { }; + + current->bio_list = bio_list; + + do { + __submit_bio(bio); + } while ((bio = bio_list_pop(&bio_list[0]))); + + current->bio_list = NULL; +} + +void submit_bio_noacct_nocheck(struct bio *bio) +{ + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. + */ + if (current->bio_list) + bio_list_add(¤t->bio_list[0], bio); + else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); +} + +/** + * submit_bio_noacct - re-submit a bio to the block device layer for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. + */ +void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); @@ -761,7 +857,7 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) } if (blk_throtl_bio(bio)) - return false; + return; blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); @@ -773,138 +869,14 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } - return true; + submit_bio_noacct_nocheck(bio); + return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); - return false; -} - -static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) -{ - if (blk_crypto_bio_prep(&bio)) { - if (likely(bio_queue_enter(bio) == 0)) { - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); - } - } -} - -static void __submit_bio(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - - if (unlikely(!submit_bio_checks(bio))) - return; - - if (!disk->fops->submit_bio) - blk_mq_submit_bio(bio); - else - __submit_bio_fops(disk, bio); -} - -/* - * The loop in this function may be a bit non-obvious, and so deserves some - * explanation: - * - * - Before entering the loop, bio->bi_next is NULL (as all callers ensure - * that), so we have a list with a single bio. - * - We pretend that we have just taken it off a longer list, so we assign - * bio_list to a pointer to the bio_list_on_stack, thus initialising the - * bio_list of new bios to be added. ->submit_bio() may indeed add some more - * bios through a recursive call to submit_bio_noacct. If it did, we find a - * non-NULL value in bio_list and re-enter the loop from the top. - * - In this case we really did just take the bio of the top of the list (no - * pretending) and so remove it from bio_list, and call into ->submit_bio() - * again. - * - * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. - * bio_list_on_stack[1] contains bios that were submitted before the current - * ->submit_bio_bio, but that haven't been processed yet. - */ -static void __submit_bio_noacct(struct bio *bio) -{ - struct bio_list bio_list_on_stack[2]; - - BUG_ON(bio->bi_next); - - bio_list_init(&bio_list_on_stack[0]); - current->bio_list = bio_list_on_stack; - - do { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct bio_list lower, same; - - /* - * Create a fresh bio_list for all subordinate requests. - */ - bio_list_on_stack[1] = bio_list_on_stack[0]; - bio_list_init(&bio_list_on_stack[0]); - - __submit_bio(bio); - - /* - * Sort new bios into those for a lower level and those for the - * same level. - */ - bio_list_init(&lower); - bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bdev_get_queue(bio->bi_bdev)) - bio_list_add(&same, bio); - else - bio_list_add(&lower, bio); - - /* - * Now assemble so we handle the lowest level first. - */ - bio_list_merge(&bio_list_on_stack[0], &lower); - bio_list_merge(&bio_list_on_stack[0], &same); - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); - } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); - - current->bio_list = NULL; -} - -static void __submit_bio_noacct_mq(struct bio *bio) -{ - struct bio_list bio_list[2] = { }; - - current->bio_list = bio_list; - - do { - __submit_bio(bio); - } while ((bio = bio_list_pop(&bio_list[0]))); - - current->bio_list = NULL; -} - -/** - * submit_bio_noacct - re-submit a bio to the block device layer for I/O - * @bio: The bio describing the location in memory and on the device. - * - * This is a version of submit_bio() that shall only be used for I/O that is - * resubmitted to lower level drivers by stacking block drivers. All file - * systems and other upper level users of the block layer should use - * submit_bio() instead. - */ -void submit_bio_noacct(struct bio *bio) -{ - /* - * We only want one ->submit_bio to be active at a time, else stack - * usage with stacked devices could be a problem. Use current->bio_list - * to collect a list of requests submited by a ->submit_bio method while - * it is active, and then process them after it returned. - */ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bio->bi_bdev->bd_disk->fops->submit_bio) - __submit_bio_noacct_mq(bio); - else - __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); @@ -989,8 +961,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; @@ -1272,7 +1243,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule) +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); @@ -1301,7 +1272,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { - blk_flush_plug(plug, false); + __blk_flush_plug(plug, false); current->plug = NULL; } } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c87aba8584c6..18c8eafe20b9 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) "blk-crypto-fallback: " fmt #include <crypto/skcipher.h> -#include <linux/blk-cgroup.h> #include <linux/blk-crypto.h> #include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> @@ -20,6 +19,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> +#include "blk-cgroup.h" #include "blk-crypto-internal.h" static unsigned int num_prealloc_bounce_pg = 32; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 2fb0d65a464c..e6818ffaddbf 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,6 +11,7 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ @@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION +int blk_crypto_sysfs_register(struct request_queue *q); + +void blk_crypto_sysfs_unregister(struct request_queue *q); + void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) #else /* CONFIG_BLK_INLINE_ENCRYPTION */ +static inline int blk_crypto_sysfs_register(struct request_queue *q) +{ + return 0; +} + +static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } + static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c new file mode 100644 index 000000000000..fd93bd2f33b7 --- /dev/null +++ b/block/blk-crypto-sysfs.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + * + * sysfs support for blk-crypto. This file contains the code which exports the + * crypto capabilities of devices via /sys/block/$disk/queue/crypto/. + */ + +#include <linux/blk-crypto-profile.h> + +#include "blk-crypto-internal.h" + +struct blk_crypto_kobj { + struct kobject kobj; + struct blk_crypto_profile *profile; +}; + +struct blk_crypto_attr { + struct attribute attr; + ssize_t (*show)(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page); +}; + +static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) +{ + return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; +} + +static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +{ + return container_of(attr, struct blk_crypto_attr, attr); +} + +static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); +} + +static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", profile->num_slots); +} + +#define BLK_CRYPTO_RO_ATTR(_name) \ + static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + +BLK_CRYPTO_RO_ATTR(max_dun_bits); +BLK_CRYPTO_RO_ATTR(num_keyslots); + +static struct attribute *blk_crypto_attrs[] = { + &max_dun_bits_attr.attr, + &num_keyslots_attr.attr, + NULL, +}; + +static const struct attribute_group blk_crypto_attr_group = { + .attrs = blk_crypto_attrs, +}; + +/* + * The encryption mode attributes. To avoid hard-coding the list of encryption + * modes, these are initialized at boot time by blk_crypto_sysfs_init(). + */ +static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; +static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; + +static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + int mode_num = a - __blk_crypto_mode_attrs; + + if (profile->modes_supported[mode_num]) + return 0444; + return 0; +} + +static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + int mode_num = attr - __blk_crypto_mode_attrs; + + return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); +} + +static const struct attribute_group blk_crypto_modes_attr_group = { + .name = "modes", + .attrs = blk_crypto_mode_attrs, + .is_visible = blk_crypto_mode_is_visible, +}; + +static const struct attribute_group *blk_crypto_attr_groups[] = { + &blk_crypto_attr_group, + &blk_crypto_modes_attr_group, + NULL, +}; + +static ssize_t blk_crypto_attr_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + return a->show(profile, a, page); +} + +static const struct sysfs_ops blk_crypto_attr_ops = { + .show = blk_crypto_attr_show, +}; + +static void blk_crypto_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); +} + +static struct kobj_type blk_crypto_ktype = { + .default_groups = blk_crypto_attr_groups, + .sysfs_ops = &blk_crypto_attr_ops, + .release = blk_crypto_release, +}; + +/* + * If the request_queue has a blk_crypto_profile, create the "crypto" + * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). + */ +int blk_crypto_sysfs_register(struct request_queue *q) +{ + struct blk_crypto_kobj *obj; + int err; + + if (!q->crypto_profile) + return 0; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->profile = q->crypto_profile; + + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, + "crypto"); + if (err) { + kobject_put(&obj->kobj); + return err; + } + q->crypto_kobject = &obj->kobj; + return 0; +} + +void blk_crypto_sysfs_unregister(struct request_queue *q) +{ + kobject_put(q->crypto_kobject); +} + +static int __init blk_crypto_sysfs_init(void) +{ + int i; + + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) { + struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i]; + + attr->attr.name = blk_crypto_modes[i].name; + attr->attr.mode = 0444; + attr->show = blk_crypto_mode_show; + blk_crypto_mode_attrs[i - 1] = &attr->attr; + } + return 0; +} +subsys_initcall(blk_crypto_sysfs_init); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index ec9efeeeca91..a496aaef85ba 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -19,16 +19,19 @@ const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, @@ -111,7 +114,6 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } -EXPORT_SYMBOL_GPL(__bio_crypt_clone); /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], diff --git a/block/blk-flush.c b/block/blk-flush.c index e4df894189ce..c68968724870 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -460,9 +460,7 @@ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; - bio_init(&bio, NULL, 0); - bio_set_dev(&bio, bdev); - bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 769b64394298..70a0a3d680a3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -178,12 +178,12 @@ #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> -#include <linux/blk-cgroup.h> #include <asm/local.h> #include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 6593c7123b97..2f33932e72e3 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -74,9 +74,9 @@ #include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/blk-mq.h> -#include <linux/blk-cgroup.h> #include "blk-rq-qos.h" #include "blk-stat.h" +#include "blk-cgroup.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U @@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 2e7f10e1c03f..79e797f5d194 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -12,11 +12,11 @@ * Documentation/admin-guide/cgroup-v2.rst. */ -#include <linux/blk-cgroup.h> #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" diff --git a/block/blk-lib.c b/block/blk-lib.c index 9f09beadcbe3..fc6ea52e7482 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,19 +10,6 @@ #include "blk.h" -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) -{ - struct bio *new = bio_alloc(gfp, nr_pages); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); - } - - return new; -} -EXPORT_SYMBOL_GPL(blk_next_bio); - int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) @@ -32,9 +19,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int op; sector_t bs_mask, part_offset = 0; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; @@ -95,11 +79,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, WARN_ON_ONCE((req_sects << 9) > UINT_MAX); - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, op, 0); - bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; nr_sects -= req_sects; @@ -172,9 +153,6 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, struct bio *bio = *biop; sector_t bs_mask; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; @@ -189,14 +167,12 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, max_write_same_sectors = bio_allowed_max_sectors(q); while (nr_sects) { - bio = blk_next_bio(bio, 1, gfp_mask); + bio = blk_next_bio(bio, bdev, 1, REQ_OP_WRITE_SAME, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); if (nr_sects > max_write_same_sectors) { bio->bi_iter.bi_size = max_write_same_sectors << 9; @@ -250,10 +226,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, { struct bio *bio = *biop; unsigned int max_write_zeroes_sectors; - struct request_queue *q = bdev_get_queue(bdev); - - if (!q) - return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; @@ -265,10 +237,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects) { - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -304,23 +274,17 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; int bi_size = 0; unsigned int sz; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; while (nr_sects != 0) { - bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); + bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), + REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE, nr_sects << 9); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4de34a332c9f..ea6968313b4a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,6 +9,7 @@ #include <linux/blk-integrity.h> #include <linux/scatterlist.h> #include <linux/part_stat.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> @@ -368,8 +369,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; - - blk_throtl_charge_bio_split(*bio); } } @@ -600,6 +599,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { + if (!blk_cgroup_mergeable(req, bio)) + goto no_merge; + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; @@ -696,6 +698,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; + if (!blk_cgroup_mergeable(req, next->bio)) + return 0; + if (blk_integrity_merge_rq(q, req, next) == false) return 0; @@ -904,6 +909,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) + return false; + /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; @@ -1089,12 +1098,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (!plug || rq_list_empty(plug->mq_list)) return false; - /* check the previously added entry for a quick merge attempt */ - rq = rq_list_peek(&plug->mq_list); - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q == q) { + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; + } + + /* + * Only keep iterating plug list for merges if we have multiple + * queues + */ + if (!plug->multiple_queues) + break; } return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3a790eb4995c..e2880f6deb34 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -707,7 +707,7 @@ static void debugfs_create_files(struct dentry *parent, void *data, void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); @@ -780,7 +780,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); @@ -789,7 +789,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a68aa6041a10..69918f4170d6 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -6,6 +6,8 @@ #include <linux/seq_file.h> +struct blk_mq_hw_ctx; + struct blk_mq_debugfs_attr { const char *name; umode_t mode; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55488ba97823..9e56a69422b6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { + unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); - } while (ret == 1); + if (ret != 1) + break; + if (need_resched() || time_is_before_jiffies(end)) { + blk_mq_delay_run_hw_queue(hctx, 0); + break; + } + } while (1); return ret; } @@ -515,7 +522,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { @@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { - unsigned int i, flags = q->tag_set->flags; + unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; + unsigned long i; int ret; if (!e) { @@ -618,7 +626,7 @@ err_free_map_and_rqs: void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, @@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q) void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 674786574075..c08426975856 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; lockdep_assert_held(&q->sysfs_dir_lock); @@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q) int __blk_mq_register_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int ret, i; + unsigned long i, j; + int ret; WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); @@ -278,8 +279,10 @@ out: return ret; unreg: - while (--i >= 0) - blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + queue_for_each_hw_ctx(q, hctx, j) { + if (j < i) + blk_mq_unregister_hctx(hctx); + } kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -290,7 +293,7 @@ unreg: void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) @@ -306,7 +309,8 @@ unlock: int blk_mq_sysfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 845f74e8dd7b..68ac23d0b640 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -107,7 +107,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, return BLK_MQ_NO_TAG; if (data->shallow_depth) - return __sbitmap_queue_get_shallow(bt, data->shallow_depth); + return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } @@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ @@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 9a9185a0a2d1..8e659dc5fcf3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; + return xa_load(&q->hctx_table, + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); } static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, @@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. */ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -793,8 +794,10 @@ bool blk_update_request(struct request *req, blk_status_t error, #endif if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) + !(req->rq_flags & RQF_QUIET))) { blk_print_req_error(req, error); + trace_block_rq_error(req, error, nr_bytes); + } blk_account_io_completion(req, nr_bytes); @@ -885,10 +888,15 @@ static inline void blk_account_io_done(struct request *req, u64 now) static void __blk_account_io_start(struct request *rq) { - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (rq->bio) rq->part = rq->bio->bi_bdev; - else if (rq->q->disk) + else rq->part = rq->q->disk->part0; part_stat_lock(); @@ -1444,7 +1452,7 @@ static void blk_mq_timeout_work(struct work_struct *work) container_of(work, struct request_queue, timeout_work); unsigned long next = 0; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -2145,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2173,7 +2181,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2182,6 +2190,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) if (blk_mq_hctx_stopped(hctx)) continue; /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; + /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. @@ -2203,7 +2219,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); bool blk_mq_queue_stopped(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hctx_stopped(hctx)) @@ -2242,7 +2258,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -2260,7 +2276,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -2280,7 +2296,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async); @@ -2561,13 +2577,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q, q->mq_ops->queue_rqs(&plug->mq_list); } +static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +{ + struct blk_mq_hw_ctx *this_hctx = NULL; + struct blk_mq_ctx *this_ctx = NULL; + struct request *requeue_list = NULL; + unsigned int depth = 0; + LIST_HEAD(list); + + do { + struct request *rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + rq_list_add(&requeue_list, rq); + continue; + } + list_add_tail(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + plug->mq_list = requeue_list; + trace_block_unplug(this_hctx->queue, depth, !from_sched); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; struct request *rq; - unsigned int depth; - LIST_HEAD(list); if (rq_list_empty(plug->mq_list)) return; @@ -2603,35 +2642,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; } - this_hctx = NULL; - this_ctx = NULL; - depth = 0; do { - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; + blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2804,9 +2817,6 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); @@ -2853,27 +2863,16 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_try_issue_directly(rq->mq_hctx, rq)); } +#ifdef CONFIG_BLK_MQ_STACKING /** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @rq: the request being queued */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) +blk_status_t blk_insert_cloned_request(struct request *rq) { + struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2905,24 +2904,7 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, return BLK_STS_IOERR; } - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - - if (rq->q->disk && - should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -2935,7 +2917,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_run_dispatch_ops(rq->q, + blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, ktime_get_ns()); @@ -2990,10 +2972,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); + bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, + bs); if (!bio) goto free_and_out; - bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; @@ -3030,6 +3012,7 @@ free_and_out: return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); +#endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. @@ -3100,6 +3083,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page; + if (list_empty(&tags->page_list)) + return; + if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else @@ -3142,15 +3128,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3199,10 +3211,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3447,6 +3458,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3456,12 +3469,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3486,8 +3498,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -3647,7 +3666,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -3754,7 +3774,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { @@ -3854,7 +3874,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -3865,7 +3885,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -3954,52 +3974,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; - struct blk_mq_hw_ctx *hctx; + int old_node; + int node = blk_mq_get_hctx_node(set, i); + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. If allocation fails, fallback - * to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -4008,24 +4004,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4050,6 +4049,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4060,9 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); @@ -4081,7 +4080,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); err_poll: @@ -4369,7 +4368,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -4528,6 +4528,7 @@ fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4744,7 +4745,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q) { if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; cancel_delayed_work_sync(&q->requeue_work); diff --git a/block/blk-mq.h b/block/blk-mq.h index 948791ea2a3e..2615bd58bad3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 3cfbc8668cba..68267007da1c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ if (q->rq_qos) { - bio_set_flag(bio, BIO_TRACKED); + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } @@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9f32882ceb2f..85c4ba006671 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -10,7 +10,6 @@ #include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> -#include <linux/blk-cgroup.h> #include <linux/debugfs.h> #include "blk.h" @@ -18,6 +17,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { @@ -739,27 +739,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } -/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ -static void blk_exit_queue(struct request_queue *q) -{ - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q); - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); -} - /** * blk_release_queue - releases all allocated resources of the request_queue * @kobj: pointer to a kobject, whose container is a request_queue @@ -787,12 +766,12 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); + percpu_ref_exit(&q->q_usage_counter); + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - blk_exit_queue(q); - blk_free_queue_stats(q->stats); kfree(q->poll_stat); @@ -880,6 +859,10 @@ int blk_register_queue(struct gendisk *disk) goto put_dev; } + ret = blk_crypto_sysfs_register(q); + if (ret) + goto put_dev; + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); blk_throtl_register_queue(q); @@ -910,6 +893,7 @@ unlock: return ret; put_dev: + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); @@ -954,16 +938,18 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); + blk_crypto_sysfs_unregister(q); blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->elevator) - elv_unregister_queue(q); + elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); + + /* Now that we've deleted all child objects, we can delete the queue. */ + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c462c006b26..469c483719be 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,7 +10,6 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" @@ -42,11 +41,6 @@ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ -}; - #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) /* We measure latency for request size from <= 4k to >= 1M */ @@ -426,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; + int has_iops_limit = 0; + + for (rw = READ; rw <= WRITE; rw++) { + unsigned int iops_limit = tg_iops_limit(tg, rw); - for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || (td->limit_valid[td->limit_index] && (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + iops_limit != UINT_MAX)); + + if (iops_limit != UINT_MAX) + has_iops_limit = 1; + } + + if (has_iops_limit) + tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; + else + tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; } static void throtl_pd_online(struct blkg_policy_data *pd) @@ -634,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - atomic_set(&tg->io_split_cnt[rw], 0); - /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -659,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - atomic_set(&tg->io_split_cnt[rw], 0); - throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -808,7 +810,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - if (bps_limit == U64_MAX) { + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { if (wait) *wait = 0; return true; @@ -871,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { + if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || + tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; @@ -893,9 +897,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (iops_limit != UINT_MAX) - tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); - if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -920,9 +921,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio_size; + if (!bio_flagged(bio, BIO_THROTTLED)) { + tg->bytes_disp[rw] += bio_size; + tg->last_bytes_disp[rw] += bio_size; + } + tg->io_disp[rw]++; - tg->last_bytes_disp[rw] += bio_size; tg->last_io_disp[rw]++; /* @@ -1134,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t) struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); - struct request_queue *q = td->queue; struct throtl_service_queue *parent_sq; + struct request_queue *q; bool dispatched; int ret; + /* throtl_data may be gone, so figure out request queue by blkg */ + if (tg) + q = tg->pd.blkg->q; + else + q = td->queue; + spin_lock_irq(&q->queue_lock); + + if (!q->root_blkg) + goto out_unlock; + if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1219,7 +1233,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct(bio); + submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } @@ -1763,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } +void blk_throtl_cancel_bios(struct request_queue *q) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { @@ -1917,14 +1964,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { - tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { - tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2043,25 +2088,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -void blk_throtl_charge_bio_split(struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - struct throtl_grp *parent = blkg_to_tg(blkg); - struct throtl_service_queue *parent_sq; - bool rw = bio_data_dir(bio); - - do { - if (!parent->has_rules[rw]) - break; - - atomic_inc(&parent->io_split_cnt[rw]); - atomic_inc(&parent->last_io_split_cnt[rw]); - - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - } while (parent); -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 175f03abd9e4..c1b602996127 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -52,6 +52,13 @@ struct throtl_service_queue { struct timer_list pending_timer; /* fires on first_pending_disptime */ }; +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ +}; + enum { LIMIT_LOW, LIMIT_MAX, @@ -132,9 +139,6 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; - atomic_t io_split_cnt[2]; - atomic_t last_io_split_cnt[2]; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -158,20 +162,23 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } -static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } +static inline void blk_throtl_cancel_bios(struct request_queue *q) { } #else /* CONFIG_BLK_DEV_THROTTLING */ int blk_throtl_init(struct request_queue *q); void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); -void blk_throtl_charge_bio_split(struct bio *bio); bool __blk_throtl_bio(struct bio *bio); +void blk_throtl_cancel_bios(struct request_queue *q); static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); - if (bio_flagged(bio, BIO_THROTTLED)) + /* no need to throttle bps any more if the bio has been throttled */ + if (bio_flagged(bio, BIO_THROTTLED) && + !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) return false; + if (!tg->has_rules[bio_data_dir(bio)]) return false; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 774ecc598bee..602bef54c813 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -215,9 +215,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, continue; } - bio = blk_next_bio(bio, 0, gfp_mask); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC; + bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, + gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -239,10 +238,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) { struct bio bio; - bio_init(&bio, NULL, 0); - bio_set_dev(&bio, bdev); - bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; - + bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); return submit_bio_wait(&bio); } @@ -306,9 +302,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, } while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio_set_dev(bio, bdev); - bio->bi_opf = op | REQ_SYNC; + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; diff --git a/block/blk.h b/block/blk.h index 8bd43b3ad33d..6f21859c7f0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -46,7 +46,7 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -bool submit_bio_checks(struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { @@ -325,7 +325,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; + return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } void update_io_ticks(struct block_device *part, unsigned long now, bool end); @@ -406,8 +406,6 @@ extern int blk_iolatency_init(struct request_queue *q); static inline int blk_iolatency_init(struct request_queue *q) { return 0; } #endif -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); - #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); void blk_queue_clear_zone_settings(struct request_queue *q); @@ -426,6 +424,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); +void blk_drop_partitions(struct gendisk *disk); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, @@ -445,6 +444,9 @@ int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); +void disk_block_events(struct gendisk *disk); +void disk_unblock_events(struct gendisk *disk); +void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; diff --git a/block/bounce.c b/block/bounce.c index 7af1a72835b9..3d50d19cde72 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -14,7 +14,6 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> -#include <linux/blk-cgroup.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> @@ -24,6 +23,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 @@ -162,15 +162,12 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * that does not own the bio - reason being drivers don't use it for * iterating over the biovec anymore, so expecting it to be kept up * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. + * asking for trouble and would force extra work. */ - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), - &bounce_bio_set); - bio->bi_bdev = bio_src->bi_bdev; + bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), + bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; diff --git a/block/disk-events.c b/block/disk-events.c index 8d5496e7592a..aee25a7e1ab7 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -4,7 +4,7 @@ */ #include <linux/export.h> #include <linux/moduleparam.h> -#include <linux/genhd.h> +#include <linux/blkdev.h> #include "blk.h" struct disk_events { diff --git a/block/elevator.c b/block/elevator.c index 482df2a350fc..c319765892bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,7 +35,6 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> -#include <linux/blk-cgroup.h> #include <trace/events/block.h> @@ -44,6 +43,7 @@ #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -516,9 +519,11 @@ int elv_register_queue(struct request_queue *q, bool uevent) void elv_unregister_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->sysfs_lock); - if (q) { + if (e && e->registered) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -591,11 +596,7 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) - elv_unregister_queue(q); - - ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); + elv_unregister_queue(q); elevator_exit(q); } @@ -606,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { - blk_mq_sched_free_rqs(q); elevator_exit(q); goto out; } diff --git a/block/fops.c b/block/fops.c index a18e7fbd97b8..7ccc4ff109ce 100644 --- a/block/fops.c +++ b/block/fops.c @@ -75,8 +75,13 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, return -ENOMEM; } - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); + if (iov_iter_rw(iter) == READ) { + bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); + } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -88,14 +93,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, goto out; ret = bio.bi_iter.bi_size; - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); + if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); - } + if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) @@ -190,6 +190,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; + unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; @@ -197,7 +198,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); @@ -223,7 +224,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); for (;;) { - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; @@ -238,11 +238,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (is_read) { - bio->bi_opf = REQ_OP_READ; if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { - bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_NOWAIT) @@ -258,7 +256,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } atomic_inc(&dio->ref); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); @@ -313,6 +311,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, unsigned int nr_pages) { struct block_device *bdev = iocb->ki_filp->private_data; + bool is_read = iov_iter_rw(iter) == READ; + unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; @@ -322,11 +322,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_end_io = blkdev_bio_end_io_async; @@ -349,14 +348,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, } dio->size = bio->bi_iter.bi_size; - if (iov_iter_rw(iter) == READ) { - bio->bi_opf = REQ_OP_READ; + if (is_read) { if (iter_is_iovec(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { - bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } diff --git a/block/genhd.c b/block/genhd.c index 9eca1f7d35c9..37eb41ee4086 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,7 +8,6 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/fs.h> -#include <linux/genhd.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> @@ -26,10 +25,12 @@ #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> +#include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" static struct kobject *block_depr; @@ -185,7 +186,9 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); +#endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); @@ -275,7 +278,9 @@ int __register_blkdev(unsigned int major, const char *name, } p->major = major; +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; +#endif strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -523,6 +528,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk_update_readahead(disk); disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: @@ -636,7 +642,8 @@ void del_gendisk(struct gendisk *disk) blk_mq_freeze_queue_wait(q); - rq_qos_exit(q); + blk_throtl_cancel_bios(disk->queue); + blk_sync_queue(q); blk_flush_integrity(); /* @@ -693,6 +700,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); @@ -712,6 +720,7 @@ void blk_request_module(dev_t devt) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); } +#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ /* * print a full list of all partitions - intended for places where the root @@ -927,12 +936,17 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) inflight = blk_mq_in_flight(q, bdev); else inflight = part_in_flight(bdev); + if (inflight) { + part_stat_lock(); + update_io_ticks(bdev, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(bdev, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1100,6 +1114,31 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; +static void disk_release_mq(struct request_queue *q) +{ + blk_mq_cancel_work_sync(q); + + /* + * There can't be any non non-passthrough bios in flight here, but + * requests stay around longer, including passthrough ones so we + * still need to freeze the queue here. + */ + blk_mq_freeze_queue(q); + + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + mutex_lock(&q->sysfs_lock); + elevator_exit(q); + mutex_unlock(&q->sysfs_lock); + } + rq_qos_exit(q); + __blk_mq_unfreeze_queue(q, true); +} + /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1121,13 +1160,21 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); - blk_mq_cancel_work_sync(disk->queue); + if (queue_is_mq(disk->queue)) + disk_release_mq(disk->queue); + + blkcg_exit_queue(disk->queue); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); + + if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) + disk->fops->free_disk(disk); + iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1188,12 +1235,17 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else inflight = part_in_flight(hd); + if (inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(hd, &stat); seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1322,6 +1374,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; + if (blkcg_init_queue(q)) + goto out_erase_part0; + rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1334,6 +1389,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #endif return disk; +out_erase_part0: + xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; diff --git a/block/holder.c b/block/holder.c index 27cddce1b446..8d750281a1cd 100644 --- a/block/holder.c +++ b/block/holder.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/genhd.h> +#include <linux/blkdev.h> #include <linux/slab.h> struct bd_holder_disk { diff --git a/block/partitions/check.h b/block/partitions/check.h index d5b28e309d64..4ffa2359b1a3 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pagemap.h> #include <linux/blkdev.h> -#include <linux/genhd.h> #include "../blk.h" /* diff --git a/block/partitions/core.c b/block/partitions/core.c index c2a1635922b1..2ef8dfa1e5c8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -8,7 +8,6 @@ #include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> -#include <linux/genhd.h> #include <linux/vmalloc.h> #include <linux/blktrace_api.h> #include <linux/raid/detect.h> diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 8cc2b88d0aa8..84b9f36b9e47 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -13,7 +13,6 @@ #include <linux/types.h> #include <linux/fs.h> -#include <linux/genhd.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/string.h> diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 8693704dcf5e..0a747a0c782d 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -14,7 +14,6 @@ #include <linux/types.h> #include <linux/list.h> -#include <linux/genhd.h> #include <linux/fs.h> #include <asm/unaligned.h> #include <asm/byteorder.h> diff --git a/block/sed-opal.c b/block/sed-opal.c index daafadbb88ca..9700197000f2 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -13,7 +13,7 @@ #include <linux/device.h> #include <linux/kernel.h> #include <linux/list.h> -#include <linux/genhd.h> +#include <linux/blkdev.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <uapi/linux/sed-opal.h> |