From fcf463b92a08686d1aeb1e66674a72eb7a8bfb9b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Dec 2025 11:41:24 +0200 Subject: types: move phys_vec definition to common header Move the struct phys_vec definition from block/blk-mq-dma.c to include/linux/types.h to make it available for use across the kernel. The phys_vec structure represents a physical address range with a length, which is used by the new physical address-based DMA mapping API. This structure is already used by the block layer and will be needed for DMA phys API users. Moving this definition to types.h provides a centralized location for this common data structure and eliminates code duplication across subsystems that need to work with physical address ranges. Signed-off-by: Leon Romanovsky Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index d4437e9c452c..d673747eda8a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -171,6 +171,11 @@ typedef u64 phys_addr_t; typedef u32 phys_addr_t; #endif +struct phys_vec { + phys_addr_t paddr; + size_t len; +}; + typedef phys_addr_t resource_size_t; /* -- cgit v1.2.3 From ee623c892aa59003fca173de0041abc2ccc2c72d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2025 11:00:55 +0800 Subject: block: use bvec iterator helper for bio_may_need_split() bio_may_need_split() uses bi_vcnt to determine if a bio has a single segment, but bi_vcnt is unreliable for cloned bios. Cloned bios share the parent's bi_io_vec array but iterate over a subset via bi_iter, so bi_vcnt may not reflect the actual segment count being iterated. Replace the bi_vcnt check with bvec iterator access via __bvec_iter_bvec(), comparing bi_iter.bi_size against the current bvec's length. This correctly handles both cloned and non-cloned bios. 
Move bi_io_vec into the first cache line adjacent to bi_iter. This is a sensible layout since bi_io_vec and bi_iter are commonly accessed together throughout the block layer - every bvec iteration requires both fields. This displaces bi_end_io to the second cache line, which is acceptable since bi_end_io and bi_private are always fetched together in bio_endio() anyway. The struct layout change requires bio_reset() to preserve and restore bi_io_vec across the memset, since it now falls within BIO_RESET_BYTES. Nitesh verified that this patch doesn't regress NVMe 512-byte IO perf [1]. Link: https://lore.kernel.org/linux-block/20251220081607.tvnrltcngl3cc2fh@green245.gost/ [1] Signed-off-by: Ming Lei Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- block/bio.c | 3 +++ block/blk.h | 12 +++++++++--- include/linux/blk_types.h | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index e726c0e280a8..0e936288034e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -301,9 +301,12 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { + struct bio_vec *bv = bio->bi_io_vec; + bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_io_vec = bv; bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); diff --git a/block/blk.h b/block/blk.h index e4c433f62dfc..98f4dfd4ec75 100644 --- a/block/blk.h +++ b/block/blk.h @@ -371,12 +371,18 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { + const struct bio_vec *bv; + if (lim->chunk_sectors) return true; - if (bio->bi_vcnt != 1) + + if (!bio->bi_io_vec) + return true; + + bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + if (bio->bi_iter.bi_size > bv->bv_len) return true; - return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > - lim->max_fast_segment_size; + 
return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size; } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5dc061d318a4..19a888a2f104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,6 +232,8 @@ struct bio { atomic_t __bi_remaining; + /* The actual vec list, preserved by bio_reset() */ + struct bio_vec *bi_io_vec; struct bvec_iter bi_iter; union { @@ -275,8 +277,6 @@ struct bio { atomic_t __bi_cnt; /* pin count */ - struct bio_vec *bi_io_vec; /* the actual vec list */ - struct bio_set *bi_pool; }; -- cgit v1.2.3 From a3cc978e61f5c909ca94a38d2daeeddc051a18e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:43 +0100 Subject: blk-crypto: add a bio_crypt_ctx() helper This returns the bio_crypt_ctx if CONFIG_BLK_INLINE_ENCRYPTION is enabled and a crypto context is attached to the bio, else NULL. The use case is to allow safely dereferencing the context in common code without needing #ifdef CONFIG_BLK_INLINE_ENCRYPTION. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 58b0c5254a67..eb80df19be68 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -132,6 +132,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return bio->bi_crypt_context; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return bio->bi_crypt_context; +} + void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); @@ -169,6 +174,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return false; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return NULL; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); -- cgit v1.2.3 From bb8e2019ad613dd023a59bf91d1768018d17e09b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:49 +0100 Subject: blk-crypto: handle the fallback above the block layer Add a blk_crypto_submit_bio helper that either submits the bio when it is not encrypted or inline encryption is provided, but otherwise handles the encryption before going down into the low-level driver. This reduces the risk from bio reordering and keeps memory allocation as high up in the stack as possible. Note that if the submitter knows that inline encryption is supported by the underlying driver, it can still use plain submit_bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 6 ++++++ block/blk-core.c | 10 +++++++--- block/blk-crypto-internal.h | 19 +++++++++++-------- block/blk-crypto.c | 23 ++++++----------------- fs/buffer.c | 3 ++- fs/crypto/bio.c | 2 +- fs/ext4/page-io.c | 3 ++- fs/ext4/readpage.c | 9 +++++---- fs/f2fs/data.c | 4 ++-- fs/f2fs/file.c | 3 ++- fs/iomap/direct-io.c | 3 ++- include/linux/blk-crypto.h | 22 ++++++++++++++++++++++ 12 files changed, 68 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 6380e6ab492b..7e0703a12dfb 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -206,6 +206,12 @@ it to a bio, given the blk_crypto_key and the data unit number that will be used for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. +To submit a bio that uses inline encryption, users must call +``blk_crypto_submit_bio()`` instead of the usual ``submit_bio()``. This will +submit the bio to the underlying driver if it supports inline crypto, or else +call the blk-crypto fallback routines before submitting normal bios to the +underlying drivers. + Finally, when done using inline encryption with a blk_crypto_key on a block_device, users must call ``blk_crypto_evict_key()``. This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from diff --git a/block/blk-core.c b/block/blk-core.c index f87e5f1a101f..a0bf5174e9e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -628,9 +628,6 @@ static void __submit_bio(struct bio *bio) /* If plug is not used, add new plug here to cache nsecs time. 
*/ struct blk_plug plug; - if (unlikely(!blk_crypto_bio_prep(bio))) - return; - blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { @@ -794,6 +791,13 @@ void submit_bio_noacct(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; + if (bio_has_crypt_ctx(bio)) { + if (WARN_ON_ONCE(!bio_has_data(bio))) + goto end_io; + if (!blk_crypto_supported(bio)) + goto not_supported; + } + if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index d65023120341..742694213529 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -86,6 +86,12 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp); +static inline bool blk_crypto_supported(struct bio *bio) +{ + return blk_crypto_config_supported_natively(bio->bi_bdev, + &bio->bi_crypt_context->bc_key->crypto_cfg); +} + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -139,6 +145,11 @@ static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, return -ENOTTY; } +static inline bool blk_crypto_supported(struct bio *bio) +{ + return false; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -165,14 +176,6 @@ static inline void bio_crypt_do_front_merge(struct request *rq, #endif } -bool __blk_crypto_bio_prep(struct bio *bio); -static inline bool blk_crypto_bio_prep(struct bio *bio) -{ - if (bio_has_crypt_ctx(bio)) - return __blk_crypto_bio_prep(bio); - return true; -} - blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 0b2535d8dbcc..856d3c5b1fa0 100644 --- a/block/blk-crypto.c +++ 
b/block/blk-crypto.c @@ -242,25 +242,13 @@ void __blk_crypto_free_request(struct request *rq) rq->crypt_ctx = NULL; } -/** - * __blk_crypto_bio_prep - Prepare bio for inline encryption - * @bio: bio to prepare - * - * If the bio crypt context provided for the bio is supported by the underlying - * device's inline encryption hardware, do nothing. - * - * Otherwise, try to perform en/decryption for this bio by falling back to the - * kernel crypto API. For encryption this means submitting newly allocated - * bios for the encrypted payload while keeping back the source bio until they - * complete, while for reads the decryption happens in-place by a hooked in - * completion handler. - * - * Caller must ensure bio has bio_crypt_ctx. +/* + * Process a bio with a crypto context. Returns true if the caller should + * submit the passed in bio, false if the bio is consumed. * - * Return: true if @bio should be submitted to the driver by the caller, else - * false. Sets bio->bi_status, calls bio_endio and returns false on error. + * See the kerneldoc comment for blk_crypto_submit_bio for further details. 
*/ -bool __blk_crypto_bio_prep(struct bio *bio) +bool __blk_crypto_submit_bio(struct bio *bio) { const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; struct block_device *bdev = bio->bi_bdev; @@ -288,6 +276,7 @@ bool __blk_crypto_bio_prep(struct bio *bio) return true; } +EXPORT_SYMBOL_GPL(__blk_crypto_submit_bio); int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..da18053f66e8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -2821,7 +2822,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } - submit_bio(bio); + blk_crypto_submit_bio(bio); } void submit_bh(blk_opf_t opf, struct buffer_head *bh) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index c2b3ca100f8d..6da683ea69dc 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -105,7 +105,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, } atomic_inc(&done.pending); - submit_bio(bio); + blk_crypto_submit_bio(bio); } fscrypt_zeroout_range_done(&done); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 39abfeec5f36..a8c95eee91b7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -7,6 +7,7 @@ * Written by Theodore Ts'o, 2010. 
*/ +#include #include #include #include @@ -401,7 +402,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; - submit_bio(io->io_bio); + blk_crypto_submit_bio(io->io_bio); } io->io_bio = NULL; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e7f2350c725b..49a6d36a8dba 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -345,7 +346,7 @@ int ext4_mpage_readpages(struct inode *inode, if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (bio == NULL) { @@ -371,14 +372,14 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_folio)) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) @@ -389,7 +390,7 @@ next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) - submit_bio(bio); + blk_crypto_submit_bio(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c30e69392a62..c3dd8a5c8589 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -513,7 +513,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -522,7 +522,7 @@ static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, WARN_ON_ONCE(is_read_io(bio_op(bio))); trace_f2fs_submit_write_bio(sbi->sb, type, bio); 
iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7047ca6b98d..914790f37915 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include #include #include #include @@ -5046,7 +5047,7 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 8e273408453a..4000c8596d9b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -74,7 +75,7 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, dio->dops->submit_io(iter, bio, pos); } else { WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); - submit_bio(bio); + blk_crypto_submit_bio(bio); } } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index eb80df19be68..f7c3cb4a342f 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -181,6 +181,28 @@ static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +bool __blk_crypto_submit_bio(struct bio *bio); + +/** + * blk_crypto_submit_bio - Submit a bio that may have a crypto context + * @bio: bio to submit + * + * If @bio has no crypto context, or the crypt context attached to @bio is + * supported by the underlying device's inline encryption hardware, just submit + * @bio. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. 
For encryption this means submitting newly allocated + * bios for the encrypted payload while keeping back the source bio until they + * complete, while for reads the decryption happens in-place by a hooked in + * completion handler. + */ +static inline void blk_crypto_submit_bio(struct bio *bio) +{ + if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio)) + submit_bio(bio); +} + int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context -- cgit v1.2.3 From 835042fb1971b1cc6acb46d53b8862643fd7d0a8 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:29 -0700 Subject: blk-integrity: take const pointer in blk_integrity_rq() blk_integrity_rq() doesn't modify the struct request passed in, so allow a const pointer to be passed. Use a matching signature for the !CONFIG_BLK_DEV_INTEGRITY version. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index a6b84206eb94..c15b1ac62765 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -91,7 +91,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } -static inline bool blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } @@ -168,9 +168,9 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, { return 0; } -static inline int blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { - return 0; + return false; } static inline struct bio_vec rq_integrity_vec(struct request *rq) -- cgit v1.2.3 From 
91e1c1bcf0f2376f40ac859cf17d0a64a605e662 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Mon, 12 Jan 2026 20:08:08 +0530 Subject: block, nvme: remove unused dma_iova_state function parameter DMA IOVA state is not used inside blk_rq_dma_map_iter_next, get rid of the argument. Signed-off-by: Nitesh Shetty Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 3 +-- drivers/nvme/host/pci.c | 5 ++--- include/linux/blk-mq-dma.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index fb018fffffdc..4afeda45df15 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -238,7 +238,6 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * blk_rq_dma_map_iter_next - map the next DMA segment for a request * @req: request to map * @dma_dev: device to map to - * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next mapping after a previous call to @@ -253,7 +252,7 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * returned in @iter.status. 
*/ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter) + struct blk_dma_iter *iter) { struct phys_vec vec; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0e4caeab739c..9fc4a60280a0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -823,7 +823,7 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, if (iter->len) return true; - if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + if (!blk_rq_dma_map_iter_next(req, dma_dev, iter)) return false; if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; @@ -1010,8 +1010,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, } nvme_pci_sgl_set_data(&sg_list[mapped++], iter); iod->total_len += iter->len; - } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, - iter)); + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, iter)); nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); if (unlikely(iter->status)) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index cb88fc791fbd..214c181ff2c9 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -28,7 +28,7 @@ struct blk_dma_iter { bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter); bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter); + struct blk_dma_iter *iter); /** * blk_rq_dma_map_coalesce - were all segments coalesced? 
-- cgit v1.2.3 From 41ee77b75308354054f4fe03a05b8016a0d41573 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 6 Jan 2026 16:00:56 +0900 Subject: block: fix blk_zone_cond_str() comment Fix the comment for blk_zone_cond_str() by replacing the meaningless BLK_ZONE_ZONE_XXX comment with the correct BLK_ZONE_COND_name, thus also replacing the XXX with what that actually means. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 10 +++++----- include/linux/blkdev.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1c54678fae6b..ef3872c53244 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -112,12 +112,12 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) /** - * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. - * @zone_cond: BLK_ZONE_COND_XXX. + * blk_zone_cond_str - Return a zone condition name string + * @zone_cond: a zone condition BLK_ZONE_COND_name * - * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX - * into string format. Useful in the debugging and tracing zone conditions. For - * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". + * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful + * for the debugging and tracing zone conditions. For an invalid zone + * conditions, the string "UNKNOWN" is returned. 
*/ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72e34acd439c..63affe898059 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_queue; /* this is never NULL */ } -/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ +/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) -- cgit v1.2.3 From 5e35a24c96185e1be4c24a713e53a49e92ab925b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 6 Jan 2026 16:00:57 +0900 Subject: block: improve blk_op_str() comment Replace XXX with what it actually means. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++----- include/linux/blkdev.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index a0bf5174e9e9..d6732dc69dd9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -114,12 +114,12 @@ static const char *const blk_op_name[] = { #undef REQ_OP_NAME /** - * blk_op_str - Return string XXX in the REQ_OP_XXX. - * @op: REQ_OP_XXX. + * blk_op_str - Return the string "name" for an operation REQ_OP_name. + * @op: a request operation. * - * Description: Centralize block layer function to convert REQ_OP_XXX into - * string format. Useful in the debugging and tracing bio or request. For - * invalid REQ_OP_XXX it returns string "UNKNOWN". + * Convert a request operation REQ_OP_name into the string "name". Useful for + * debugging and tracing BIOs and requests. For an invalid request operation + * code, the string "UNKNOWN" is returned. 
*/ inline const char *blk_op_str(enum req_op op) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 63affe898059..438c4946b6e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -/* Helper to convert REQ_OP_XXX to its string format XXX */ +/* Convert a request operation REQ_OP_name into the string "name" */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); -- cgit v1.2.3 From 5e2fde1a9433efc484a5feec36f748aa3ea58c85 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:37 +0800 Subject: block: pass io_comp_batch to rq_end_io_fn callback Add a third parameter 'const struct io_comp_batch *' to the rq_end_io_fn callback signature. This allows end_io handlers to access the completion batch context when requests are completed via blk_mq_end_request_batch(). The io_comp_batch is passed from blk_mq_end_request_batch(), while NULL is passed from __blk_mq_end_request() and blk_mq_put_rq_ref() which don't have batch context. This infrastructure change enables drivers to detect whether they're being called from a batched completion path (like iopoll) and access additional context stored in the io_comp_batch. 
Update all rq_end_io_fn implementations: - block/blk-mq.c: blk_end_sync_rq - block/blk-flush.c: flush_end_io, mq_flush_data_end_io - drivers/nvme/host/ioctl.c: nvme_uring_cmd_end_io - drivers/nvme/host/core.c: nvme_keep_alive_end_io - drivers/nvme/host/pci.c: abort_endio, nvme_del_queue_end, nvme_del_cq_end - drivers/nvme/target/passthru.c: nvmet_passthru_req_done - drivers/scsi/scsi_error.c: eh_lock_door_done - drivers/scsi/sg.c: sg_rq_end_io - drivers/scsi/st.c: st_scsi_execute_end - drivers/target/target_core_pscsi.c: pscsi_req_done - drivers/md/dm-rq.c: end_clone_request Signed-off-by: Ming Lei Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/blk-flush.c | 6 ++++-- block/blk-mq.c | 9 +++++---- drivers/md/dm-rq.c | 3 ++- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/ioctl.c | 3 ++- drivers/nvme/host/pci.c | 11 +++++++---- drivers/nvme/target/passthru.c | 3 ++- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/sg.c | 6 ++++-- drivers/scsi/st.c | 3 ++- drivers/target/target_core_pscsi.c | 6 ++++-- include/linux/blk-mq.h | 4 +++- 12 files changed, 39 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index 43d6152897a4..403a46c86411 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq, } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..cf1daedbb39f 100644 --- 
a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (rq->end_io) { rq_qos_done(rq->q, rq); - if (rq->end_io(rq, error) == RQ_END_IO_FREE) + if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); @@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) * If end_io handler returns NONE, then it still has * ownership of the request. */ - if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -1458,7 +1458,8 @@ struct blk_rq_wait { blk_status_t ret; }; -static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret, + const struct io_comp_batch *iob) { struct blk_rq_wait *wait = rq->end_io_data; @@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { - if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a6ca92049c10..e9a7563b4b2f 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) } static enum rq_end_io_ret end_clone_request(struct request *clone, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct dm_rq_target_io *tio = clone->end_io_data; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7bf228df6001..19b67cf5d550 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct 
nvme_ctrl *ctrl) } static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index a9c097dacad6..e45ac0ca174e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, - blk_status_t err) + blk_status_t err, + const struct io_comp_batch *iob) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 065555576d2f..d87c56c62861 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1615,7 +1615,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error) +static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; @@ -2858,7 +2859,8 @@ out_unlock: } static enum rq_end_io_ret nvme_del_queue_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; @@ -2868,14 +2870,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req, } static enum rq_end_io_ret nvme_del_cq_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; if (error) set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - return nvme_del_queue_end(req, error); + return nvme_del_queue_end(req, 
error, iob); } static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 96648ec2fadb..0823c87637d3 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) } static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq, - blk_status_t blk_status) + blk_status_t blk_status, + const struct io_comp_batch *iob) { struct nvmet_req *req = rq->end_io_data; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f869108fd969..1e93390c5a82 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2085,7 +2085,8 @@ maybe_retry: } static enum rq_end_io_ret eh_lock_door_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { blk_mq_free_request(req); return RQ_END_IO_NONE; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 57fba34832ad..1a521f9d821a 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). 
*/ static enum rq_end_io_ret -sg_rq_end_io(struct request *rq, blk_status_t status) +sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); struct sg_request *srp = rq->end_io_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 168f25e4aaa3..8aeaa3b68c25 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) } static enum rq_end_io_ret st_scsi_execute_end(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct st_request *SRpnt = req->end_io_data; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index db4e09042469..823b2665f95b 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t); +static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t, + const struct io_comp_batch *); /* pscsi_attach_hba(): * @@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) } static enum rq_end_io_ret pscsi_req_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct se_cmd *cmd = req->end_io_data; struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cae9e857aea4..18a2388ba581 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -13,6 +13,7 @@ struct blk_mq_tags; struct blk_flush_queue; +struct io_comp_batch; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 @@ -22,7 +23,8 @@ enum rq_end_io_ret { RQ_END_IO_FREE, }; -typedef enum rq_end_io_ret 
(rq_end_io_fn)(struct request *, blk_status_t); +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t, + const struct io_comp_batch *); /* * request flags */ -- cgit v1.2.3 From f7bc22ca0d55bdcb59e3a4a028fb811d23e53959 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:38 +0800 Subject: nvme/io_uring: optimize IOPOLL completions for local ring context When multiple io_uring rings poll on the same NVMe queue, one ring can find completions belonging to another ring. The current code always uses task_work to handle this, but this adds overhead for the common single-ring case. This patch passes the polling io_ring_ctx through io_comp_batch's new poll_ctx field. In io_do_iopoll(), the polling ring's context is stored in iob.poll_ctx before calling the iopoll callbacks. In nvme_uring_cmd_end_io(), we now compare iob->poll_ctx with the request's owning io_ring_ctx (via io_uring_cmd_ctx_handle()). If they match (local context), we complete inline with io_uring_cmd_done32(). If they differ (remote context) or iob is NULL (non-iopoll path), we use task_work as before. This optimization eliminates task_work scheduling overhead for the common case where a ring polls and finds its own completions. 
~10% IOPS improvement is observed in the following benchmark: fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1 Signed-off-by: Ming Lei Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 20 +++++++++++++------- include/linux/blkdev.h | 1 + io_uring/rw.c | 6 ++++++ 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e45ac0ca174e..fb62633ccbb0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * IOPOLL could potentially complete this request directly, but - * if multiple rings are polling on the same queue, then it's possible - * for one ring to find completions for another ring. Punting the - * completion via task_work will always direct it to the right - * location, rather than potentially complete requests for ringA - * under iopoll invocations from ringB. + * For IOPOLL, check if this completion is happening in the context + * of the same io_ring that owns the request (local context). If so, + * we can complete inline without task_work overhead. Otherwise, we + * must punt to task_work to ensure completion happens in the correct + * ring's context. 
*/ - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + if (blk_rq_is_poll(req) && iob && + iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0); + } else { + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 438c4946b6e5..251e0f538c4c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1822,6 +1822,7 @@ struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); + void *poll_ctx; }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, diff --git a/io_uring/rw.c b/io_uring/rw.c index 70ca88cc1f54..ff3192f603f3 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) DEFINE_IO_COMP_BATCH(iob); int nr_events = 0; + /* + * Store the polling io_ring_ctx so drivers can detect if they're + * completing a request in the same ring context that's polling. + */ + iob.poll_ctx = ctx; + /* * Only spin for completions if we don't have multiple devices hanging * off our complete list. -- cgit v1.2.3 From 72a41750f1a35b46caa5bbd70df7b5d3ce4f4b0a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 26 Jan 2026 08:27:24 -0800 Subject: block: remove bio_last_bvec_all There are no more callers of this function after commit f6b2d8b134b2413 ("btrfs: track the next file offset in struct btrfs_bio_ctrl"), so remove the function. 
Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- Documentation/block/biovecs.rst | 1 - include/linux/bio.h | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index b9dc0c9dbee4..11126ed6f40f 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -135,7 +135,6 @@ Usage of helpers: bio_first_bvec_all() bio_first_page_all() bio_first_folio_all() - bio_last_bvec_all() * The following helpers iterate over single-page segment. The passed 'struct bio_vec' will contain a single-page IO vector during the iteration:: diff --git a/include/linux/bio.h b/include/linux/bio.h index c75a9b3672aa..d32aee2857a9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -256,12 +256,6 @@ static inline struct folio *bio_first_folio_all(struct bio *bio) return page_folio(bio_first_page_all(bio)); } -static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) -{ - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - return &bio->bi_io_vec[bio->bi_vcnt - 1]; -} - /** * struct folio_iter - State for iterating all folios in a bio. * @folio: The current folio we're iterating. NULL after the last folio. -- cgit v1.2.3 From 068f5b5ef5bf97e25568950f06ba32325bdc660b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 29 Jan 2026 16:27:14 +0900 Subject: block: cleanup queue limit features definition Unwrap the definition of BLK_FEAT_ATOMIC_WRITES and renumber this feature to be sequential with BLK_FEAT_SKIP_TAGSET_QUIESCE. Signed-off-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251e0f538c4c..4536211ff33c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t; /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14)) + /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) -/* atomic writes enabled */ -#define BLK_FEAT_ATOMIC_WRITES \ - ((__force blk_features_t)(1u << 16)) - /* * Flags automatically inherited when stacking limits. */ -- cgit v1.2.3 From 2719bd1ee1a1cd0535bc62e89b52822f2bbd14eb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 29 Jan 2026 16:27:15 +0900 Subject: block: introduce blk_queue_rot() To check if a request queue is for a rotational device, a double negation is needed with the pattern "!blk_queue_nonrot(q)". Simplify this with the introduction of the helper blk_queue_rot() which tests if a request queue's limits have the BLK_FEAT_ROTATIONAL feature set. All call sites of blk_queue_nonrot() are modified to use blk_queue_rot() and the blk_queue_nonrot() definition is removed. No functional changes. Signed-off-by: Damien Le Moal Reviewed-by: Nitesh Shetty Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 20 ++++++++++---------- block/blk-iocost.c | 2 +- block/blk-iolatency.c | 5 +---- block/blk-wbt.c | 5 ++--- include/linux/blkdev.h | 4 ++-- 5 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6e54b1d3d8bc..3ebdec40e758 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -231,7 +231,7 @@ static struct kmem_cache *bfq_pool; #define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ (get_sdist(last_pos, rq) > \ BFQQ_SEEK_THR && \ - (!blk_queue_nonrot(bfqd->queue) || \ + (blk_queue_rot(bfqd->queue) || \ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) @@ -4165,7 +4165,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* don't use too short time intervals */ if (delta_usecs < 1000) { - if (blk_queue_nonrot(bfqd->queue)) + if (!blk_queue_rot(bfqd->queue)) /* * give same worst-case guarantees as idling * for seeky @@ -4487,7 +4487,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq) { bool rot_without_queueing = - !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + blk_queue_rot(bfqd->queue) && !bfqd->hw_tag, bfqq_sequential_and_IO_bound, idling_boosts_thr; @@ -4521,7 +4521,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, * flash-based device. */ idling_boosts_thr = rot_without_queueing || - ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + ((blk_queue_rot(bfqd->queue) || !bfqd->hw_tag) && bfqq_sequential_and_IO_bound); /* @@ -4722,7 +4722,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * there is only one in-flight large request * at a time. 
*/ - if (blk_queue_nonrot(bfqd->queue) && + if (!blk_queue_rot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= BFQQ_SECT_THR_NONROT && bfqd->tot_rq_in_driver >= 1) @@ -6340,7 +6340,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) bfqd->hw_tag_samples = 0; bfqd->nonrot_with_queueing = - blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag; + !blk_queue_rot(bfqd->queue) && bfqd->hw_tag; } static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) @@ -7293,7 +7293,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) INIT_HLIST_HEAD(&bfqd->burst_list); bfqd->hw_tag = -1; - bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue); + bfqd->nonrot_with_queueing = !blk_queue_rot(bfqd->queue); bfqd->bfq_max_budget = bfq_default_max_budget; @@ -7328,9 +7328,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) * Begin by assuming, optimistically, that the device peak * rate is equal to 2/3 of the highest reference rate. */ - bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * - ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->rate_dur_prod = ref_rate[!blk_queue_rot(bfqd->queue)] * + ref_wr_duration[!blk_queue_rot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[!blk_queue_rot(bfqd->queue)] * 2 / 3; /* see comments on the definition of next field inside bfq_data */ bfqd->actuator_load_threshold = 4; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a0416927d33d..ef543d163d46 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -812,7 +812,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) u64 now_ns; /* rotational? 
*/ - if (!blk_queue_nonrot(disk->queue)) + if (blk_queue_rot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 45bd18f68541..f7434278cd29 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -988,10 +988,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) u64 now = blk_time_get_ns(); int cpu; - if (blk_queue_nonrot(blkg->q)) - iolat->ssd = true; - else - iolat->ssd = false; + iolat->ssd = !blk_queue_rot(blkg->q); for_each_possible_cpu(cpu) { struct latency_stat *stat; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0974875f77bd..8e025834f2fb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -747,10 +747,9 @@ u64 wbt_default_latency_nsec(struct request_queue *q) * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. */ - if (blk_queue_nonrot(q)) - return 2000000ULL; - else + if (blk_queue_rot(q)) return 75000000ULL; + return 2000000ULL; } static int wbt_data_dir(const struct request *rq) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4536211ff33c..1e5b5547929f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -680,7 +680,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) +#define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) @@ -1463,7 +1463,7 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev) static inline bool bdev_nonrot(struct block_device *bdev) { - return blk_queue_nonrot(bdev_get_queue(bdev)); + return 
!blk_queue_rot(bdev_get_queue(bdev)); } static inline bool bdev_synchronous(struct block_device *bdev) -- cgit v1.2.3 From da562d92e6755c00cd67845a8dbfb908dac51a9c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 30 Jan 2026 15:28:45 +0900 Subject: block: introduce bdev_rot() Introduce the helper function bdev_rot() to test if a block device is a rotational one. The existing function bdev_nonrot() which tests for the opposite condition is redefined using this new helper. This avoids the double negation (operator and name) that appears when testing if a block device is a rotational device, thus making the code a little easier to read. Call sites of bdev_nonrot() in the block layer are updated to use this new helper. Remaining users in other subsystems are left unchanged for now. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- drivers/block/loop.c | 2 +- drivers/nvme/target/admin-cmd.c | 4 ++-- include/linux/blkdev.h | 7 ++++++- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 344478348a54..fd48f82f9f03 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -692,7 +692,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, !bdev_nonrot(bdev)); + return put_ushort(argp, bdev_rot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd59c0e9508b..ae3039584045 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; - if (backing_bdev && 
!bdev_nonrot(backing_bdev)) + if (backing_bdev && bdev_rot(backing_bdev)) lim->features |= BLK_FEAT_ROTATIONAL; lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3da31bb1183e..5e366502fb75 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -298,7 +298,7 @@ static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req) if (status) goto out; - if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) { + if (!req->ns->bdev || !bdev_rot(req->ns->bdev)) { status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -1084,7 +1084,7 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req) id->nmic = NVME_NS_NMIC_SHARED; if (req->ns->readonly) id->nsattr |= NVME_NS_ATTR_RO; - if (req->ns->bdev && !bdev_nonrot(req->ns->bdev)) + if (req->ns->bdev && bdev_rot(req->ns->bdev)) id->nsfeat |= NVME_NS_ROTATIONAL; /* * We need flush command to flush the file's metadata, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1e5b5547929f..2ae4c45e4959 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,9 +1461,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev) return bdev_limits(bdev)->max_wzeroes_unmap_sectors; } +static inline bool bdev_rot(struct block_device *bdev) +{ + return blk_queue_rot(bdev_get_queue(bdev)); +} + static inline bool bdev_nonrot(struct block_device *bdev) { - return !blk_queue_rot(bdev_get_queue(bdev)); + return !bdev_rot(bdev); } static inline bool bdev_synchronous(struct block_device *bdev) -- cgit v1.2.3 From 9fc7900b14727d39457bd3724f26e6e3faca3efd Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:42 +0800 Subject: block: convert nr_requests to unsigned int This value represents the number of requests for elevator tags, or drivers tags if elevator is none. 
The max value for elevator tags is 2048, and drivers use at most 16 bits for the tag, so an unsigned int is large enough. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2ae4c45e4959..67d8d9e03abc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -550,7 +550,7 @@ struct request_queue { /* * queue settings */ - unsigned long nr_requests; /* Max # of requests */ + unsigned int nr_requests; /* Max # of requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; -- cgit v1.2.3 From f98afe4f31bb8b07fea318606c08030c2049587e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:45 +0800 Subject: blk-mq: add a new queue sysfs attribute async_depth Add a new field async_depth to request_queue and related APIs. This is currently not used; following patches will convert elevators to use this instead of their internal async_depth.
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq.c | 6 ++++++ block/blk-sysfs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ block/elevator.c | 1 + include/linux/blkdev.h | 1 + 5 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index d6732dc69dd9..474700ffaa1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -463,6 +463,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; + q->async_depth = BLKDEV_DEFAULT_RQ; return q; diff --git a/block/blk-mq.c b/block/blk-mq.c index b7b272e856b8..0ad3dd3329db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4662,6 +4662,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; + q->async_depth = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); @@ -5028,6 +5029,11 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, q->elevator->et = et; } + /* + * Preserve relative value, both nr and async_depth are at most 16 bit + * value, no need to worry about overflow. 
+ */ + q->async_depth = max(q->async_depth * nr / q->nr_requests, 1); q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a580688c3ad5..003aa684e854 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -127,6 +127,46 @@ unlock: return ret; } +static ssize_t queue_async_depth_show(struct gendisk *disk, char *page) +{ + guard(mutex)(&disk->queue->elevator_lock); + + return queue_var_show(disk->queue->async_depth, page); +} + +static ssize_t +queue_async_depth_store(struct gendisk *disk, const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned int memflags; + unsigned long nr; + int ret; + + if (!queue_is_mq(q)) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + + if (nr == 0) + return -EINVAL; + + memflags = blk_mq_freeze_queue(q); + scoped_guard(mutex, &q->elevator_lock) { + if (q->elevator) { + q->async_depth = min(q->nr_requests, nr); + if (q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); + } else { + ret = -EINVAL; + } + } + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} + static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; @@ -532,6 +572,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_async_depth, "async_depth"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); @@ -719,6 +760,7 @@ static struct attribute *blk_mq_queue_attrs[] = { */ &elv_iosched_entry.attr, &queue_requests_entry.attr, + &queue_async_depth_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif diff --git a/block/elevator.c b/block/elevator.c index a2f8b2251dc6..ebe2a1fcf011 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ 
-589,6 +589,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; + q->async_depth = q->tag_set->queue_depth; } blk_add_trace_msg(q, "elv switch: %s", ctx->name); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 67d8d9e03abc..99ef8cd7673c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { * queue settings */ unsigned int nr_requests; /* Max # of requests */ + unsigned int async_depth; /* Max # of async requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; -- cgit v1.2.3