From 41ee77b75308354054f4fe03a05b8016a0d41573 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Tue, 6 Jan 2026 16:00:56 +0900
Subject: block: fix blk_zone_cond_str() comment

Fix the comment for blk_zone_cond_str() by replacing the meaningless
BLK_ZONE_ZONE_XXX comment with the correct BLK_ZONE_COND_name, thus also
replacing the XXX with what it actually means.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Bart Van Assche
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 72e34acd439c..63affe898059 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_queue;	/* this is never NULL */
 }
 
-/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
+/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */
 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
 
 static inline unsigned int bio_zone_no(struct bio *bio)
--
cgit v1.2.3

From 5e35a24c96185e1be4c24a713e53a49e92ab925b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Tue, 6 Jan 2026 16:00:57 +0900
Subject: block: improve blk_op_str() comment

Replace XXX with what it actually means.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Bart Van Assche
Signed-off-by: Jens Axboe
---
 block/blk-core.c       | 10 +++++-----
 include/linux/blkdev.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index a0bf5174e9e9..d6732dc69dd9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -114,12 +114,12 @@ static const char *const blk_op_name[] = {
 #undef REQ_OP_NAME
 
 /**
- * blk_op_str - Return string XXX in the REQ_OP_XXX.
- * @op: REQ_OP_XXX.
+ * blk_op_str - Return the string "name" for an operation REQ_OP_name.
+ * @op: a request operation.
  *
- * Description: Centralize block layer function to convert REQ_OP_XXX into
- * string format. Useful in the debugging and tracing bio or request. For
- * invalid REQ_OP_XXX it returns string "UNKNOWN".
+ * Convert a request operation REQ_OP_name into the string "name". Useful for
+ * debugging and tracing BIOs and requests. For an invalid request operation
+ * code, the string "UNKNOWN" is returned.
  */
 inline const char *blk_op_str(enum req_op op)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 63affe898059..438c4946b6e5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
 
-/* Helper to convert REQ_OP_XXX to its string format XXX */
+/* Convert a request operation REQ_OP_name into the string "name" */
 extern const char *blk_op_str(enum req_op op);
 
 int blk_status_to_errno(blk_status_t status);
--
cgit v1.2.3

From f7bc22ca0d55bdcb59e3a4a028fb811d23e53959 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 16 Jan 2026 15:46:38 +0800
Subject: nvme/io_uring: optimize IOPOLL completions for local ring context

When multiple io_uring rings poll on the same NVMe queue, one ring can
find completions belonging to another ring.
The current code always uses task_work to handle this, but this adds
overhead for the common single-ring case.

This patch passes the polling io_ring_ctx through io_comp_batch's new
poll_ctx field. In io_do_iopoll(), the polling ring's context is stored
in iob.poll_ctx before calling the iopoll callbacks.

In nvme_uring_cmd_end_io(), we now compare iob->poll_ctx with the
request's owning io_ring_ctx (via io_uring_cmd_ctx_handle()). If they
match (local context), we complete inline with io_uring_cmd_done32().
If they differ (remote context) or iob is NULL (non-iopoll path), we
use task_work as before.

This optimization eliminates task_work scheduling overhead for the
common case where a ring polls and finds its own completions.

~10% IOPS improvement is observed in the following benchmark:

  fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1

Signed-off-by: Ming Lei
Reviewed-by: Kanchan Joshi
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/ioctl.c | 20 +++++++++++++-------
 include/linux/blkdev.h    |  1 +
 io_uring/rw.c             |  6 ++++++
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e45ac0ca174e..fb62633ccbb0 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
-	 * IOPOLL could potentially complete this request directly, but
-	 * if multiple rings are polling on the same queue, then it's possible
-	 * for one ring to find completions for another ring. Punting the
-	 * completion via task_work will always direct it to the right
-	 * location, rather than potentially complete requests for ringA
-	 * under iopoll invocations from ringB.
+	 * For IOPOLL, check if this completion is happening in the context
+	 * of the same io_ring that owns the request (local context). If so,
+	 * we can complete inline without task_work overhead. Otherwise, we
+	 * must punt to task_work to ensure completion happens in the correct
+	 * ring's context.
 	 */
-	io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	if (blk_rq_is_poll(req) && iob &&
+	    iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+		if (pdu->bio)
+			blk_rq_unmap_user(pdu->bio);
+		io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+	} else {
+		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 
 	return RQ_END_IO_FREE;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 438c4946b6e5..251e0f538c4c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1822,6 +1822,7 @@ struct io_comp_batch {
 	struct rq_list req_list;
 	bool need_ts;
 	void (*complete)(struct io_comp_batch *);
+	void *poll_ctx;
 };
 
 static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 70ca88cc1f54..ff3192f603f3 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
+	/*
+	 * Store the polling io_ring_ctx so drivers can detect if they're
+	 * completing a request in the same ring context that's polling.
+	 */
+	iob.poll_ctx = ctx;
+
 	/*
 	 * Only spin for completions if we don't have multiple devices hanging
 	 * off our complete list.
--
cgit v1.2.3

From 068f5b5ef5bf97e25568950f06ba32325bdc660b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 29 Jan 2026 16:27:14 +0900
Subject: block: cleanup queue limit features definition

Unwrap the definition of BLK_FEAT_ATOMIC_WRITES and renumber this
feature to be sequential with BLK_FEAT_SKIP_TAGSET_QUIESCE.

Signed-off-by: Damien Le Moal
Reviewed-by: John Garry
Reviewed-by: Nitesh Shetty
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 251e0f538c4c..4536211ff33c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t;
 /* skip this queue in blk_mq_(un)quiesce_tagset */
 #define BLK_FEAT_SKIP_TAGSET_QUIESCE	((__force blk_features_t)(1u << 13))
 
+/* atomic writes enabled */
+#define BLK_FEAT_ATOMIC_WRITES		((__force blk_features_t)(1u << 14))
+
 /* undocumented magic for bcache */
 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
 	((__force blk_features_t)(1u << 15))
 
-/* atomic writes enabled */
-#define BLK_FEAT_ATOMIC_WRITES \
-	((__force blk_features_t)(1u << 16))
-
 /*
  * Flags automatically inherited when stacking limits.
  */
--
cgit v1.2.3

From 2719bd1ee1a1cd0535bc62e89b52822f2bbd14eb Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 29 Jan 2026 16:27:15 +0900
Subject: block: introduce blk_queue_rot()

To check if a request queue is for a rotational device, a double
negation is needed with the pattern "!blk_queue_nonrot(q)". Simplify
this with the introduction of the helper blk_queue_rot(), which tests
if a request queue's limits have the BLK_FEAT_ROTATIONAL feature set.
All call sites of blk_queue_nonrot() are modified to use
blk_queue_rot() and the blk_queue_nonrot() definition is removed.

No functional changes.

Signed-off-by: Damien Le Moal
Reviewed-by: Nitesh Shetty
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c    | 20 ++++++++++----------
 block/blk-iocost.c     |  2 +-
 block/blk-iolatency.c  |  5 +----
 block/blk-wbt.c        |  5 ++---
 include/linux/blkdev.h |  4 ++--
 5 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 6e54b1d3d8bc..3ebdec40e758 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -231,7 +231,7 @@ static struct kmem_cache *bfq_pool;
 #define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
 	(get_sdist(last_pos, rq) >		\
 	 BFQQ_SEEK_THR &&			\
-	 (!blk_queue_nonrot(bfqd->queue) ||	\
+	 (blk_queue_rot(bfqd->queue) ||		\
 	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
 #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
@@ -4165,7 +4165,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	/* don't use too short time intervals */
 	if (delta_usecs < 1000) {
-		if (blk_queue_nonrot(bfqd->queue))
+		if (!blk_queue_rot(bfqd->queue))
 			/*
 			 * give same worst-case guarantees as idling
 			 * for seeky
@@ -4487,7 +4487,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 					     struct bfq_queue *bfqq)
 {
 	bool rot_without_queueing =
-		!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
+		blk_queue_rot(bfqd->queue) && !bfqd->hw_tag,
 	     bfqq_sequential_and_IO_bound,
 	     idling_boosts_thr;
 
@@ -4521,7 +4521,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 	 * flash-based device.
	 */
 	idling_boosts_thr = rot_without_queueing ||
-		((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
+		((blk_queue_rot(bfqd->queue) || !bfqd->hw_tag) &&
 		 bfqq_sequential_and_IO_bound);
 
 	/*
@@ -4722,7 +4722,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
 			 * there is only one in-flight large request
 			 * at a time.
 			 */
-			if (blk_queue_nonrot(bfqd->queue) &&
+			if (!blk_queue_rot(bfqd->queue) &&
 			    blk_rq_sectors(bfqq->next_rq) >=
 			    BFQQ_SECT_THR_NONROT &&
 			    bfqd->tot_rq_in_driver >= 1)
@@ -6340,7 +6340,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
 	bfqd->hw_tag_samples = 0;
 
 	bfqd->nonrot_with_queueing =
-		blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
+		!blk_queue_rot(bfqd->queue) && bfqd->hw_tag;
 }
 
 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
@@ -7293,7 +7293,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
 	INIT_HLIST_HEAD(&bfqd->burst_list);
 
 	bfqd->hw_tag = -1;
-	bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
+	bfqd->nonrot_with_queueing = !blk_queue_rot(bfqd->queue);
 
 	bfqd->bfq_max_budget = bfq_default_max_budget;
 
@@ -7328,9 +7328,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
 	 * Begin by assuming, optimistically, that the device peak
 	 * rate is equal to 2/3 of the highest reference rate.
 	 */
-	bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
-		ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
-	bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+	bfqd->rate_dur_prod = ref_rate[!blk_queue_rot(bfqd->queue)] *
+		ref_wr_duration[!blk_queue_rot(bfqd->queue)];
+	bfqd->peak_rate = ref_rate[!blk_queue_rot(bfqd->queue)] * 2 / 3;
 
 	/* see comments on the definition of next field inside bfq_data */
 	bfqd->actuator_load_threshold = 4;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a0416927d33d..ef543d163d46 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -812,7 +812,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
 	u64 now_ns;
 
 	/* rotational? */
-	if (!blk_queue_nonrot(disk->queue))
+	if (blk_queue_rot(disk->queue))
 		return AUTOP_HDD;
 
 	/* handle SATA SSDs w/ broken NCQ */
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 45bd18f68541..f7434278cd29 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -988,10 +988,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
 	u64 now = blk_time_get_ns();
 	int cpu;
 
-	if (blk_queue_nonrot(blkg->q))
-		iolat->ssd = true;
-	else
-		iolat->ssd = false;
+	iolat->ssd = !blk_queue_rot(blkg->q);
 
 	for_each_possible_cpu(cpu) {
 		struct latency_stat *stat;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0974875f77bd..8e025834f2fb 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -747,10 +747,9 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 	 * We default to 2msec for non-rotational storage, and 75msec
 	 * for rotational storage.
	 */
-	if (blk_queue_nonrot(q))
-		return 2000000ULL;
-	else
+	if (blk_queue_rot(q))
 		return 75000000ULL;
+	return 2000000ULL;
 }
 
 static int wbt_data_dir(const struct request *rq)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4536211ff33c..1e5b5547929f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -680,7 +680,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
-#define blk_queue_nonrot(q)	(!((q)->limits.features & BLK_FEAT_ROTATIONAL))
+#define blk_queue_rot(q)	((q)->limits.features & BLK_FEAT_ROTATIONAL)
 #define blk_queue_io_stat(q)	((q)->limits.features & BLK_FEAT_IO_STAT)
 #define blk_queue_passthrough_stat(q)	\
 	((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
@@ -1463,7 +1463,7 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
 
 static inline bool bdev_nonrot(struct block_device *bdev)
 {
-	return blk_queue_nonrot(bdev_get_queue(bdev));
+	return !blk_queue_rot(bdev_get_queue(bdev));
 }
 
 static inline bool bdev_synchronous(struct block_device *bdev)
--
cgit v1.2.3

From da562d92e6755c00cd67845a8dbfb908dac51a9c Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Fri, 30 Jan 2026 15:28:45 +0900
Subject: block: introduce bdev_rot()

Introduce the helper function bdev_rot() to test if a block device is
a rotational one. The existing function bdev_nonrot(), which tests for
the opposite condition, is redefined using this new helper.

This avoids the double negation (operator and name) that appears when
testing if a block device is a rotational device, thus making the code
a little easier to read.

Call sites of bdev_nonrot() in the block layer are updated to use this
new helper. Remaining users in other subsystems are left unchanged for
now.
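As an illustration of the readability gain (foo_set_readahead() and
foo_double_ra() are made-up names for this example, not call sites
touched by this patch):

	/* Before: a double negation, "not non-rotational". */
	static void foo_set_readahead(struct block_device *bdev)
	{
		if (!bdev_nonrot(bdev))
			foo_double_ra(bdev);
	}

	/* After: reads as the condition actually being tested. */
	static void foo_set_readahead(struct block_device *bdev)
	{
		if (bdev_rot(bdev))
			foo_double_ra(bdev);
	}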
Signed-off-by: Damien Le Moal
Reviewed-by: Johannes Thumshirn
Signed-off-by: Jens Axboe
---
 block/ioctl.c                   | 2 +-
 drivers/block/loop.c            | 2 +-
 drivers/nvme/target/admin-cmd.c | 4 ++--
 include/linux/blkdev.h          | 7 ++++++-
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/ioctl.c b/block/ioctl.c
index 344478348a54..fd48f82f9f03 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -692,7 +692,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 			    queue_max_sectors(bdev_get_queue(bdev)));
 		return put_ushort(argp, max_sectors);
 	case BLKROTATIONAL:
-		return put_ushort(argp, !bdev_nonrot(bdev));
+		return put_ushort(argp, bdev_rot(bdev));
 	case BLKRASET:
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bd59c0e9508b..ae3039584045 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
 	lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
 	if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
 		lim->features |= BLK_FEAT_WRITE_CACHE;
-	if (backing_bdev && !bdev_nonrot(backing_bdev))
+	if (backing_bdev && bdev_rot(backing_bdev))
 		lim->features |= BLK_FEAT_ROTATIONAL;
 	lim->max_hw_discard_sectors = max_discard_sectors;
 	lim->max_write_zeroes_sectors = max_discard_sectors;
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 3da31bb1183e..5e366502fb75 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -298,7 +298,7 @@ static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req)
 	if (status)
 		goto out;
 
-	if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) {
+	if (!req->ns->bdev || !bdev_rot(req->ns->bdev)) {
 		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
 		goto out;
 	}
@@ -1084,7 +1084,7 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req)
 	id->nmic = NVME_NS_NMIC_SHARED;
 	if (req->ns->readonly)
 		id->nsattr |= NVME_NS_ATTR_RO;
-	if (req->ns->bdev && !bdev_nonrot(req->ns->bdev))
+	if (req->ns->bdev && bdev_rot(req->ns->bdev))
 		id->nsfeat |= NVME_NS_ROTATIONAL;
 	/*
 	 * We need flush command to flush the file's metadata,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1e5b5547929f..2ae4c45e4959 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1461,9 +1461,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
 	return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
 }
 
+static inline bool bdev_rot(struct block_device *bdev)
+{
+	return blk_queue_rot(bdev_get_queue(bdev));
+}
+
 static inline bool bdev_nonrot(struct block_device *bdev)
 {
-	return !blk_queue_rot(bdev_get_queue(bdev));
+	return !bdev_rot(bdev);
 }
 
 static inline bool bdev_synchronous(struct block_device *bdev)
--
cgit v1.2.3

From 9fc7900b14727d39457bd3724f26e6e3faca3efd Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 3 Feb 2026 16:19:42 +0800
Subject: block: convert nr_requests to unsigned int

This value represents the number of requests for elevator tags, or
driver tags if no elevator is used. The max value for elevator tags is
2048, and drivers use at most 16 bits for a tag, so an unsigned int is
wide enough.
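For reference, a minimal sketch of the two bounds (BLKDEV_DEFAULT_RQ and
MAX_SCHED_RQ are quoted here from include/linux/blkdev.h and
block/blk-mq-sched.h as the editor remembers them; treat the values as
assumptions, not as part of this patch):

	#include <linux/build_bug.h>
	#include <linux/limits.h>

	#define BLKDEV_DEFAULT_RQ	128
	#define MAX_SCHED_RQ		(16 * BLKDEV_DEFAULT_RQ)	/* 2048 */

	/* both the elevator cap and a full 16-bit tag space fit in 32 bits */
	static_assert(MAX_SCHED_RQ <= UINT_MAX);
	static_assert(U16_MAX <= UINT_MAX);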
Signed-off-by: Yu Kuai
Reviewed-by: Nilay Shroff
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2ae4c45e4959..67d8d9e03abc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -550,7 +550,7 @@ struct request_queue {
 	/*
 	 * queue settings
 	 */
-	unsigned long		nr_requests;	/* Max # of requests */
+	unsigned int		nr_requests;	/* Max # of requests */
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct blk_crypto_profile *crypto_profile;
--
cgit v1.2.3

From f98afe4f31bb8b07fea318606c08030c2049587e Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 3 Feb 2026 16:19:45 +0800
Subject: blk-mq: add a new queue sysfs attribute async_depth

Add a new field async_depth to struct request_queue, along with related
APIs. It is currently unused; following patches will convert elevators
to use it instead of their internal async_depth.

Signed-off-by: Yu Kuai
Reviewed-by: Nilay Shroff
Signed-off-by: Jens Axboe
---
 block/blk-core.c       |  1 +
 block/blk-mq.c         |  6 ++++++
 block/blk-sysfs.c      | 42 ++++++++++++++++++++++++++++++++++++++++++
 block/elevator.c       |  1 +
 include/linux/blkdev.h |  1 +
 5 files changed, 51 insertions(+)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index d6732dc69dd9..474700ffaa1c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -463,6 +463,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 	fs_reclaim_release(GFP_KERNEL);
 
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
+	q->async_depth = BLKDEV_DEFAULT_RQ;
 
 	return q;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b7b272e856b8..0ad3dd3329db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4662,6 +4662,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	spin_lock_init(&q->requeue_lock);
 
 	q->nr_requests = set->queue_depth;
+	q->async_depth = set->queue_depth;
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_map_swqueue(q);
@@ -5028,6 +5029,11 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
 		q->elevator->et = et;
 	}
 
+	/*
+	 * Preserve the relative value; both nr and async_depth are at most
+	 * 16-bit values, so there is no need to worry about overflow.
+	 */
+	q->async_depth = max(q->async_depth * nr / q->nr_requests, 1);
 	q->nr_requests = nr;
 	if (q->elevator && q->elevator->type->ops.depth_updated)
 		q->elevator->type->ops.depth_updated(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a580688c3ad5..003aa684e854 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -127,6 +127,46 @@ unlock:
 	return ret;
 }
 
+static ssize_t queue_async_depth_show(struct gendisk *disk, char *page)
+{
+	guard(mutex)(&disk->queue->elevator_lock);
+
+	return queue_var_show(disk->queue->async_depth, page);
+}
+
+static ssize_t
+queue_async_depth_store(struct gendisk *disk, const char *page, size_t count)
+{
+	struct request_queue *q = disk->queue;
+	unsigned int memflags;
+	unsigned long nr;
+	int ret;
+
+	if (!queue_is_mq(q))
+		return -EINVAL;
+
+	ret = queue_var_store(&nr, page, count);
+	if (ret < 0)
+		return ret;
+
+	if (nr == 0)
+		return -EINVAL;
+
+	memflags = blk_mq_freeze_queue(q);
+	scoped_guard(mutex, &q->elevator_lock) {
+		if (q->elevator) {
+			q->async_depth = min(q->nr_requests, nr);
+			if (q->elevator->type->ops.depth_updated)
+				q->elevator->type->ops.depth_updated(q);
+		} else {
+			ret = -EINVAL;
+		}
+	}
+	blk_mq_unfreeze_queue(q, memflags);
+
+	return ret;
+}
+
 static ssize_t queue_ra_show(struct gendisk *disk, char *page)
 {
 	ssize_t ret;
@@ -532,6 +572,7 @@ static struct queue_sysfs_entry _prefix##_entry = {	\
 }
 
 QUEUE_RW_ENTRY(queue_requests, "nr_requests");
+QUEUE_RW_ENTRY(queue_async_depth, "async_depth");
 QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
 QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
 QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
@@ -719,6 +760,7 @@ static struct attribute *blk_mq_queue_attrs[] = {
 	 */
 	&elv_iosched_entry.attr,
 	&queue_requests_entry.attr,
+	&queue_async_depth_entry.attr,
 #ifdef CONFIG_BLK_WBT
 	&queue_wb_lat_entry.attr,
 #endif
diff --git a/block/elevator.c b/block/elevator.c
index a2f8b2251dc6..ebe2a1fcf011 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -589,6 +589,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
 		blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
 		q->elevator = NULL;
 		q->nr_requests = q->tag_set->queue_depth;
+		q->async_depth = q->tag_set->queue_depth;
 	}
 	blk_add_trace_msg(q, "elv switch: %s", ctx->name);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 67d8d9e03abc..99ef8cd7673c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -551,6 +551,7 @@ struct request_queue {
 	 * queue settings
 	 */
 	unsigned int		nr_requests;	/* Max # of requests */
+	unsigned int		async_depth;	/* Max # of async requests */
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct blk_crypto_profile *crypto_profile;
--
cgit v1.2.3

From 453daece381e60df20da16c49ccc6a9bc5c6515a Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Wed, 11 Feb 2026 12:44:38 -0800
Subject: block: change __blkdev_issue_discard() return type to void

Now that all the callers of __blkdev_issue_discard() have been changed
to ignore its return value, change its return type from int to void.
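The calling convention is otherwise unchanged: callers pass a bio chain
pointer through *biop and submit the chain themselves. A minimal sketch
of the usual pattern (error handling and plugging trimmed; compare
blkdev_issue_discard()):

	struct bio *bio = NULL;
	int err = 0;

	__blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, &bio);
	if (bio) {
		/* wait for all chained discard bios to complete */
		err = submit_bio_wait(bio);
		bio_put(bio);
	}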
Signed-off-by: Chaitanya Kulkarni
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-lib.c        | 3 +--
 include/linux/blkdev.h | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 0be3acdc3eb5..3213afc7f0d5 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -60,7 +60,7 @@ struct bio *blk_alloc_discard_bio(struct block_device *bdev,
 	return bio;
 }
 
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
 {
 	struct bio *bio;
@@ -68,7 +68,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 	while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
 			gfp_mask)))
 		*biop = bio_chain_and_submit(*biop, bio);
-	return 0;
 }
 EXPORT_SYMBOL(__blkdev_issue_discard);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99ef8cd7673c..d463b9b5a0a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1259,7 +1259,7 @@ extern void blk_io_schedule(void);
 
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask);
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+void __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
 int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp);
--
cgit v1.2.3
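A hypothetical sketch of how an elevator's depth_updated() callback
could consume the queue-level async_depth introduced earlier in this
series (foo_depth_updated() is illustrative only, modeled on what
mq-deadline does today with its private async_depth; the actual
conversion patches are not part of this log):

	static void foo_depth_updated(struct request_queue *q)
	{
		struct blk_mq_hw_ctx *hctx;
		unsigned long i;

		/* limit async request allocations to q->async_depth tags */
		queue_for_each_hw_ctx(q, hctx, i)
			sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
							q->async_depth);
	}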