From b261d2222063a9a8b9ec284244c285f2998ee01e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 1 Apr 2025 15:16:00 -0700 Subject: lib/crc: remove CONFIG_LIBCRC32C Now that LIBCRC32C does nothing besides select CRC32, make every option that selects LIBCRC32C instead select CRC32 directly. Then remove LIBCRC32C. Reviewed-by: Christoph Hellwig Reviewed-by: "Martin K. Petersen" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250401221600.24878-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/block/Kconfig | 2 +- drivers/block/drbd/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a97f2c40c640..2551ebf88dda 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -367,7 +367,7 @@ config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK select CEPH_LIB - select LIBCRC32C + select CRC32 select CRYPTO_AES select CRYPTO help diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig index 6fb4e38fca88..495a72da04c6 100644 --- a/drivers/block/drbd/Kconfig +++ b/drivers/block/drbd/Kconfig @@ -10,7 +10,7 @@ config BLK_DEV_DRBD tristate "DRBD Distributed Replicated Block Device support" depends on PROC_FS && INET select LRU_CACHE - select LIBCRC32C + select CRC32 help NOTE: In order to authenticate connections you have to select -- cgit v1.2.3 From 6ee6bd5d4fce502a5b5a2ea805e9ff16e6aa890f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Apr 2025 09:14:41 +0800 Subject: ublk: fix handling recovery & reissue in ublk_abort_queue() Commit 8284066946e6 ("ublk: grab request reference when the request is handled by userspace") doesn't grab request reference in case of recovery reissue. Then the request can be requeued & re-dispatch & failed when canceling uring command. If it is one zc request, the request can be freed before io_uring returns the zc buffer back, then cause kernel panic: [ 126.773061] BUG: kernel NULL pointer dereference, address: 00000000000000c8 [ 126.773657] #PF: supervisor read access in kernel mode [ 126.774052] #PF: error_code(0x0000) - not-present page [ 126.774455] PGD 0 P4D 0 [ 126.774698] Oops: Oops: 0000 [#1] SMP NOPTI [ 126.775034] CPU: 13 UID: 0 PID: 1612 Comm: kworker/u64:55 Not tainted 6.14.0_blk+ #182 PREEMPT(full) [ 126.775676] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 [ 126.776275] Workqueue: iou_exit io_ring_exit_work [ 126.776651] RIP: 0010:ublk_io_release+0x14/0x130 [ublk_drv] Fixes it by always grabbing request reference for aborting the request. Reported-by: Caleb Sander Mateos Closes: https://lore.kernel.org/linux-block/CADUfDZodKfOGUeWrnAxcZiLT+puaZX8jDHoj_sfHZCOZwhzz6A@mail.gmail.com/ Fixes: 8284066946e6 ("ublk: grab request reference when the request is handled by userspace") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250409011444.2142010-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2fd05c1bd30b..41bed67508f2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1140,6 +1140,25 @@ static void ublk_complete_rq(struct kref *ref) __ublk_complete_rq(req); } +static void ublk_do_fail_rq(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) + blk_mq_requeue_request(req, false); + else + __ublk_complete_rq(req); +} + +static void ublk_fail_rq_fn(struct kref *ref) +{ + struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, + ref); + struct request *req = blk_mq_rq_from_pdu(data); + + ublk_do_fail_rq(req); +} + /* * Since ublk_rq_task_work_cb always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during @@ -1153,10 +1172,13 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) - blk_mq_requeue_request(req, false); - else - ublk_put_req_ref(ubq, req); + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_put(&data->ref, ublk_fail_rq_fn); + } else { + ublk_do_fail_rq(req); + } } static void ubq_complete_io_cmd(struct ublk_io *io, int res, -- cgit v1.2.3 From 18461f2a02be04f8bbbe3b37fecfc702e3fa5bc2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Apr 2025 09:14:42 +0800 Subject: ublk: don't fail request for recovery & reissue in case of ubq->canceling ubq->canceling is set with request queue quiesced when io_uring context is exiting. USER_RECOVERY or !RECOVERY_FAIL_IO requires request to be re-queued and re-dispatch after device is recovered. However commit d796cea7b9f3 ("ublk: implement ->queue_rqs()") still may fail any request in case of ubq->canceling, this way breaks USER_RECOVERY or !RECOVERY_FAIL_IO. Fix it by calling __ublk_abort_rq() in case of ubq->canceling. Reviewed-by: Uday Shankar Reported-by: Uday Shankar Closes: https://lore.kernel.org/linux-block/Z%2FQkkTRHfRxtN%2FmB@dev-ushankar.dev.purestorage.com/ Fixes: d796cea7b9f3 ("ublk: implement ->queue_rqs()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250409011444.2142010-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 41bed67508f2..d6ca2f1097ad 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1371,7 +1371,8 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_RESET_TIMER; } -static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq) +static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, + bool check_cancel) { blk_status_t res; @@ -1390,7 +1391,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq) if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; - if (unlikely(ubq->canceling)) + if (check_cancel && unlikely(ubq->canceling)) return BLK_STS_IOERR; /* fill iod to slot in io cmd buffer */ @@ -1409,7 +1410,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; blk_status_t res; - res = ublk_prep_req(ubq, rq); + res = ublk_prep_req(ubq, rq, false); if (res != BLK_STS_OK) return res; @@ -1441,7 +1442,7 @@ static void ublk_queue_rqs(struct rq_list *rqlist) ublk_queue_cmd_list(ubq, &submit_list); ubq = this_q; - if (ublk_prep_req(ubq, req) == BLK_STS_OK) + if (ublk_prep_req(ubq, req, true) == BLK_STS_OK) rq_list_add_tail(&submit_list, req); else rq_list_add_tail(&requeue_list, req); -- cgit v1.2.3 From 843c6cec1af85f05971b7baf3704801895e77d76 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 8 Apr 2025 19:29:26 -0600 Subject: ublk: pass ublksrv_ctrl_cmd * instead of io_uring_cmd * The ublk_ctrl_*() handlers all take struct io_uring_cmd *cmd but only use it to get struct ublksrv_ctrl_cmd *header from the io_uring SQE. Since the caller ublk_ctrl_uring_cmd() has already computed header, pass it instead of cmd. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250409012928.3527198-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d6ca2f1097ad..cdb1543fa4a9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2436,9 +2436,9 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } -static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) +static int ublk_ctrl_start_dev(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); const struct ublk_param_basic *p = &ub->params.basic; int ublksrv_pid = (int)header->data[0]; struct queue_limits lim = { @@ -2557,9 +2557,8 @@ out_unlock: } static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; cpumask_var_t cpumask; unsigned long queue; @@ -2608,9 +2607,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) info->nr_hw_queues, info->queue_depth); } -static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublksrv_ctrl_dev_info info; struct ublk_device *ub; @@ -2835,9 +2833,8 @@ static int ublk_ctrl_stop_dev(struct ublk_device *ub) } static int ublk_ctrl_get_dev_info(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) @@ -2866,9 +2863,8 @@ static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) } static int ublk_ctrl_get_params(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; int ret; @@ -2897,9 +2893,8 @@ static int ublk_ctrl_get_params(struct ublk_device *ub, } static int ublk_ctrl_set_params(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; int ret = -EFAULT; @@ -2963,9 +2958,8 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) } static int ublk_ctrl_start_recovery(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ret = -EINVAL; int i; @@ -3011,9 +3005,8 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, } static int ublk_ctrl_end_recovery(struct ublk_device *ub, - struct io_uring_cmd *cmd) + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; int i; @@ -3060,9 +3053,8 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } -static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; u64 features = UBLK_F_ALL; @@ -3201,7 +3193,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, goto out; if (cmd_op == UBLK_U_CMD_GET_FEATURES) { - ret = ublk_ctrl_get_features(cmd); + ret = ublk_ctrl_get_features(header); goto out; } @@ -3218,17 +3210,17 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(ub, cmd); + ret = ublk_ctrl_start_dev(ub, header); break; case UBLK_CMD_STOP_DEV: ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: - ret = ublk_ctrl_get_dev_info(ub, cmd); + ret = ublk_ctrl_get_dev_info(ub, header); break; case UBLK_CMD_ADD_DEV: - ret = ublk_ctrl_add_dev(cmd); + ret = ublk_ctrl_add_dev(header); break; case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); @@ -3237,19 +3229,19 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(ub, cmd); + ret = ublk_ctrl_get_queue_affinity(ub, header); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(ub, cmd); + ret = ublk_ctrl_get_params(ub, header); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(ub, cmd); + ret = ublk_ctrl_set_params(ub, header); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(ub, cmd); + ret = ublk_ctrl_start_recovery(ub, header); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(ub, cmd); + ret = ublk_ctrl_end_recovery(ub, header); break; default: ret = -EOPNOTSUPP; -- cgit v1.2.3 From 3b607b75a345b1d808031bf1bb1038e4dac8d521 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 10 Apr 2025 17:47:23 +0200 Subject: null_blk: Use strscpy() instead of strscpy_pad() in null_add_dev() blk_mq_alloc_disk() already zero-initializes the destination buffer, making strscpy() sufficient for safely copying the disk's name. The additional NUL-padding performed by strscpy_pad() is unnecessary. If the destination buffer has a fixed length, strscpy() automatically determines its size using sizeof() when the argument is omitted. This makes the explicit size argument unnecessary. The source string is also NUL-terminated and meets the __must_be_cstr() requirement of strscpy(). No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Zhu Yanjun Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20250410154727.883207-1-thorsten.blum@linux.dev Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 3bb9cee0a9b5..aa163ae9b2aa 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2031,7 +2031,7 @@ static int null_add_dev(struct nullb_device *dev) nullb->disk->minors = 1; nullb->disk->fops = &null_ops; nullb->disk->private_data = nullb; - strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + strscpy(nullb->disk->disk_name, nullb->disk_name); if (nullb->dev->zoned) { rv = null_register_zoned_dev(nullb); -- cgit v1.2.3 From 1fdb8188c3d505452b40cdb365b1bb32be533a8e Mon Sep 17 00:00:00 2001 From: Yunlong Xing Date: Mon, 14 Apr 2025 11:01:59 +0800 Subject: loop: aio inherit the ioprio of original request Set cmd->iocb.ki_ioprio to the ioprio of loop device's request. The purpose is to inherit the original request ioprio in the aio flow. Signed-off-by: Yunlong Xing Signed-off-by: Zhiguo Niu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250414030159.501180-1-yunlong.xing@unisoc.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 674527d770dc..dd7f33d47f4f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -447,7 +447,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; - cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + cmd->iocb.ki_ioprio = req_get_ioprio(rq); if (rw == ITER_SOURCE) ret = file->f_op->write_iter(&cmd->iocb, &iter); -- cgit v1.2.3 From e7bc0010ceb403d025100698586c8e760921d471 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 15 Apr 2025 10:51:47 +0200 Subject: loop: properly send KOBJ_CHANGED uevent for disk device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original commit message and the wording "uncork" in the code comment indicate that it is expected that the suppressed event instances are automatically sent after unsuppressing. This is not the case, instead they are discarded. In effect this means that no "changed" events are emitted on the device itself by default. While each discovered partition does trigger a changed event on the device, devices without partitions don't have any event emitted. This makes udev miss the device creation and prompted workarounds in userspace. See the linked util-linux/losetup bug. Explicitly emit the events and drop the confusingly worded comments. Link: https://github.com/util-linux/util-linux/issues/2434 Fixes: 498ef5c777d9 ("loop: suppress uevents while reconfiguring the device") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250415-loop-uevent-changed-v2-1-0c4e6a923b2a@linutronix.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index dd7f33d47f4f..3be7f00e7fc7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -667,8 +667,8 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, error = 0; done: - /* enable and uncork uevent now that we are done */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); return error; out_err: @@ -1129,8 +1129,8 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); - /* enable and uncork uevent now that we are done */ dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); loop_global_unlock(lo, is_loop); if (partscan) -- cgit v1.2.3 From 0dba7a05b9e47d8b546399117b0ddf2426dc6042 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 15 Apr 2025 16:55:06 +0200 Subject: loop: LOOP_SET_FD: send uevents for partitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the suppression of the uevents before scanning for partitions. The partitions inherit their suppression settings from their parent device, which lead to the uevents being dropped. This is similar to the same changes for LOOP_CONFIGURE done in commit bb430b694226 ("loop: LOOP_CONFIGURE: send uevents for partitions"). Fixes: 498ef5c777d9 ("loop: suppress uevents while reconfiguring the device") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20250415-loop-uevent-changed-v3-1-60ff69ac6088@linutronix.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3be7f00e7fc7..e9ec7a45f3f2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -662,12 +662,12 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, * dependency. */ fput(old_file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); if (partscan) loop_reread_partitions(lo); error = 0; done: - dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); return error; @@ -675,6 +675,7 @@ out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); goto done; } -- cgit v1.2.3 From f2fed441c69b9237760840a45a004730ff324faf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Apr 2025 15:09:40 +0200 Subject: loop: stop using vfs_iter_{read,write} for buffered I/O vfs_iter_{read,write} always perform direct I/O when the file has the O_DIRECT flag set, which breaks disabling direct I/O using the LOOP_SET_STATUS / LOOP_SET_STATUS64 ioctls. This was recenly reported as a regression, but as far as I can tell was only uncovered by better checking for block sizes and has been around since the direct I/O support was added. Fix this by using the existing aio code that calls the raw read/write iter methods instead. Note that despite the comments there is no need for block drivers to ever call flush_dcache_page themselves, and the call is a left-over from prehistoric times. Fixes: ab1cb278bc70 ("block: loop: introduce ioctl command of LOOP_SET_DIRECT_IO") Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Tested-by: Darrick J. Wong Link: https://lore.kernel.org/r/20250409130940.3685677-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 112 ++++++++------------------------------------------- 1 file changed, 17 insertions(+), 95 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e9ec7a45f3f2..46cba261075f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -211,72 +211,6 @@ static void loop_set_size(struct loop_device *lo, loff_t size) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } -static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) -{ - struct iov_iter i; - ssize_t bw; - - iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); - - bw = vfs_iter_write(file, &i, ppos, 0); - - if (likely(bw == bvec->bv_len)) - return 0; - - printk_ratelimited(KERN_ERR - "loop: Write error at byte offset %llu, length %i.\n", - (unsigned long long)*ppos, bvec->bv_len); - if (bw >= 0) - bw = -EIO; - return bw; -} - -static int lo_write_simple(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec; - struct req_iterator iter; - int ret = 0; - - rq_for_each_segment(bvec, rq, iter) { - ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); - if (ret < 0) - break; - cond_resched(); - } - - return ret; -} - -static int lo_read_simple(struct loop_device *lo, struct request *rq, - loff_t pos) -{ - struct bio_vec bvec; - struct req_iterator iter; - struct iov_iter i; - ssize_t len; - - rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len); - len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); - if (len < 0) - return len; - - flush_dcache_page(bvec.bv_page); - - if (len != bvec.bv_len) { - struct bio *bio; - - __rq_for_each_bio(bio, rq) - zero_fill_bio(bio); - break; - } - cond_resched(); - } - - return 0; -} - static void loop_clear_limits(struct loop_device *lo, int mode) { struct queue_limits lim = queue_limits_start_update(lo->lo_queue); @@ -342,7 +276,7 @@ static void lo_complete_rq(struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); blk_status_t ret = BLK_STS_OK; - if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || + if (cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) ret = errno_to_blk_status(cmd->ret); @@ -358,14 +292,13 @@ static void lo_complete_rq(struct request *rq) cmd->ret = 0; blk_mq_requeue_request(rq, true); } else { - if (cmd->use_aio) { - struct bio *bio = rq->bio; + struct bio *bio = rq->bio; - while (bio) { - zero_fill_bio(bio); - bio = bio->bi_next; - } + while (bio) { + zero_fill_bio(bio); + bio = bio->bi_next; } + ret = BLK_STS_IOERR; end_io: blk_mq_end_request(rq, ret); @@ -445,9 +378,14 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; cmd->iocb.ki_ioprio = req_get_ioprio(rq); + if (cmd->use_aio) { + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + } else { + cmd->iocb.ki_complete = NULL; + cmd->iocb.ki_flags = 0; + } if (rw == ITER_SOURCE) ret = file->f_op->write_iter(&cmd->iocb, &iter); @@ -458,7 +396,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); - return 0; + return -EIOCBQUEUED; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) @@ -466,15 +404,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - /* - * lo_write_simple and lo_read_simple should have been covered - * by io submit style function like lo_rw_aio(), one blocker - * is that lo_read_simple() need to call flush_dcache_page after - * the page is written from kernel, and it isn't easy to handle - * this in io submit style function which submits all segments - * of the req at one time. And direct read IO doesn't need to - * run flush_dcache_page(). - */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); @@ -490,15 +419,9 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); - else - return lo_write_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_SOURCE); case REQ_OP_READ: - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, ITER_DEST); - else - return lo_read_simple(lo, rq, pos); + return lo_rw_aio(lo, cmd, pos, ITER_DEST); default: WARN_ON_ONCE(1); return -EIO; @@ -1922,7 +1845,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) struct loop_device *lo = rq->q->queuedata; int ret = 0; struct mem_cgroup *old_memcg = NULL; - const bool use_aio = cmd->use_aio; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; @@ -1952,7 +1874,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } failed: /* complete non-aio request */ - if (!use_aio || ret) { + if (ret != -EIOCBQUEUED) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else -- cgit v1.2.3 From 0b7a4817756c7906d0a8112c953ce88d7cd8d4c6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 15 Apr 2025 18:41:10 -0600 Subject: ublk: don't suggest CONFIG_BLK_DEV_UBLK=Y The CONFIG_BLK_DEV_UBLK help text suggests setting the config option to Y so task_work_add() can be used to dispatch I/O, improving performance. However, this mechanism was removed in commit 29dc5d06613f2 ("ublk: kill queuing request by task_work_add"). So remove this paragraph from the config help text. Signed-off-by: Caleb Sander Mateos Reviewed-by: Uday Shankar Link: https://lore.kernel.org/r/20250416004111.3242817-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a97f2c40c640..9e129fec4bb7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -388,12 +388,6 @@ config BLK_DEV_UBLK definition isn't finalized yet, and might change according to future requirement, so mark is as experimental now. - Say Y if you want to get better performance because task_work_add() - can be used in IO path for replacing io_uring cmd, which will become - shared between IO tasks and ubq daemon, meantime task_work_add() can - can handle batch more effectively, but task_work_add() isn't exported - for module, so ublk has to be built to kernel. - config BLKDEV_UBLK_LEGACY_OPCODES bool "Support legacy command opcode" depends on BLK_DEV_UBLK -- cgit v1.2.3 From b69b8edfb27dfa563cd53f590ec42b481f9eb174 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Wed, 16 Apr 2025 11:54:35 +0800 Subject: ublk: properly serialize all FETCH_REQs Most uring_cmds issued against ublk character devices are serialized because each command affects only one queue, and there is an early check which only allows a single task (the queue's ubq_daemon) to issue uring_cmds against that queue. However, this mechanism does not work for FETCH_REQs, since they are expected before ubq_daemon is set. Since FETCH_REQs are only used at initialization and not in the fast path, serialize them using the per-ublk-device mutex. This fixes a number of data races that were previously possible if a badly behaved ublk server decided to issue multiple FETCH_REQs against the same qid/tag concurrently. Reported-by: Caleb Sander Mateos Signed-off-by: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 77 ++++++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 28 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cdb1543fa4a9..bf47f9cb8329 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1832,8 +1832,8 @@ static void ublk_nosrv_work(struct work_struct *work) /* device can only be started after all IOs are ready */ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) + __must_hold(&ub->mutex) { - mutex_lock(&ub->mutex); ubq->nr_io_ready++; if (ublk_queue_ready(ubq)) { ubq->ubq_daemon = current; @@ -1845,7 +1845,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) } if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) complete_all(&ub->completion); - mutex_unlock(&ub->mutex); } static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, @@ -1929,6 +1928,52 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, return io_buffer_unregister_bvec(cmd, index, issue_flags); } +static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, + struct ublk_io *io, __u64 buf_addr) +{ + struct ublk_device *ub = ubq->dev; + int ret = 0; + + /* + * When handling FETCH command for setting up ublk uring queue, + * ub->mutex is the innermost lock, and we won't block for handling + * FETCH, so it is fine even for IO_URING_F_NONBLOCK. + */ + mutex_lock(&ub->mutex); + /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ + if (ublk_queue_ready(ubq)) { + ret = -EBUSY; + goto out; + } + + /* allow each command to be FETCHed at most once */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + ret = -EINVAL; + goto out; + } + + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); + + if (ublk_need_map_io(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!buf_addr && !ublk_need_get_data(ubq)) + goto out; + } else if (buf_addr) { + /* User copy requires addr to be unset */ + ret = -EINVAL; + goto out; + } + + ublk_fill_io_cmd(io, cmd, buf_addr); + ublk_mark_io_ready(ub, ubq); +out: + mutex_unlock(&ub->mutex); + return ret; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1985,33 +2030,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, case UBLK_IO_UNREGISTER_IO_BUF: return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); case UBLK_IO_FETCH_REQ: - /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ - if (ublk_queue_ready(ubq)) { - ret = -EBUSY; - goto out; - } - /* - * The io is being handled by server, so COMMIT_RQ is expected - * instead of FETCH_REQ - */ - if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) - goto out; - - if (ublk_need_map_io(ubq)) { - /* - * FETCH_RQ has to provide IO buffer if NEED GET - * DATA is not enabled - */ - if (!ub_cmd->addr && !ublk_need_get_data(ubq)) - goto out; - } else if (ub_cmd->addr) { - /* User copy requires addr to be unset */ - ret = -EINVAL; + ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + if (ret) goto out; - } - - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); -- cgit v1.2.3 From 00b3b0d7cb454d614117c93f33351cdcd20b5b93 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Apr 2025 11:54:36 +0800 Subject: ublk: add ublk_force_abort_dev() Add ublk_force_abort_dev() for handling ublk_nosrv_dev_should_queue_io() in ublk_stop_dev(). Then queue quiesce and unquiesce can be paired in single function. Meantime not change device state to QUIESCED any more, since the disk is going to be removed soon. Reviewed-by: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bf47f9cb8329..e1b4db2f8a56 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1743,22 +1743,20 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) ub->dev_info.state = UBLK_S_DEV_QUIESCED; } -static void ublk_unquiesce_dev(struct ublk_device *ub) +static void ublk_force_abort_dev(struct ublk_device *ub) { int i; - pr_devel("%s: unquiesce ub: dev_id %d state %s\n", + pr_devel("%s: force abort ub: dev_id %d state %s\n", __func__, ub->dev_info.dev_id, ub->dev_info.state == UBLK_S_DEV_LIVE ? "LIVE" : "QUIESCED"); - /* quiesce_work has run. We let requeued rqs be aborted - * before running fallback_wq. "force_abort" must be seen - * after request queue is unqiuesced. Then del_gendisk() - * can move on. - */ + blk_mq_quiesce_queue(ub->ub_disk->queue); + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + ublk_wait_tagset_rqs_idle(ub); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_get_queue(ub, i)->force_abort = true; - blk_mq_unquiesce_queue(ub->ub_disk->queue); /* We may have requeued some rqs in ublk_quiesce_queue() */ blk_mq_kick_requeue_list(ub->ub_disk->queue); @@ -1786,11 +1784,8 @@ static void ublk_stop_dev(struct ublk_device *ub) mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_DEAD) goto unlock; - if (ublk_nosrv_dev_should_queue_io(ub)) { - if (ub->dev_info.state == UBLK_S_DEV_LIVE) - __ublk_quiesce_dev(ub); - ublk_unquiesce_dev(ub); - } + if (ublk_nosrv_dev_should_queue_io(ub)) + ublk_force_abort_dev(ub); del_gendisk(ub->ub_disk); disk = ublk_detach_disk(ub); put_disk(disk); -- cgit v1.2.3 From 7e26cb69c5e62152a6f05a2ae23605a983a8ef31 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Apr 2025 11:54:37 +0800 Subject: ublk: rely on ->canceling for dealing with ublk_nosrv_dev_should_queue_io Now ublk deals with ublk_nosrv_dev_should_queue_io() by keeping request queue as quiesced. This way is fragile because queue quiesce crosses syscalls or process contexts. Switch to rely on ubq->canceling for dealing with ublk_nosrv_dev_should_queue_io(), because it has been used for this purpose during io_uring context exiting, and it can be reused before recovering too. In ublk_queue_rq(), the request will be added to requeue list without kicking off requeue in case of ubq->canceling, and finally requests added in requeue list will be dispatched from either ublk_stop_dev() or ublk_ctrl_end_recovery(). Meantime we have to move reset of ubq->canceling from ublk_ctrl_start_recovery() to ublk_ctrl_end_recovery(), when IO handling can be recovered completely. Then blk_mq_quiesce_queue() and blk_mq_unquiesce_queue() are always used in same context. Signed-off-by: Ming Lei Reviewed-by: Uday Shankar Link: https://lore.kernel.org/r/20250416035444.99569-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e1b4db2f8a56..a479969fd77e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1734,13 +1734,19 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) static void __ublk_quiesce_dev(struct ublk_device *ub) { + int i; + pr_devel("%s: quiesce ub: dev_id %d state %s\n", __func__, ub->dev_info.dev_id, ub->dev_info.state == UBLK_S_DEV_LIVE ? "LIVE" : "QUIESCED"); blk_mq_quiesce_queue(ub->ub_disk->queue); + /* mark every queue as canceling */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->canceling = true; ublk_wait_tagset_rqs_idle(ub); ub->dev_info.state = UBLK_S_DEV_QUIESCED; + blk_mq_unquiesce_queue(ub->ub_disk->queue); } static void ublk_force_abort_dev(struct ublk_device *ub) @@ -2961,7 +2967,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; ubq->timeout = false; - ubq->canceling = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -3048,20 +3053,18 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); - if (ublk_nosrv_dev_should_queue_io(ub)) { - ub->dev_info.state = UBLK_S_DEV_LIVE; - blk_mq_unquiesce_queue(ub->ub_disk->queue); - pr_devel("%s: queue unquiesced, dev id %d.\n", - __func__, header->dev_id); - blk_mq_kick_requeue_list(ub->ub_disk->queue); - } else { - blk_mq_quiesce_queue(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_LIVE; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - ublk_get_queue(ub, i)->fail_io = false; - } - blk_mq_unquiesce_queue(ub->ub_disk->queue); + blk_mq_quiesce_queue(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + ubq->canceling = false; + ubq->fail_io = false; } + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); ret = 0; out_unlock: -- cgit v1.2.3 From 728cbac5fe219d3b8a21a0688a08f2b7f8aeda2b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Apr 2025 11:54:38 +0800 Subject: ublk: move device reset into ublk_ch_release() ublk_ch_release() is called after ublk char device is closed, when all uring_cmd are done, so it is perfect fine to move ublk device reset to ublk_ch_release() from ublk_ctrl_start_recovery(). This way can avoid to grab the exiting daemon task_struct too long. However, reset of the following ublk IO flags has to be moved until ublk io_uring queues are ready: - ubq->canceling For requeuing IO in case of ublk_nosrv_dev_should_queue_io() before device is recovered - ubq->fail_io For failing IO in case of UBLK_F_USER_RECOVERY_FAIL_IO before device is recovered - ublk_io->flags For preventing using io->cmd With this way, recovery is simplified a lot. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 121 ++++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 49 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a479969fd77e..1fe39cf85b2f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1074,7 +1074,7 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) { - return ubq->ubq_daemon->flags & PF_EXITING; + return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING; } /* todo: handle partial completion */ @@ -1470,6 +1470,37 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + /* All old ioucmds have to be completed */ + ubq->nr_io_ready = 0; + + /* + * old daemon is PF_EXITING, put it now + * + * It could be NULL in case of closing one quisced device. + */ + if (ubq->ubq_daemon) + put_task_struct(ubq->ubq_daemon); + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ + ubq->ubq_daemon = NULL; + ubq->timeout = false; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* + * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch + * io->cmd + */ + io->flags &= UBLK_IO_FLAG_CANCELED; + io->cmd = NULL; + io->addr = 0; + } +} + static int ublk_ch_open(struct inode *inode, struct file *filp) { struct ublk_device *ub = container_of(inode->i_cdev, @@ -1481,10 +1512,26 @@ static int ublk_ch_open(struct inode *inode, struct file *filp) return 0; } +static void ublk_reset_ch_dev(struct ublk_device *ub) +{ + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + ub->mm = NULL; + ub->nr_queues_ready = 0; + ub->nr_privileged_daemon = 0; +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; + /* all uring_cmd has been done now, reset device & ubq */ + ublk_reset_ch_dev(ub); + clear_bit(UB_STATE_OPEN, &ub->state); return 0; } @@ -1831,6 +1878,24 @@ static void ublk_nosrv_work(struct work_struct *work) ublk_cancel_dev(ub); } +/* reset ublk io_uring queue & io flags */ +static void ublk_reset_io_flags(struct ublk_device *ub) +{ + int i, j; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + for (j = 0; j < ubq->q_depth; j++) + ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + ubq->canceling = false; + ubq->fail_io = false; + } +} + /* device can only be started after all IOs are ready */ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) __must_hold(&ub->mutex) @@ -1844,8 +1909,12 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) if (capable(CAP_SYS_ADMIN)) ub->nr_privileged_daemon++; } - if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) + + if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { + /* now we are ready for handling ublk io request */ + ublk_reset_io_flags(ub); complete_all(&ub->completion); + } } static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, @@ -2954,41 +3023,14 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, return ret; } -static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) -{ - int i; - - WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); - - /* All old ioucmds have to be completed */ - ubq->nr_io_ready = 0; - /* old daemon is PF_EXITING, put it now */ - put_task_struct(ubq->ubq_daemon); - /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ - ubq->ubq_daemon = NULL; - ubq->timeout = false; - - for (i = 0; i < ubq->q_depth; i++) { - struct ublk_io *io = &ubq->ios[i]; - - /* forget everything now and be ready for new FETCH_REQ */ - io->flags = 0; - io->cmd = NULL; - io->addr = 0; - } -} - static int ublk_ctrl_start_recovery(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) { int ret = -EINVAL; - int i; mutex_lock(&ub->mutex); if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; - if (!ub->nr_queues_ready) - goto out_unlock; /* * START_RECOVERY is only allowd after: * @@ -3012,12 +3054,6 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, goto out_unlock; } pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_queue_reinit(ub, ublk_get_queue(ub, i)); - /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ - ub->mm = NULL; - ub->nr_queues_ready = 0; - ub->nr_privileged_daemon = 0; init_completion(&ub->completion); ret = 0; out_unlock: @@ -3030,7 +3066,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, { int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; - int i; pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); @@ -3050,22 +3085,10 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, goto out_unlock; } ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->dev_info.state = UBLK_S_DEV_LIVE; pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); - - blk_mq_quiesce_queue(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_LIVE; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - ubq->canceling = false; - ubq->fail_io = false; - } - blk_mq_unquiesce_queue(ub->ub_disk->queue); - pr_devel("%s: queue unquiesced, dev id %d.\n", - __func__, header->dev_id); blk_mq_kick_requeue_list(ub->ub_disk->queue); - ret = 0; out_unlock: mutex_unlock(&ub->mutex); -- cgit v1.2.3 From 82a8a30c581bbbe653d33c6ce2ef67e3072c7f12 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Wed, 16 Apr 2025 11:54:39 +0800 Subject: ublk: improve detection and handling of ublk server exit There are currently two ways in which ublk server exit is detected by ublk_drv: 1. uring_cmd cancellation. If there are any outstanding uring_cmds which have not been completed to the ublk server when it exits, io_uring calls the uring_cmd callback with a special cancellation flag as the issuing task is exiting. 2. I/O timeout. This is needed in addition to the above to handle the "saturated queue" case, when all I/Os for a given queue are in the ublk server, and therefore there are no outstanding uring_cmds to cancel when the ublk server exits. There are a couple of issues with this approach: - It is complex and inelegant to have two methods to detect the same condition - The second method detects ublk server exit only after a long delay (~30s, the default timeout assigned by the block layer). This delays the nosrv behavior from kicking in and potential subsequent recovery of the device. The second issue is brought to light with the new test_generic_06 which will be added in following patch. It fails before this fix: selftests: ublk: test_generic_06.sh dev id is 0 dd: error writing '/dev/ublkb0': Input/output error 1+0 records in 0+0 records out 0 bytes copied, 30.0611 s, 0.0 kB/s DEAD dd took 31 seconds to exit (>= 5s tolerance)! generic_06 : [FAIL] Fix this by instead detecting and handling ublk server exit in the character file release callback. This has several advantages: - This one place can handle both saturated and unsaturated queues. Thus, it replaces both preexisting methods of detecting ublk server exit. - It runs quickly on ublk server exit - there is no 30s delay. - It starts the process of removing task references in ublk_drv. This is needed if we want to relax restrictions in the driver like letting only one thread serve each queue There is also the disadvantage that the character file release callback can also be triggered by intentional close of the file, which is a significant behavior change. Preexisting ublk servers (libublksrv) are dependent on the ability to open/close the file multiple times. To address this, only transition to a nosrv state if the file is released while the ublk device is live. This allows for programs to open/close the file multiple times during setup. It is still a behavior change if a ublk server decides to close/reopen the file while the device is LIVE (i.e. while it is responsible for serving I/O), but that would be highly unusual. This behavior is in line with what is done by FUSE, which is very similar to ublk in that a userspace daemon is providing services traditionally provided by the kernel. With this change in, the new test (and all other selftests, and all ublksrv tests) pass: selftests: ublk: test_generic_06.sh dev id is 0 dd: error writing '/dev/ublkb0': Input/output error 1+0 records in 0+0 records out 0 bytes copied, 0.0376731 s, 0.0 kB/s DEAD generic_04 : [PASS] Signed-off-by: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 223 ++++++++++++++++++++++++++--------------------- 1 file changed, 123 insertions(+), 100 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1fe39cf85b2f..b0a7e5acb2eb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -199,8 +199,6 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; unsigned int nr_privileged_daemon; - - struct work_struct nosrv_work; }; /* header of ublk_params */ @@ -209,8 +207,9 @@ struct ublk_params_header { __u32 types; }; -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); - +static void ublk_stop_dev_unlocked(struct ublk_device *ub); +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); +static void __ublk_quiesce_dev(struct ublk_device *ub); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); @@ -1336,8 +1335,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; - unsigned int nr_inflight = 0; - int i; if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { if (!ubq->timeout) { @@ -1348,26 +1345,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_DONE; } - if (!ubq_daemon_is_dying(ubq)) - return BLK_EH_RESET_TIMER; - - for (i = 0; i < ubq->q_depth; i++) { - struct ublk_io *io = &ubq->ios[i]; - - if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) - nr_inflight++; - } - - /* cancelable uring_cmd can't help us if all commands are in-flight */ - if (nr_inflight == ubq->q_depth) { - struct ublk_device *ub = ubq->dev; - - if (ublk_abort_requests(ub, ubq)) { - schedule_work(&ub->nosrv_work); - } - return BLK_EH_DONE; - } - return BLK_EH_RESET_TIMER; } @@ -1525,13 +1502,105 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) ub->nr_privileged_daemon = 0; } +static struct gendisk *ublk_get_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + spin_lock(&ub->lock); + disk = ub->ub_disk; + if (disk) + get_device(disk_to_dev(disk)); + spin_unlock(&ub->lock); + + return disk; +} + +static void ublk_put_disk(struct gendisk *disk) +{ + if (disk) + put_device(disk_to_dev(disk)); +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; + struct gendisk *disk; + int i; + + /* + * disk isn't attached yet, either device isn't live, or it has + * been removed already, so we needn't to do anything + */ + disk = ublk_get_disk(ub); + if (!disk) + goto out; + + /* + * All uring_cmd are done now, so abort any request outstanding to + * the ublk server + * + * This can be done in lockless way because ublk server has been + * gone + * + * More importantly, we have to provide forward progress guarantee + * without holding ub->mutex, otherwise control task grabbing + * ub->mutex triggers deadlock + * + * All requests may be inflight, so ->canceling may not be set, set + * it now. + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + ubq->canceling = true; + ublk_abort_queue(ub, ubq); + } + blk_mq_kick_requeue_list(disk->queue); + + /* + * All infligh requests have been completed or requeued and any new + * request will be failed or requeued via `->canceling` now, so it is + * fine to grab ub->mutex now. + */ + mutex_lock(&ub->mutex); + + /* double check after grabbing lock */ + if (!ub->ub_disk) + goto unlock; + + /* + * Transition the device to the nosrv state. What exactly this + * means depends on the recovery flags + */ + blk_mq_quiesce_queue(disk->queue); + if (ublk_nosrv_should_stop_dev(ub)) { + /* + * Allow any pending/future I/O to pass through quickly + * with an error. This is needed because del_gendisk + * waits for all pending I/O to complete + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->force_abort = true; + blk_mq_unquiesce_queue(disk->queue); + + ublk_stop_dev_unlocked(ub); + } else { + if (ublk_nosrv_dev_should_queue_io(ub)) { + __ublk_quiesce_dev(ub); + } else { + ub->dev_info.state = UBLK_S_DEV_FAIL_IO; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->fail_io = true; + } + blk_mq_unquiesce_queue(disk->queue); + } +unlock: + mutex_unlock(&ub->mutex); + ublk_put_disk(disk); /* all uring_cmd has been done now, reset device & ubq */ ublk_reset_ch_dev(ub); - +out: clear_bit(UB_STATE_OPEN, &ub->state); return 0; } @@ -1627,37 +1696,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) } /* Must be called when queue is frozen */ -static bool ublk_mark_queue_canceling(struct ublk_queue *ubq) +static void ublk_mark_queue_canceling(struct ublk_queue *ubq) { - bool canceled; - spin_lock(&ubq->cancel_lock); - canceled = ubq->canceling; - if (!canceled) + if (!ubq->canceling) ubq->canceling = true; spin_unlock(&ubq->cancel_lock); - - return canceled; } -static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) +static void ublk_start_cancel(struct ublk_queue *ubq) { - bool was_canceled = ubq->canceling; - struct gendisk *disk; - - if (was_canceled) - return false; - - spin_lock(&ub->lock); - disk = ub->ub_disk; - if (disk) - get_device(disk_to_dev(disk)); - spin_unlock(&ub->lock); + struct ublk_device *ub = ubq->dev; + struct gendisk *disk = ublk_get_disk(ub); /* Our disk has been dead */ if (!disk) - return false; - + return; /* * Now we are serialized with ublk_queue_rq() * @@ -1666,15 +1720,9 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) * touch completed uring_cmd */ blk_mq_quiesce_queue(disk->queue); - was_canceled = ublk_mark_queue_canceling(ubq); - if (!was_canceled) { - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); - } + ublk_mark_queue_canceling(ubq); blk_mq_unquiesce_queue(disk->queue); - put_device(disk_to_dev(disk)); - - return !was_canceled; + ublk_put_disk(disk); } static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, @@ -1698,6 +1746,17 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, /* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live + * + * Two-stage cancel: + * + * - make every active uring_cmd done in ->cancel_fn() + * + * - aborting inflight ublk IO requests in ublk char device release handler, + * which depends on 1st stage because device can only be closed iff all + * uring_cmd are done + * + * Do _not_ try to acquire ub->mutex before all inflight requests are + * aborted, otherwise deadlock may be caused. */ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -1705,8 +1764,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; - struct ublk_device *ub; - bool need_schedule; struct ublk_io *io; if (WARN_ON_ONCE(!ubq)) @@ -1719,16 +1776,12 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) return; - ub = ubq->dev; - need_schedule = ublk_abort_requests(ub, ubq); + if (!ubq->canceling) + ublk_start_cancel(ubq); io = &ubq->ios[pdu->tag]; WARN_ON_ONCE(io->cmd != cmd); ublk_cancel_cmd(ubq, io, issue_flags); - - if (need_schedule) { - schedule_work(&ub->nosrv_work); - } } static inline bool ublk_queue_ready(struct ublk_queue *ubq) @@ -1787,13 +1840,11 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) __func__, ub->dev_info.dev_id, ub->dev_info.state == UBLK_S_DEV_LIVE ? "LIVE" : "QUIESCED"); - blk_mq_quiesce_queue(ub->ub_disk->queue); /* mark every queue as canceling */ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_get_queue(ub, i)->canceling = true; ublk_wait_tagset_rqs_idle(ub); ub->dev_info.state = UBLK_S_DEV_QUIESCED; - blk_mq_unquiesce_queue(ub->ub_disk->queue); } static void ublk_force_abort_dev(struct ublk_device *ub) @@ -1830,50 +1881,25 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub) return disk; } -static void ublk_stop_dev(struct ublk_device *ub) +static void ublk_stop_dev_unlocked(struct ublk_device *ub) + __must_hold(&ub->mutex) { struct gendisk *disk; - mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_DEAD) - goto unlock; + return; + if (ublk_nosrv_dev_should_queue_io(ub)) ublk_force_abort_dev(ub); del_gendisk(ub->ub_disk); disk = ublk_detach_disk(ub); put_disk(disk); - unlock: - mutex_unlock(&ub->mutex); - ublk_cancel_dev(ub); } -static void ublk_nosrv_work(struct work_struct *work) +static void ublk_stop_dev(struct ublk_device *ub) { - struct ublk_device *ub = - container_of(work, struct ublk_device, nosrv_work); - int i; - - if (ublk_nosrv_should_stop_dev(ub)) { - ublk_stop_dev(ub); - return; - } - mutex_lock(&ub->mutex); - if (ub->dev_info.state != UBLK_S_DEV_LIVE) - goto unlock; - - if (ublk_nosrv_dev_should_queue_io(ub)) { - __ublk_quiesce_dev(ub); - } else { - blk_mq_quiesce_queue(ub->ub_disk->queue); - ub->dev_info.state = UBLK_S_DEV_FAIL_IO; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - ublk_get_queue(ub, i)->fail_io = true; - } - blk_mq_unquiesce_queue(ub->ub_disk->queue); - } - - unlock: + ublk_stop_dev_unlocked(ub); mutex_unlock(&ub->mutex); ublk_cancel_dev(ub); } @@ -2502,7 +2528,6 @@ static void ublk_remove(struct ublk_device *ub) bool unprivileged; ublk_stop_dev(ub); - cancel_work_sync(&ub->nosrv_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; ublk_put_device(ub); @@ -2787,7 +2812,6 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); - INIT_WORK(&ub->nosrv_work, ublk_nosrv_work); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2919,7 +2943,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) static int ublk_ctrl_stop_dev(struct ublk_device *ub) { ublk_stop_dev(ub); - cancel_work_sync(&ub->nosrv_work); return 0; } -- cgit v1.2.3 From 736b005b413a172670711ee17cab3c8ccab83223 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Apr 2025 11:54:40 +0800 Subject: ublk: remove __ublk_quiesce_dev() Remove __ublk_quiesce_dev() and open code for updating device state as QUIESCED. We needn't to drain inflight requests in __ublk_quiesce_dev() any more, because all inflight requests are aborted in ublk char device release handler. Also we needn't to set ->canceling in __ublk_quiesce_dev() any more because it is done unconditionally now in ublk_ch_release(). Reviewed-by: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b0a7e5acb2eb..bf708e9e9a12 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -209,7 +209,6 @@ struct ublk_params_header { static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); -static void __ublk_quiesce_dev(struct ublk_device *ub); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); @@ -1586,7 +1585,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) ublk_stop_dev_unlocked(ub); } else { if (ublk_nosrv_dev_should_queue_io(ub)) { - __ublk_quiesce_dev(ub); + /* ->canceling is set and all requests are aborted */ + ub->dev_info.state = UBLK_S_DEV_QUIESCED; } else { ub->dev_info.state = UBLK_S_DEV_FAIL_IO; for (i = 0; i < ub->dev_info.nr_hw_queues; i++) @@ -1832,21 +1832,6 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) } } -static void __ublk_quiesce_dev(struct ublk_device *ub) -{ - int i; - - pr_devel("%s: quiesce ub: dev_id %d state %s\n", - __func__, ub->dev_info.dev_id, - ub->dev_info.state == UBLK_S_DEV_LIVE ? - "LIVE" : "QUIESCED"); - /* mark every queue as canceling */ - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->canceling = true; - ublk_wait_tagset_rqs_idle(ub); - ub->dev_info.state = UBLK_S_DEV_QUIESCED; -} - static void ublk_force_abort_dev(struct ublk_device *ub) { int i; -- cgit v1.2.3 From e63d2228ef831af36f963b3ab8604160cfff84c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Apr 2025 11:54:41 +0800 Subject: ublk: simplify aborting ublk request Now ublk_abort_queue() is moved to ublk char device release handler, meantime our request queue is "quiesced" because either ->canceling was set from uring_cmd cancel function or all IOs are inflight and can't be completed by ublk server, things becomes easy much: - all uring_cmd are done, so we needn't to mark io as UBLK_IO_FLAG_ABORTED for handling completion from uring_cmd - ublk char device is closed, no one can hold IO request reference any more, so we can simply complete this request or requeue it for ublk_nosrv_should_reissue_outstanding. Reviewed-by: Uday Shankar Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250416035444.99569-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 82 ++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 62 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bf708e9e9a12..2de7b2bd409d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -122,15 +122,6 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 -/* - * IO command is aborted, so this flag is set in case of - * !UBLK_IO_FLAG_ACTIVE. - * - * After this flag is observed, any pending or new incoming request - * associated with this io command will be failed immediately - */ -#define UBLK_IO_FLAG_ABORTED 0x04 - /* * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires * get data buffer address from ublksrv. @@ -1083,12 +1074,6 @@ static inline void __ublk_complete_rq(struct request *req) unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; - /* called from ublk_abort_queue() code path */ - if (io->flags & UBLK_IO_FLAG_ABORTED) { - res = BLK_STS_IOERR; - goto exit; - } - /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; @@ -1138,47 +1123,6 @@ static void ublk_complete_rq(struct kref *ref) __ublk_complete_rq(req); } -static void ublk_do_fail_rq(struct request *req) -{ - struct ublk_queue *ubq = req->mq_hctx->driver_data; - - if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) - blk_mq_requeue_request(req, false); - else - __ublk_complete_rq(req); -} - -static void ublk_fail_rq_fn(struct kref *ref) -{ - struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, - ref); - struct request *req = blk_mq_rq_from_pdu(data); - - ublk_do_fail_rq(req); -} - -/* - * Since ublk_rq_task_work_cb always fails requests immediately during - * exiting, __ublk_fail_req() is only called from abort context during - * exiting. So lock is unnecessary. - * - * Also aborting may not be started yet, keep in mind that one failed - * request may be issued by block layer again. - */ -static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, - struct request *req) -{ - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - kref_put(&data->ref, ublk_fail_rq_fn); - } else { - ublk_do_fail_rq(req); - } -} - static void ubq_complete_io_cmd(struct ublk_io *io, int res, unsigned issue_flags) { @@ -1667,10 +1611,26 @@ static void ublk_commit_completion(struct ublk_device *ub, ublk_put_req_ref(ubq, req); } +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) +{ + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + + if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) + blk_mq_requeue_request(req, false); + else { + io->res = -EIO; + __ublk_complete_rq(req); + } +} + /* - * Called from ubq_daemon context via cancel fn, meantime quiesce ublk - * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon - * context, so everything is serialized. + * Called from ublk char device release handler, when any uring_cmd is + * done, meantime request queue is "quiesced" since all inflight requests + * can't be completed because ublk server is dead. + * + * So no one can hold our request IO reference any more, simply ignore the + * reference, and complete the request immediately */ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) { @@ -1687,10 +1647,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) * will do it */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); - if (rq && blk_mq_request_started(rq)) { - io->flags |= UBLK_IO_FLAG_ABORTED; + if (rq && blk_mq_request_started(rq)) __ublk_fail_req(ubq, io, rq); - } } } } -- cgit v1.2.3 From d6aa0c178bf81f30ae4a780b2bca653daa2eb633 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Apr 2025 09:37:39 +0800 Subject: ublk: call ublk_dispatch_req() for handling UBLK_U_IO_NEED_GET_DATA We call io_uring_cmd_complete_in_task() to schedule task_work for handling UBLK_U_IO_NEED_GET_DATA. This way is really not necessary because the current context is exactly the ublk queue context, so call ublk_dispatch_req() directly for handling UBLK_U_IO_NEED_GET_DATA. Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Tested-by: Jared Holzman Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250425013742.1079549-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2de7b2bd409d..c4d4be4f6fbd 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1886,15 +1886,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) } } -static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, - int tag) -{ - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - - ublk_queue_cmd(ubq, req); -} - static inline int ublk_check_cmd_op(u32 cmd_op) { u32 ioc_type = _IOC_TYPE(cmd_op); @@ -2103,8 +2094,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); - break; + req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); + ublk_dispatch_req(ubq, req, issue_flags); + return -EIOCBQUEUED; default: goto out; } -- cgit v1.2.3 From f40139fde5278d81af3227444fd6e76a76b9506d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Apr 2025 09:37:40 +0800 Subject: ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd ublk_cancel_cmd() calls io_uring_cmd_done() to complete uring_cmd, but we may have scheduled task work via io_uring_cmd_complete_in_task() for dispatching request, then kernel crash can be triggered. Fix it by not trying to canceling the command if ublk block request is started. Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Reported-by: Jared Holzman Tested-by: Jared Holzman Closes: https://lore.kernel.org/linux-block/d2179120-171b-47ba-b664-23242981ef19@nvidia.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250425013742.1079549-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c4d4be4f6fbd..40f971a66d3e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1683,14 +1683,31 @@ static void ublk_start_cancel(struct ublk_queue *ubq) ublk_put_disk(disk); } -static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, +static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, unsigned int issue_flags) { + struct ublk_io *io = &ubq->ios[tag]; + struct ublk_device *ub = ubq->dev; + struct request *req; bool done; if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) return; + /* + * Don't try to cancel this command if the request is started for + * avoiding race between io_uring_cmd_done() and + * io_uring_cmd_complete_in_task(). + * + * Either the started request will be aborted via __ublk_abort_rq(), + * then this uring_cmd is canceled next time, or it will be done in + * task work function ublk_dispatch_req() because io_uring guarantees + * that ublk_dispatch_req() is always called + */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (req && blk_mq_request_started(req)) + return; + spin_lock(&ubq->cancel_lock); done = !!(io->flags & UBLK_IO_FLAG_CANCELED); if (!done) @@ -1722,7 +1739,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; - struct ublk_io *io; if (WARN_ON_ONCE(!ubq)) return; @@ -1737,9 +1753,8 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, if (!ubq->canceling) ublk_start_cancel(ubq); - io = &ubq->ios[pdu->tag]; - WARN_ON_ONCE(io->cmd != cmd); - ublk_cancel_cmd(ubq, io, issue_flags); + WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd); + ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } static inline bool ublk_queue_ready(struct ublk_queue *ubq) @@ -1752,7 +1767,7 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) int i; for (i = 0; i < ubq->q_depth; i++) - ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED); + ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } /* Cancel all pending commands, must be called after del_gendisk() returns */ -- cgit v1.2.3 From 69edf98be844375807f299397c516fb1e962b3cc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 29 Apr 2025 10:29:37 +0800 Subject: ublk: decouple zero copy from user copy UBLK_F_USER_COPY and UBLK_F_SUPPORT_ZERO_COPY are two different features, and shouldn't be coupled together. Commit 1f6540e2aabb ("ublk: zc register/unregister bvec") enables user copy automatically in case of UBLK_F_SUPPORT_ZERO_COPY, this way isn't correct. So decouple zero copy from user copy, and use independent helper to check each one. Fixes: 1f6540e2aabb ("ublk: zc register/unregister bvec") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250429022941.1718671-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 40f971a66d3e..0a3a3c64316d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -205,11 +205,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); -static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); -} - static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_ZONED; @@ -609,14 +604,19 @@ static void ublk_apply_params(struct ublk_device *ub) ublk_dev_param_zoned_apply(ub); } +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; +} + static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); + return ubq->flags & UBLK_F_USER_COPY; } static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { - return !ublk_support_user_copy(ubq); + return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -624,8 +624,11 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) /* * read()/write() is involved in user copy, so request reference * has to be grabbed + * + * for zero copy, request buffer need to be registered to io_uring + * buffer table, so reference is needed */ - return ublk_support_user_copy(ubq); + return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, @@ -2245,6 +2248,9 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, if (!ubq) return ERR_PTR(-EINVAL); + if (!ublk_support_user_copy(ubq)) + return ERR_PTR(-EACCES); + if (tag >= ubq->q_depth) return ERR_PTR(-EINVAL); @@ -2783,13 +2789,18 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK; - /* GET_DATA isn't needed any more with USER_COPY */ - if (ublk_dev_is_user_copy(ub)) + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ + if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; - /* Zoned storage support requires user copy feature */ + /* + * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for + * returning write_append_lba, which is only allowed in case of + * user copy or zero copy + */ if (ublk_dev_is_zoned(ub) && - (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags & + (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) { ret = -EINVAL; goto out_free_dev_number; } -- cgit v1.2.3 From 6240f43b29f285a40eebeb789756673af7a7d67c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 29 Apr 2025 10:29:38 +0800 Subject: ublk: enhance check for register/unregister io buffer command The simple check of UBLK_IO_FLAG_OWNED_BY_SRV can avoid incorrect register/unregister io buffer easily, so check it before calling starting to register/un-register io buffer. Also only allow io buffer register/unregister uring_cmd in case of UBLK_F_SUPPORT_ZERO_COPY. Also mark argument 'ublk_queue *' of ublk_register_io_buf as const. Reviewed-by: Caleb Sander Mateos Fixes: 1f6540e2aabb ("ublk: zc register/unregister bvec") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250429022941.1718671-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0a3a3c64316d..c624d8f653ae 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -201,7 +201,7 @@ struct ublk_params_header { static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - struct ublk_queue *ubq, int tag, size_t offset); + const struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); @@ -1949,13 +1949,20 @@ static void ublk_io_release(void *priv) } static int ublk_register_io_buf(struct io_uring_cmd *cmd, - struct ublk_queue *ubq, unsigned int tag, + const struct ublk_queue *ubq, unsigned int tag, unsigned int index, unsigned int issue_flags) { struct ublk_device *ub = cmd->file->private_data; + const struct ublk_io *io = &ubq->ios[tag]; struct request *req; int ret; + if (!ublk_support_zero_copy(ubq)) + return -EINVAL; + + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EINVAL; + req = __ublk_check_and_get_req(ub, ubq, tag, 0); if (!req) return -EINVAL; @@ -1971,8 +1978,17 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, } static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublk_queue *ubq, unsigned int tag, unsigned int index, unsigned int issue_flags) { + const struct ublk_io *io = &ubq->ios[tag]; + + if (!ublk_support_zero_copy(ubq)) + return -EINVAL; + + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EINVAL; + return io_buffer_unregister_bvec(cmd, index, issue_flags); } @@ -2076,7 +2092,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, case UBLK_IO_REGISTER_IO_BUF: return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); + return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); case UBLK_IO_FETCH_REQ: ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); if (ret) @@ -2128,7 +2144,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - struct ublk_queue *ubq, int tag, size_t offset) + const struct ublk_queue *ubq, int tag, size_t offset) { struct request *req; -- cgit v1.2.3 From a584b2630b0d31f8a20e4ccb4de370b160177b8a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 29 Apr 2025 10:29:39 +0800 Subject: ublk: remove the check of ublk_need_req_ref() from __ublk_check_and_get_req __ublk_check_and_get_req() is only called from ublk_check_and_get_req() and ublk_register_io_buf(), the same check has been covered in the two calling sites. So remove the check from __ublk_check_and_get_req(). Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250429022941.1718671-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c624d8f653ae..f9032076bc06 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2148,9 +2148,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, { struct request *req; - if (!ublk_need_req_ref(ubq)) - return NULL; - req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (!req) return NULL; -- cgit v1.2.3