diff options
-rw-r--r-- | block/genhd.c | 26 | ||||
-rw-r--r-- | drivers/block/ublk_drv.c | 49 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 87 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/ublk_cmd.h | 32 | ||||
-rwxr-xr-x | tools/testing/selftests/ublk/test_stress_03.sh | 5 |
7 files changed, 125 insertions, 79 deletions
diff --git a/block/genhd.c b/block/genhd.c index 8171a6bc3210..c26733f6324b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -128,23 +128,27 @@ static void part_stat_read_all(struct block_device *part, static void bdev_count_inflight_rw(struct block_device *part, unsigned int inflight[2], bool mq_driver) { + int write = 0; + int read = 0; int cpu; if (mq_driver) { blk_mq_in_driver_rw(part, inflight); - } else { - for_each_possible_cpu(cpu) { - inflight[READ] += part_stat_local_read_cpu( - part, in_flight[READ], cpu); - inflight[WRITE] += part_stat_local_read_cpu( - part, in_flight[WRITE], cpu); - } + return; + } + + for_each_possible_cpu(cpu) { + read += part_stat_local_read_cpu(part, in_flight[READ], cpu); + write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu); } - if (WARN_ON_ONCE((int)inflight[READ] < 0)) - inflight[READ] = 0; - if (WARN_ON_ONCE((int)inflight[WRITE] < 0)) - inflight[WRITE] = 0; + /* + * While iterating all CPUs, some IOs may be issued from a CPU already + * traversed and complete on a CPU that has not yet been traversed, + * causing the inflight number to be negative. + */ + inflight[READ] = read > 0 ? read : 0; + inflight[WRITE] = write > 0 ? write : 0; } /** diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d36f44f5ee80..c3e3c3b65a6d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1148,8 +1148,8 @@ exit: blk_mq_end_request(req, res); } -static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, - int res, unsigned issue_flags) +static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, + struct request *req) { /* read cmd first because req will overwrite it */ struct io_uring_cmd *cmd = io->cmd; @@ -1164,6 +1164,13 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, io->flags &= ~UBLK_IO_FLAG_ACTIVE; io->req = req; + return cmd; +} + +static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, + int res, unsigned issue_flags) +{ + struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); /* tell ublksrv one io request is coming */ io_uring_cmd_done(cmd, res, 0, issue_flags); @@ -1416,6 +1423,14 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, + const struct ublk_io *io2) +{ + return (io_uring_cmd_ctx_handle(io->cmd) == + io_uring_cmd_ctx_handle(io2->cmd)) && + (io->task == io2->task); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1427,7 +1442,8 @@ static void ublk_queue_rqs(struct rq_list *rqlist) struct ublk_queue *this_q = req->mq_hctx->driver_data; struct ublk_io *this_io = &this_q->ios[req->tag]; - if (io && io->task != this_io->task && !rq_list_empty(&submit_list)) + if (io && !ublk_belong_to_same_batch(io, this_io) && + !rq_list_empty(&submit_list)) ublk_queue_cmd_list(io, &submit_list); io = this_io; @@ -2148,10 +2164,9 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq, return 0; } -static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io) +static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { - struct request *req = io->req; - /* * We have handled UBLK_IO_NEED_GET_DATA command, * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just @@ -2178,6 +2193,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; int ret = -EINVAL; + struct request *req; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, @@ -2236,11 +2252,19 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, goto out; break; case UBLK_IO_NEED_GET_DATA: - io->addr = ub_cmd->addr; - if (!ublk_get_data(ubq, io)) - return -EIOCBQUEUED; - - return UBLK_IO_RES_OK; + /* + * ublk_get_data() may fail and fallback to requeue, so keep + * uring_cmd active first and prepare for handling new requeued + * request + */ + req = io->req; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + if (likely(ublk_get_data(ubq, io, req))) { + __ublk_prep_compl_io_cmd(io, req); + return UBLK_IO_RES_OK; + } + break; default: goto out; } @@ -2825,7 +2849,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES) + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) return -EINVAL; if (capable(CAP_SYS_ADMIN)) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 92697f98c601..e533d791955d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2015,21 +2015,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } -static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, - struct nvme_id_ns *id, struct queue_limits *lim, - u32 bs, u32 atomic_bs) +static u32 nvme_configure_atomic_write(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, u32 bs) { - unsigned int boundary = 0; + u32 atomic_bs, boundary = 0; - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { - if (le16_to_cpu(id->nabspf)) + /* + * We do not support an offset for the atomic boundaries. + */ + if (id->nabo) + return bs; + + if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) { + /* + * Use the per-namespace atomic write unit when available. + */ + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + if (id->nabspf) boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } else { + /* + * Use the controller wide atomic write unit. This sucks + * because the limit is defined in terms of logical blocks while + * namespaces can have different formats, and because there is + * no clear language in the specification prohibiting different + * values for different controllers in the subsystem. + */ + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } + lim->atomic_write_hw_max = atomic_bs; lim->atomic_write_hw_boundary = boundary; lim->atomic_write_hw_unit_min = bs; lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); lim->features |= BLK_FEAT_ATOMIC_WRITES; + return atomic_bs; } static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) @@ -2067,34 +2087,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, valid = false; } - atomic_bs = phys_bs = bs; - if (id->nabo == 0) { - /* - * Bit 1 indicates whether NAWUPF is defined for this namespace - * and whether it should be used instead of AWUPF. If NAWUPF == - * 0 then AWUPF must be used instead. - */ - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) - atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; - else - atomic_bs = (1 + ns->ctrl->awupf) * bs; - - /* - * Set subsystem atomic bs. - */ - if (ns->ctrl->subsys->atomic_bs) { - if (atomic_bs != ns->ctrl->subsys->atomic_bs) { - dev_err_ratelimited(ns->ctrl->device, - "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", - ns->disk ? ns->disk->disk_name : "?", - ns->ctrl->subsys->atomic_bs, - atomic_bs); - } - } else - ns->ctrl->subsys->atomic_bs = atomic_bs; - - nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); - } + phys_bs = bs; + atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ @@ -2382,16 +2376,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; - /* - * Validate the max atomic write size fits within the subsystem's - * atomic write capabilities. - */ - if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) { - blk_mq_unfreeze_queue(ns->disk->queue, memflags); - ret = -ENXIO; - goto out; - } - nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) @@ -3215,6 +3199,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->model, id->mn, sizeof(subsys->model)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); /* Versions prior to 1.4 don't necessarily report a valid type */ if (id->cntrltype == NVME_CTRL_DISC || @@ -3552,6 +3537,15 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret) goto out_free; } + + if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) { + dev_err_ratelimited(ctrl->device, + "inconsistent AWUPF, controller not added (%u/%u).\n", + le16_to_cpu(id->awupf), ctrl->subsys->awupf); + ret = -EINVAL; + goto out_free; + } + memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev)); @@ -3647,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); - ctrl->awupf = le16_to_cpu(id->awupf); out_free: kfree(id); return ret; @@ -4036,6 +4029,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) list_add_tail_rcu(&ns->siblings, &head->list); ns->head = head; mutex_unlock(&ctrl->subsys->lock); + +#ifdef CONFIG_NVME_MULTIPATH + cancel_delayed_work(&head->remove_work); +#endif return 0; out_put_ns_head: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e040e467f9fa..316a269842fa 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1311,7 +1311,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) */ if (!try_module_get(THIS_MODULE)) goto out; - queue_delayed_work(nvme_wq, &head->remove_work, + mod_delayed_work(nvme_wq, &head->remove_work, head->delayed_removal_secs * HZ); } else { list_del_init(&head->entry); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a468cdc5b5cb..7df2ea21851f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -410,7 +410,6 @@ struct nvme_ctrl { enum nvme_ctrl_type cntrltype; enum nvme_dctype dctype; - u16 awupf; /* 0's based value. */ }; static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) @@ -443,11 +442,11 @@ struct nvme_subsystem { u8 cmic; enum nvme_subsys_type subtype; u16 vendor_id; + u16 awupf; /* 0's based value. */ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; #endif - u32 atomic_bs; }; /* diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 77d9d6af46da..c9751bdfd937 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -135,8 +135,28 @@ #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) /* - * zero copy requires 4k block size, and can remap ublk driver's io - * request into ublksrv's vm space + * ublk server can register data buffers for incoming I/O requests with a sparse + * io_uring buffer table. The request buffer can then be used as the data buffer + * for io_uring operations via the fixed buffer index. + * Note that the ublk server can never directly access the request data memory. + * + * To use this feature, the ublk server must first register a sparse buffer + * table on an io_uring instance. + * When an incoming ublk request is received, the ublk server submits a + * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The + * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register + * and addr is the index in the io_uring's buffer table to install the buffer. + * SQEs can now be submitted to the io_uring to read/write the request's buffer + * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or + * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index. + * Once the last io_uring operation using the request's buffer has completed, + * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag, + * and addr again specifying the request buffer to unregister. + * The ublk request is completed when its buffer is unregistered from all + * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ. + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak + * uninitialized kernel memory by not reading into the full request buffer. */ #define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) @@ -450,10 +470,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg( __u64 sqe_addr) { struct ublk_auto_buf_reg reg = { - .index = sqe_addr & 0xffff, - .flags = (sqe_addr >> 16) & 0xff, - .reserved0 = (sqe_addr >> 24) & 0xff, - .reserved1 = sqe_addr >> 32, + .index = (__u16)sqe_addr, + .flags = (__u8)(sqe_addr >> 16), + .reserved0 = (__u8)(sqe_addr >> 24), + .reserved1 = (__u32)(sqe_addr >> 32), }; return reg; diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 6eef282d569f..3ed4c9b2d8c0 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -32,22 +32,23 @@ _create_backfile 2 128M ublk_io_and_remove 8G -t null -q 4 -z & ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait if _have_feature "AUTO_BUF_REG"; then ublk_io_and_remove 8G -t null -q 4 --auto_zc & ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback & + wait fi -wait if _have_feature "PER_IO_DAEMON"; then ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks & ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" & ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks & + wait fi -wait _cleanup_test "stress" _show_result $TID $ERR_CODE |