diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-23 23:06:15 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-23 23:06:15 +0300 |
commit | 9836e93c0a7e031ac6a71c56171c229de1eea7cf (patch) | |
tree | f53f3460e86752c50aac9ee16b4426c84d277899 | |
parent | e1a8fde7203fa8a3e3f35d4f9df47477d23529c1 (diff) | |
parent | 3fe07bcd800d6e5e4e4263ca2564d69095c157bf (diff) | |
download | linux-9836e93c0a7e031ac6a71c56171c229de1eea7cf.tar.xz |
Merge tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block
Pull io_uring NVMe command passthrough from Jens Axboe:
"On top of everything else, this adds support for passthrough for
io_uring.
The initial feature for this is NVMe passthrough support, which allows
non-filesystem based IO commands and admin commands.
To support this, io_uring grows support for SQE and CQE members that
are twice as big, allowing to pass in a full NVMe command without
having to copy data around. And to complete with more than just a
single 32-bit value as the output"
* tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block: (22 commits)
io_uring: cleanup handling of the two task_work lists
nvme: enable uring-passthrough for admin commands
nvme: helper for uring-passthrough checks
blk-mq: fix passthrough plugging
nvme: add vectored-io support for uring-cmd
nvme: wire-up uring-cmd support for io-passthru on char-device.
nvme: refactor nvme_submit_user_cmd()
block: wire-up support for passthrough plugging
fs,io_uring: add infrastructure for uring-cmd
io_uring: support CQE32 for nop operation
io_uring: enable CQE32
io_uring: support CQE32 in /proc info
io_uring: add tracing for additional CQE32 fields
io_uring: overflow processing for CQE32
io_uring: flush completions for CQE32
io_uring: modify io_get_cqe for CQE32
io_uring: add CQE32 completion processing
io_uring: add CQE32 setup processing
io_uring: change ring size calculation for CQE32
io_uring: store add. return values for CQE32
...
-rw-r--r-- | block/blk-mq.c | 109 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 278 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 1 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 5 | ||||
-rw-r--r-- | fs/io_uring.c | 444 | ||||
-rw-r--r-- | include/linux/fs.h | 2 | ||||
-rw-r--r-- | include/linux/io_uring.h | 33 | ||||
-rw-r--r-- | include/trace/events/io_uring.h | 18 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 24 | ||||
-rw-r--r-- | include/uapi/linux/nvme_ioctl.h | 28 |
11 files changed, 806 insertions, 138 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 84d749511f55..ed1869a305c4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1169,6 +1169,62 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) complete(waiting); } +/* + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + +static void __blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *done, bool use_plug) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io = done; + + blk_account_io_start(rq); + + if (use_plug && current->plug) { + blk_add_rq_to_plug(current->plug, rq); + return; + } + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} + + /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert @@ -1184,18 +1240,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) */ void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) { - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); + __blk_execute_rq_nowait(rq, at_head, done, true); - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -1233,8 +1279,13 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; + /* + * iopoll requires request to be submitted to driver, so can't + * use plug + */ rq->end_io_data = &wait; - blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); + __blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq, + !blk_rq_is_poll(rq)); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; @@ -2676,40 +2727,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -/* - * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple - * queues. This is important for md arrays to benefit from merging - * requests. - */ -static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) -{ - if (plug->multiple_queues) - return BLK_MAX_REQUEST_COUNT * 2; - return BLK_MAX_REQUEST_COUNT; -} - -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - struct request *last = rq_list_peek(&plug->mq_list); - - if (!plug->rq_count) { - trace_block_plug(rq->q); - } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || - (!blk_queue_nomerges(rq->q) && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(rq->q); - } - - if (!plug->multiple_queues && last && last->q != rq->q) - plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e1846d04817f..1a984045e49c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = { .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_dev_uring_cmd, }; static ssize_t nvme_sysfs_reset(struct device *dev, @@ -3699,6 +3700,7 @@ static const struct file_operations nvme_ns_chr_fops = { .release = nvme_ns_chr_release, .unlocked_ioctl = nvme_ns_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_chr_uring_cmd, }; static int nvme_add_ns_cdev(struct nvme_ns *ns) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 554566371ffa..096b1b47d750 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -5,6 +5,7 @@ */ #include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/nvme_ioctl.h> +#include <linux/io_uring.h> #include "nvme.h" /* @@ -53,10 +54,21 @@ out: return ERR_PTR(ret); } -static int nvme_submit_user_cmd(struct request_queue *q, +static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, + void *meta, unsigned len, int ret) +{ + if (!ret && req_op(req) == REQ_OP_DRV_IN && + copy_to_user(ubuf, meta, len)) + ret = -EFAULT; + kfree(meta); + return ret; +} + +static struct request *nvme_alloc_user_request(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout, bool vec) + u32 meta_seed, void **metap, unsigned timeout, bool vec, + unsigned int rq_flags, blk_mq_req_flags_t blk_flags) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -66,9 +78,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0); + req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); if (IS_ERR(req)) - return PTR_ERR(req); + return req; nvme_init_request(req, cmd); if (timeout) @@ -105,26 +117,50 @@ static int nvme_submit_user_cmd(struct request_queue *q, goto out_unmap; } req->cmd_flags |= REQ_INTEGRITY; + *metap = meta; } } + return req; + +out_unmap: + if (bio) + blk_rq_unmap_user(bio); +out: + blk_mq_free_request(req); + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout, bool vec) +{ + struct request *req; + void *meta = NULL; + struct bio *bio; + int ret; + + req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer, + meta_len, meta_seed, &meta, timeout, vec, 0, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + bio = req->bio; + ret = nvme_execute_passthru_rq(req); + if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: + if (meta) + ret = nvme_finish_user_metadata(req, meta_buffer, meta, + meta_len, ret); if (bio) blk_rq_unmap_user(bio); - out: blk_mq_free_request(req); return ret; } - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -296,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } +struct nvme_uring_data { + __u64 metadata; + __u64 addr; + __u32 data_len; + __u32 metadata_len; + __u32 timeout_ms; +}; + +/* + * This overlays struct io_uring_cmd pdu. + * Expect build errors if this grows larger than that. + */ +struct nvme_uring_cmd_pdu { + union { + struct bio *bio; + struct request *req; + }; + void *meta; /* kernel-resident buffer */ + void __user *meta_buffer; + u32 meta_len; +}; + +static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; +} + +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + struct request *req = pdu->req; + struct bio *bio = req->bio; + int status; + u64 result; + + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + status = -EINTR; + else + status = nvme_req(req)->status; + + result = le64_to_cpu(nvme_req(req)->result.u64); + + if (pdu->meta) + status = nvme_finish_user_metadata(req, pdu->meta_buffer, + pdu->meta, pdu->meta_len, status); + if (bio) + blk_rq_unmap_user(bio); + blk_mq_free_request(req); + + io_uring_cmd_done(ioucmd, status, result); +} + +static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + /* extract bio before reusing the same field for request */ + struct bio *bio = pdu->bio; + + pdu->req = req; + req->bio = bio; + /* this takes care of moving rest of completion-work to task context */ + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); +} + +static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) +{ + struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); + const struct nvme_uring_cmd *cmd = ioucmd->cmd; + struct request_queue *q = ns ? ns->queue : ctrl->admin_q; + struct nvme_uring_data d; + struct nvme_command c; + struct request *req; + unsigned int rq_flags = 0; + blk_mq_req_flags_t blk_flags = 0; + void *meta = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + c.common.opcode = READ_ONCE(cmd->opcode); + c.common.flags = READ_ONCE(cmd->flags); + if (c.common.flags) + return -EINVAL; + + c.common.command_id = 0; + c.common.nsid = cpu_to_le32(cmd->nsid); + if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) + return -EINVAL; + + c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); + c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); + c.common.metadata = 0; + c.common.dptr.prp1 = c.common.dptr.prp2 = 0; + c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); + c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); + c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); + c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); + c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); + c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + + d.metadata = READ_ONCE(cmd->metadata); + d.addr = READ_ONCE(cmd->addr); + d.data_len = READ_ONCE(cmd->data_len); + d.metadata_len = READ_ONCE(cmd->metadata_len); + d.timeout_ms = READ_ONCE(cmd->timeout_ms); + + if (issue_flags & IO_URING_F_NONBLOCK) { + rq_flags = REQ_NOWAIT; + blk_flags = BLK_MQ_REQ_NOWAIT; + } + + req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), + d.data_len, nvme_to_user_ptr(d.metadata), + d.metadata_len, 0, &meta, d.timeout_ms ? + msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags, + blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + req->end_io_data = ioucmd; + + /* to free bio on completion, as req->bio will be null at that time */ + pdu->bio = req->bio; + pdu->meta = meta; + pdu->meta_buffer = nvme_to_user_ptr(d.metadata); + pdu->meta_len = d.metadata_len; + + blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io); + return -EIOCBQUEUED; +} + static bool is_ctrl_ioctl(unsigned int cmd) { if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) @@ -387,6 +556,53 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return __nvme_ioctl(ns, cmd, (void __user *)arg); } +static int nvme_uring_cmd_checks(unsigned int issue_flags) +{ + /* IOPOLL not supported yet */ + if (issue_flags & IO_URING_F_IOPOLL) + return -EOPNOTSUPP; + + /* NVMe passthrough requires big SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + return 0; +} + +static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_IO: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_IO_VEC: + ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, + struct nvme_ns, cdev); + + return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); +} + #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, struct nvme_ns_head *head, int srcu_idx) @@ -453,8 +669,46 @@ out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; } + +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags) +{ + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EINVAL; + + if (ns) + ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct nvme_ctrl *ctrl = ioucmd->file->private_data; + int ret; + + ret = nvme_uring_cmd_checks(issue_flags); + if (ret) + return ret; + + switch (ioucmd->cmd_op) { + case NVME_URING_CMD_ADMIN: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); + break; + case NVME_URING_CMD_ADMIN_VEC: + ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d464fdf978fb..d3e2440d8abb 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .release = nvme_ns_head_chr_release, .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = nvme_ns_head_chr_uring_cmd, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2b53ca63335..26d35c557588 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -782,7 +782,12 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); +int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, + unsigned int issue_flags); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; diff --git a/fs/io_uring.c b/fs/io_uring.c index 1015dd49e7e5..9f1c682d7caf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -204,13 +204,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -222,8 +215,8 @@ struct io_mapped_ubuf { struct io_ring_ctx; struct io_overflow_cqe { - struct io_uring_cqe cqe; struct list_head list; + struct io_uring_cqe cqe; }; /* @@ -551,7 +544,7 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - struct io_wq_work_list prior_task_list; + struct io_wq_work_list prio_task_list; struct callback_head task_work; struct file **registered_rings; bool task_running; @@ -788,6 +781,12 @@ struct io_msg { u32 len; }; +struct io_nop { + struct file *file; + u64 extra1; + u64 extra2; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -992,6 +991,8 @@ struct io_kiocb { struct io_msg msg; struct io_xattr xattr; struct io_socket sock; + struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -1036,7 +1037,13 @@ struct io_kiocb { atomic_t poll_refs; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ @@ -1070,6 +1077,14 @@ struct io_cancel_data { int seq; }; +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -1311,6 +1326,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SOCKET] = { .audit_skip = 1, }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -1450,6 +1471,8 @@ const char *io_uring_get_opcode(u8 opcode) return "GETXATTR"; case IORING_OP_SOCKET: return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; case IORING_OP_LAST: return "INVALID"; } @@ -2119,8 +2142,12 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; unsigned int free, queued, len; + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; @@ -2132,15 +2159,26 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) ctx->cached_cq_tail++; ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; - return ctx->cqe_cached++; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; } static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + ctx->cached_cq_tail++; - return ctx->cqe_cached++; + ctx->cqe_cached++; + return cqe; } + return __io_get_cqe(ctx); } @@ -2212,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed, posted; + size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; + if (ctx->flags & IORING_SETUP_CQE32) + cqe_size <<= 1; + posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { @@ -2227,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (cqe) - memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + memcpy(cqe, &ocqe->cqe, cqe_size); else io_account_cq_overflow(ctx); @@ -2315,11 +2357,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) + s32 res, u32 cflags, u64 extra1, + u64 extra2) { struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* @@ -2339,6 +2387,10 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } @@ -2360,7 +2412,7 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, WRITE_ONCE(cqe->flags, cflags); return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags); + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); } static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, @@ -2369,7 +2421,7 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe; trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags); + req->cqe.res, req->cqe.flags, 0, 0); /* * If we can't get a cq entry, userspace overflowed the @@ -2382,35 +2434,91 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, return true; } return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags); + req->cqe.res, req->cqe.flags, 0, 0); +} + +static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + u64 extra1 = req->extra1; + u64 extra2 = req->extra2; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + cqe->big_cqe[0] = extra1; + cqe->big_cqe[1] = extra2; + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, extra1, extra2); } static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } +static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, + u64 extra1, u64 extra2) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return; + if (req->flags & REQ_F_CQE_SKIP) + return; + + trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, + extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->cqe.user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return; + } + + io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); +} + static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags); + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); return __io_fill_cqe(ctx, user_data, res, cflags); } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_put(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { + struct io_ring_ctx *ctx = req->ctx; + if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); @@ -2433,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } -static void io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(req, res, cflags); + __io_req_complete_put(req); +} + +static void __io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe32_req(req, res, cflags, extra1, extra2); + __io_req_complete_put(req); +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -2445,6 +2568,18 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, io_cqring_ev_posted(ctx); } +static void io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post32(req, res, cflags, extra1, extra2); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { @@ -2462,6 +2597,19 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } +static inline void __io_req_complete32(struct io_kiocb *req, + unsigned int issue_flags, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_complete_state(req, res, cflags); + req->extra1 = extra1; + req->extra2 = extra2; + } else { + io_req_complete_post32(req, res, cflags, extra1, extra2); + } +} + static inline void io_req_complete(struct io_kiocb *req, s32 res) { if (res < 0) @@ -2803,10 +2951,10 @@ static void tctx_task_work(struct callback_head *cb) struct io_wq_work_node *node1, *node2; spin_lock_irq(&tctx->task_lock); - node1 = tctx->prior_task_list.first; + node1 = tctx->prio_task_list.first; node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); @@ -2820,7 +2968,7 @@ static void tctx_task_work(struct callback_head *cb) cond_resched(); if (data_race(!tctx->task_list.first) && - data_race(!tctx->prior_task_list.first) && uring_locked) + data_race(!tctx->prio_task_list.first) && uring_locked) io_submit_flush_completions(ctx); } @@ -2831,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req, bool priority) +static void __io_req_task_work_add(struct io_kiocb *req, + struct io_uring_task *tctx, + struct io_wq_work_list *list) { - struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - struct io_uring_task *tctx = tsk->io_uring; struct io_wq_work_node *node; unsigned long flags; bool running; - WARN_ON_ONCE(!tctx); - io_drop_inflight_file(req); spin_lock_irqsave(&tctx->task_lock, flags); - if (priority) - wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); - else - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2861,12 +3004,12 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method))) + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); + node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2878,6 +3021,23 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) } } +static void io_req_task_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_task_prio_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + + if (req->ctx->flags & IORING_SETUP_SQPOLL) + __io_req_task_work_add(req, tctx, &tctx->prio_task_list); + else + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + static void io_req_tw_post(struct io_kiocb *req, bool *locked) { io_req_complete_post(req, req->cqe.res, req->cqe.flags); @@ -2888,7 +3048,7 @@ static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.res = res; req->cqe.flags = cflags; req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_cancel(struct io_kiocb *req, bool *locked) @@ -2912,19 +3072,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->cqe.res = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue_reissue(struct io_kiocb *req) { req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_queue_next(struct io_kiocb *req) @@ -2998,8 +3158,12 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req_filled(ctx, req); + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!(ctx->flags & IORING_SETUP_CQE32)) + __io_fill_cqe_req_filled(ctx, req); + else + __io_fill_cqe32_req_filled(ctx, req); + } } io_commit_cqring(ctx); @@ -3328,7 +3492,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; req->cqe.res = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); + io_req_task_prio_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -4462,10 +4626,6 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4575,10 +4735,6 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio)) - return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4857,6 +5013,96 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_prio_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + if (ret < 0) + req_set_fail(req); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4992,11 +5238,25 @@ done: return 0; } +static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If the ring is setup with CQE32, relay back addr/addr + */ + if (req->ctx->flags & IORING_SETUP_CQE32) { + req->nop.extra1 = READ_ONCE(sqe->addr); + req->nop.extra2 = READ_ONCE(sqe->addr2); + } + + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { + unsigned int cflags; void __user *buf; if (req->flags & REQ_F_BUFFER_SELECT) { @@ -5007,7 +5267,12 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return -ENOBUFS; } - __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (!(req->ctx->flags & IORING_SETUP_CQE32)) + __io_req_complete(req, issue_flags, 0, cflags); + else + __io_req_complete32(req, issue_flags, 0, cflags, + req->nop.extra1, req->nop.extra2); return 0; } @@ -6366,9 +6631,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_socket *sock = &req->sock; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index) + if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; sock->domain = READ_ONCE(sqe->fd); @@ -6750,7 +7013,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) req->io_task_work.func = io_apoll_task_func; trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static inline void io_poll_execute(struct io_kiocb *req, int res, @@ -7255,7 +7518,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) req->cqe.res = -ETIME; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -7751,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { switch (req->opcode) { case IORING_OP_NOP: - return 0; + return io_nop_prep(req, sqe); case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: @@ -7835,6 +8098,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_getxattr_prep(req, sqe); case IORING_OP_SOCKET: return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -7867,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -8161,6 +8428,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_SOCKET: ret = io_socket(req, issue_flags); break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -8371,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -8761,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * though the application is the one updating it. */ head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) + if (likely(head < ctx->sq_entries)) { + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; return &ctx->sq_sqes[head]; + } /* drop invalid entries */ ctx->cq_extra--; @@ -10080,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -10258,8 +10532,8 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp, get_order(size)); } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, - size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -10267,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -11833,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int cq_shift = 0; unsigned int sq_entries, cq_entries; bool has_lock; + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); unsigned int i; + if (is_cqe32) + cq_shift = 1; + /* * we may get imprecise sqe and cqe info if uring is actively running * since we get cached_sq_head and cached_cq_tail without uring_lock @@ -11869,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } } /* @@ -11976,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -11991,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; @@ -12235,7 +12528,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG)) + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -12924,6 +13218,8 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..87b5af1d9fbe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1953,6 +1953,7 @@ struct dir_context { #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; +struct io_uring_cmd; struct file_operations { struct module *owner; @@ -1995,6 +1996,7 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); } __randomize_layout; struct inode_operations { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 24651c229ed2..4a2f6cc5a492 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,7 +5,32 @@ #include <linux/sched.h> #include <linux/xarray.h> +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = 4, + IO_URING_F_CQE32 = 8, + IO_URING_F_IOPOLL = 16, +}; + +struct io_uring_cmd { + struct file *file; + const void *cmd; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + u32 cmd_op; + u32 pad; + u8 pdu[32]; /* available inline for free use */ +}; + #if defined(CONFIG_IO_URING) +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 610e493234db..66fcc5a1a5b1 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -321,13 +321,16 @@ TRACE_EVENT(io_uring_fail_link, * @user_data: user data associated with the request * @res: result of the request * @cflags: completion flags + * @extra1: extra 64-bit data for CQE32 + * @extra2: extra 64-bit data for CQE32 * */ TRACE_EVENT(io_uring_complete, - TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags), + TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags, + u64 extra1, u64 extra2), - TP_ARGS(ctx, req, user_data, res, cflags), + TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2), TP_STRUCT__entry ( __field( void *, ctx ) @@ -335,6 +338,8 @@ TRACE_EVENT(io_uring_complete, __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) + __field( u64, extra1 ) + __field( u64, extra2 ) ), TP_fast_assign( @@ -343,12 +348,17 @@ TRACE_EVENT(io_uring_complete, __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; + __entry->extra1 = extra1; + __entry->extra2 = extra2; ), - TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x", + TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " + "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, - __entry->res, __entry->cflags) + __entry->res, __entry->cflags, + (unsigned long long) __entry->extra1, + (unsigned long long) __entry->extra2) ); /** diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cc9544629eee..53e7dae92e42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -22,6 +22,7 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; + __u32 cmd_op; }; union { __u64 addr; /* pointer to buffer or iovecs */ @@ -61,8 +62,17 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 addr3; - __u64 __pad2[1]; + union { + struct { + __u64 addr3; + __u64 __pad2[1]; + }; + /* + * If the ring is initialized with IORING_SETUP_SQE128, then + * this field is used for 80 bytes of arbitrary command data + */ + __u8 cmd[0]; + }; }; /* @@ -128,6 +138,9 @@ enum { */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) +#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ +#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -175,6 +188,7 @@ enum io_uring_op { IORING_OP_FGETXATTR, IORING_OP_GETXATTR, IORING_OP_SOCKET, + IORING_OP_URING_CMD, /* this goes last, obviously */ IORING_OP_LAST, @@ -251,6 +265,12 @@ struct io_uring_cqe { __u64 user_data; /* sqe->data submission passed back */ __s32 res; /* result code for this event */ __u32 flags; + + /* + * If the ring is initialized with IORING_SETUP_CQE32, then this field + * contains 16-bytes of padding, doubling the size of the CQE. + */ + __u64 big_cqe[]; }; /* diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index b2e43185e3b5..2f76cba67166 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 { __u64 result; }; +/* same as struct nvme_passthru_cmd64, minus the 8b result field */ +struct nvme_uring_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 rsvd2; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -83,4 +105,10 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) +/* io_uring async commands: */ +#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) +#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) +#define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) +#define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) + #endif /* _UAPI_LINUX_NVME_IOCTL_H */ |