From bd32923e5f02fa7b04d487ec265dc8080d27a257 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:02 +0100 Subject: io_uring: don't store bgid in req->buf_index Pass buffer group id into the rest of helpers via struct buf_sel_arg and remove all reassignments of req->buf_index back to bgid. Now, it only stores buffer indexes, and the group is provided by callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ea9fa08113ecb4d9224b943e7806e80a324bdf9.1743437358.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/0c01d76ff12986c2f48614db8610caff8f78c869.1743500909.git.asml.silence@gmail.com/ [axboe: fold in patch from second link] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b44d201520d8..3b467879bca8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -653,8 +653,7 @@ struct io_kiocb { u8 iopoll_completed; /* * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. + * For the latter, it points to the selected buffer ID. */ u16 buf_index; -- cgit v1.2.3 From 53db8a71ecb42c2ec5e9c6925269a750255f9af5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Apr 2025 14:50:59 -0600 Subject: io_uring: add support for IORING_OP_PIPE This works just like pipe2(2), except it also supports fixed file descriptors. Used in a similar fashion as for other fd instantiating opcodes (like accept, socket, open, etc), where sqe->file_slot is set appropriately if two direct descriptors are desired rather than a set of normal file descriptors. sqe->addr must be set to a pointer to an array of 2 integers, which is where the fixed/normal file descriptors are copied to. sqe->pipe_flags contains flags, same as what is allowed for pipe2(2). Future expansion of per-op private flags can go in sqe->ioprio, like we do for other opcodes that take both a "syscall" flag set and an io_uring opcode specific flag set. 
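Not part of the patch, but to make the new uapi concrete: a minimal userspace sketch of issuing IORING_OP_PIPE by filling the SQE fields described above by hand. It assumes liburing only for ring setup and submission, and headers that already carry IORING_OP_PIPE and the new pipe_flags union member; note that the prep code below reads the direct-descriptor slot from sqe->file_index, which is left at 0 here so two normal file descriptors are returned.

#include <fcntl.h>
#include <string.h>
#include <liburing.h>

static int ring_pipe(struct io_uring *ring, int fds[2])
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;
	sqe->addr = (unsigned long) fds;	/* kernel copies the two fds here */
	sqe->pipe_flags = O_CLOEXEC;		/* same flags as pipe2(2) */
	/* sqe->file_index stays 0: normal fds, not direct descriptors */

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

On success the two descriptors are read from fds[] after the completion, exactly as with pipe2(2).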
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 + io_uring/opdef.c | 7 +++ io_uring/openclose.c | 133 ++++++++++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 3 + 4 files changed, 145 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8f1fc12bac46..130f3bc71a69 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -73,6 +73,7 @@ struct io_uring_sqe { __u32 futex_flags; __u32 install_fd_flags; __u32 nop_flags; + __u32 pipe_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -283,6 +284,7 @@ enum io_uring_op { IORING_OP_EPOLL_WAIT, IORING_OP_READV_FIXED, IORING_OP_WRITEV_FIXED, + IORING_OP_PIPE, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438b..db36433c2294 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_prep_writev_fixed, .issue = io_write, }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..4dd461163457 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -302,3 +304,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. 
+ */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_OK; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..4ca2a9935abc 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From 632b3186726984319e2337987de86a442407f30e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:19 +0100 Subject: io_uring/zcrx: move zcrx region to struct io_zcrx_ifq Refill queue region is a part of zcrx and should stay in struct io_zcrx_ifq. We can't have multiple queues without it, so move it there. As a result there is no context global zcrx region anymore, and the region is looked up together with its ifq. To protect a concurrent mmap from seeing an inconsistent region we were protecting changes to ->zcrx_region with mmap_lock, but now it protects the publishing of the ifq. 
Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24f1a728fc03d0166f16d099575457e10d9d90f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/zcrx.c | 20 ++++++++++++-------- io_uring/zcrx.h | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3b467879bca8..06d722289fc5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -448,8 +448,6 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; - /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ - struct io_mapped_region zcrx_region; }; /* diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b7e65ae413b6..033284b695c7 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -167,12 +167,11 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, - IORING_MAP_OFF_ZCRX_REGION); + ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; @@ -180,7 +179,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + if (WARN_ON_ONCE(ifq->ctx->ifq)) + return; + + io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -343,9 +345,9 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->mmap_lock); - if (id != 0) + if (id != 0 || !ctx->ifq) return NULL; - return &ctx->zcrx_region; + return &ctx->ifq->region; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -433,7 +435,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - ctx->ifq = ifq; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = ifq; return 0; err: io_zcrx_ifq_free(ifq); @@ -449,7 +452,8 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) if (!ifq) return; - ctx->ifq = NULL; + scoped_guard(mutex, &ctx->mmap_lock) + ctx->ifq = NULL; io_zcrx_ifq_free(ifq); } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 43134e5c9213..e3c7c4e647f1 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -39,6 +39,7 @@ struct io_zcrx_ifq { netdevice_tracker netdev_tracker; spinlock_t lock; struct mutex dma_lock; + struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) -- cgit v1.2.3 From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:20 +0100 Subject: io_uring/zcrx: add support for multiple ifqs Allow the user to register multiple ifqs / zcrx contexts. With that we can use multiple interfaces / interface queues in a single io_uring instance. 
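Not from the patch: a sketch of the consumer side under the new scheme. Each registration now hands back an id in io_uring_zcrx_ifq_reg::zcrx_id (set in the diff below), and a recv-zc SQE selects its queue through sqe->zcrx_ifq_idx instead of always using index 0. The multishot flag and the zeroed remaining fields follow the pre-existing recv-zc conventions and are assumptions here, not something this patch changes.

#include <string.h>
#include <linux/io_uring.h>

/* zcrx_id is what the kernel returned in reg.zcrx_id at registration time */
static void prep_recv_zc(struct io_uring_sqe *sqe, int sockfd, __u32 zcrx_id)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV_ZC;
	sqe->fd = sockfd;
	sqe->ioprio = IORING_RECV_MULTISHOT;	/* recv-zc runs multishot */
	sqe->zcrx_ifq_idx = zcrx_id;		/* picks one of the registered ifqs */
}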
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com [axboe: fold in fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++- io_uring/io_uring.c | 3 +- io_uring/net.c | 5 ++- io_uring/zcrx.c | 73 +++++++++++++++++++++++++++++------------- 4 files changed, 56 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 06d722289fc5..7e23e993280e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -394,7 +392,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75c022526548..0dc6c2f1295e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -2889,7 +2890,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/net.c b/io_uring/net.c index 782f8e76c5c7..b3a643675ce8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1189,11 +1189,10 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 033284b695c7..22f420d6fbb9 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -156,8 +156,10 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,7 +169,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; @@ -179,9 +184,6 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - if (WARN_ON_ONCE(ifq->ctx->ifq)) - return; - io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; @@ -343,11 +345,11 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) struct io_mapped_region *io_zcrx_get_region(struct 
io_ring_ctx *ctx, unsigned int id) { + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + lockdep_assert_held(&ctx->mmap_lock); - if (id != 0 || !ctx->ifq) - return NULL; - return &ctx->ifq->region; + return ifq ? &ifq->region : NULL; } int io_register_zcrx_ifq(struct io_ring_ctx *ctx, @@ -359,6 +361,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. @@ -371,8 +374,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -396,7 +397,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!ifq) return -ENOMEM; - ret = io_allocate_rbuf_ring(ifq, ®, &rd); + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); if (ret) goto err; @@ -428,6 +436,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -435,26 +451,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; + unsigned long id; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } - scoped_guard(mutex, &ctx->mmap_lock) - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -511,12 +535,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) -- cgit v1.2.3 From a5c98e9424573649e59988199a3356a79c9e1fd9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 May 2025 13:17:18 +0100 Subject: io_uring/zcrx: dmabuf backed zerocopy receive Add support for dmabuf backed zcrx areas. To use it, the user should pass IORING_ZCRX_AREA_DMABUF in the struct io_uring_zcrx_area_reg flags field and pass a dmabuf fd in the dmabuf_fd field. 
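Not part of the patch: a sketch of how the new area flavour is described from userspace, assuming a dmabuf fd handed out by some exporting driver and assuming the area is then wired up through the registration struct's area_ptr the same way a normal umem area is. Per the import code below, addr is an offset into the dmabuf and, like len, must be page aligned.

#include <string.h>
#include <linux/io_uring.h>

static void fill_dmabuf_area(struct io_uring_zcrx_area_reg *area,
			     int dmabuf_fd, __u64 len)
{
	memset(area, 0, sizeof(*area));
	area->addr = 0;				/* offset into the dmabuf */
	area->len = len;			/* page aligned */
	area->flags = IORING_ZCRX_AREA_DMABUF;
	area->dmabuf_fd = dmabuf_fd;
}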
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20bb1890e60a82ec945ab36370d1fd54be414ab6.1746097431.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/6e37db97303212bbd8955f9501cf99b579f8aece.1746547722.git.asml.silence@gmail.com [axboe: fold in fixup] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +- io_uring/zcrx.c | 163 +++++++++++++++++++++++++++++++++++++----- io_uring/zcrx.h | 7 ++ 3 files changed, 159 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 130f3bc71a69..5ce096090b0c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -990,12 +990,16 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + struct io_uring_zcrx_area_reg { __u64 addr; __u64 len; __u64 rq_area_token; __u32 flags; - __u32 __resv1; + __u32 dmabuf_fd; __u64 __resv2[2]; }; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 34b09beba992..9a568d049204 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -47,30 +47,118 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) return area->mem.pages[net_iov_idx(niov)]; } -static void io_release_area_mem(struct io_zcrx_mem *mem) +static void io_release_dmabuf(struct io_zcrx_mem *mem) { - if (mem->pages) { - unpin_user_pages(mem->pages, mem->nr_folios); - kvfree(mem->pages); + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size < off + len) + return -EINVAL; + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; } -static int io_import_area(struct io_zcrx_ifq *ifq, +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned long off = area->mem.dmabuf_offset; + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < 
area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return 0; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return niov_idx; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_mem *mem, struct io_uring_zcrx_area_reg *area_reg) { struct page **pages; int nr_pages; - int ret; - ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); - if (ret) - return ret; + if (area_reg->dmabuf_fd) + return -EINVAL; if (!area_reg->addr) return -EFAULT; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) - return -EINVAL; - pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, &nr_pages); if (IS_ERR(pages)) @@ -82,6 +170,35 @@ static int io_import_area(struct io_zcrx_ifq *ifq, return 0; } +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) { @@ -101,7 +218,10 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, { int i; - io_zcrx_unmap_umem(ifq, area, nr_mapped); + if (area->mem.is_dmabuf) + io_release_dmabuf(&area->mem); + else + io_zcrx_unmap_umem(ifq, area, nr_mapped); for (i = 0; i < area->nia.num_niovs; i++) net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); @@ -145,7 +265,11 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) if (area->is_mapped) return 0; - nr = io_zcrx_map_area_umem(ifq, area); + if (area->mem.is_dmabuf) + nr = io_zcrx_map_area_dmabuf(ifq, area); + else + nr = io_zcrx_map_area_umem(ifq, area); + if (nr != area->nia.num_niovs) { __io_zcrx_unmap_area(ifq, area, nr); return -EINVAL; @@ -251,6 +375,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kfree(area); } +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) @@ -259,9 +385,11 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, unsigned nr_iovs; int i, ret; - if (area_reg->flags || area_reg->rq_area_token) + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) + return -EINVAL; + if (area_reg->rq_area_token) return -EINVAL; - if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + if (area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; ret = -ENOMEM; @@ -819,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, size_t copied = 0; int ret = 0; + if (area->mem.is_dmabuf) + return -EFAULT; + while (len) { size_t copy_size = min_t(size_t, PAGE_SIZE, len); const int dst_off = 0; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 9c22807af807..2f5e26389f22 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,15 +3,22 @@ #define IOU_ZC_RX_H #include +#include #include #include #include struct io_zcrx_mem { unsigned 
long size; + bool is_dmabuf; struct page **pages; unsigned long nr_folios; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; }; struct io_zcrx_area { -- cgit v1.2.3 From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:53 +0100 Subject: io_uring: count allocated requests Keep track of the number of requests a ring currently has allocated (and not freed); it'll be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7e23e993280e..73b289b48280 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -435,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6efecb46c828..714b66ab34b0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -957,6 +957,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; @@ -2694,8 +2696,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); + } mutex_unlock(&ctx->uring_lock); } @@ -2732,6 +2736,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); -- cgit v1.2.3 From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:54 +0100 Subject: io_uring: drain based on allocated reqs Don't rely on CQ sequence numbers for draining, as it has become messy and needs cq_extra adjustments. Instead, base it on the number of allocated requests and only allow flushing when all requests are in the drain list. As a result, cq_extra is gone, there is no overhead for its accounting in aux cqe posting, there is less bloat since the accounting was inlined before, and it's in general simpler than trying to track where we should bump it and where it should be put back, like in cases of overflow. Also, it'll likely help with cleaning and unifying some of the CQ posting helpers. 
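Not from the patch: the userspace-visible behaviour this accounting implements, for reference. A drain-flagged request is parked on the defer list and, with this change, the list is flushed once every currently allocated request has joined it (nr_req_allocated == nr_drained) rather than when a CQ sequence check passes. A minimal liburing sketch of such a barrier request:

#include <liburing.h>

static void queue_drain_fsync(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return;
	io_uring_prep_fsync(sqe, fd, 0);
	/* runs only after everything submitted before it has completed */
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
}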
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com [axboe: fold in fix from link2] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 79 +++++++++++++++++------------------------- io_uring/io_uring.h | 3 +- 3 files changed, 34 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73b289b48280..00dbd7cd0e7d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -341,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -417,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 714b66ab34b0..9a9b8d35349b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -129,7 +129,6 @@ struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) +static unsigned io_linked_nr(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *tmp; + unsigned nr = 0; - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; + io_for_each_link(tmp, req) + nr++; + return nr; } -static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { bool drain_seen = false, first = true; + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); drain_seen |= de->req->flags & REQ_F_IO_DRAIN; - if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) - break; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); first = false; } } -static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) -{ - guard(spinlock)(&ctx->completion_lock); - __io_queue_deferred(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, true); } @@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. 
*/ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { @@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req) io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = io_get_sequence(req); - scoped_guard(spinlock, &ctx->completion_lock) { - list_add_tail(&de->list, &ctx->defer_list); - __io_queue_deferred(ctx); - if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; - } + ctx->nr_drained += io_linked_nr(req); + list_add_tail(&de->list, &ctx->defer_list); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); @@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); } - mutex_unlock(&ctx->uring_lock); +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); 
io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821..81f22196a57d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, cqe_ret); } @@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -- cgit v1.2.3 From c80bdb1c55719cd6308d648a7920272a3be09e34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:16:44 -0600 Subject: io_uring: pass in struct io_big_cqe to io_alloc_ocqe() Rather than pass extra1/extra2 separately, just pass in the (now) named io_big_cqe struct instead. The callers that don't use/support CQE32 will now just pass a single NULL, rather than two separate mystery zero values. Move the clearing of the big_cqe elements into io_alloc_ocqe() as well, so it can get moved out of the generic code. 
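Not part of the patch, just context for what extra1/extra2 are: on an IORING_SETUP_CQE32 ring each CQE is 32 bytes and the two extra u64s land in the big_cqe[] flexible array of struct io_uring_cqe, which is exactly where io_alloc_ocqe() copies them on the overflow path below.

#include <linux/io_uring.h>

static void handle_cqe32(const struct io_uring_cqe *cqe)
{
	__u64 extra1 = cqe->big_cqe[0];		/* req->big_cqe.extra1 */
	__u64 extra2 = cqe->big_cqe[1];		/* req->big_cqe.extra2 */

	/* command-specific payload, e.g. from passthrough commands */
	(void)extra1;
	(void)extra2;
}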
Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00dbd7cd0e7d..2922635986f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -710,7 +710,7 @@ struct io_kiocb { const struct cred *creds; struct io_wq_work work; - struct { + struct io_big_cqe { u64 extra1; u64 extra2; } big_cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 02d597716467..4081ffd890af 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx, } static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, - struct io_cqe *cqe, u64 extra1, - u64 extra2, gfp_t gfp) + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -734,17 +734,19 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT); + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); if (ocqe) { ocqe->cqe.user_data = cqe->user_data; ocqe->cqe.res = cqe->res; ocqe->cqe.flags = cqe->flags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; } } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; return ocqe; } @@ -821,7 +823,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC); + ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC); filled = io_cqring_add_overflow(ctx, ocqe); } io_cq_unlock_post(ctx); @@ -841,7 +843,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) struct io_overflow_cqe *ocqe; struct io_cqe cqe = io_init_cqe(user_data, res, cflags); - ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL); + ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_KERNEL); spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); spin_unlock(&ctx->completion_lock); @@ -1451,8 +1453,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC; struct io_overflow_cqe *ocqe; - ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1, - req->big_cqe.extra2, gfp); + ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp); if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_add_overflow(ctx, ocqe); @@ -1460,8 +1461,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } else { io_cqring_add_overflow(ctx, ocqe); } - - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } } __io_cq_unlock_post(ctx); -- cgit v1.2.3 From 28be240c763a44932bfe573f09e145d182e52609 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 22 May 2025 09:04:50 -0600 Subject: trace/io_uring: fix io_uring_local_work_run ctx documentation The comment for the tracepoint io_uring_local_work_run refers to a field "tctx" and a type "io_uring_ctx", neither of which exist. 
"tctx" looks to mean "ctx" and "io_uring_ctx" should be "io_ring_ctx". Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250522150451.2385652-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index fb81c533b310..178ab6f611be 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write, /* * io_uring_local_work_run - ran ring local task work * - * @tctx: pointer to a io_uring_ctx + * @ctx: pointer to an io_ring_ctx * @count: how many functions it ran * @loops: how many loops it ran * -- cgit v1.2.3