From d663976dad68de9b2e3df59cc31f0a24ee4c4511 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:12 +0000 Subject: io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL It'll be annoying and take enough of boilerplate code to implement new zcrx features as separate io_uring register opcode. Introduce IORING_REGISTER_ZCRX_CTRL that will multiplex such calls to zcrx. Note, there are no real users of the opcode in this patch. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e96080db3e4d..0e1d353fab1d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -697,6 +697,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ + IORING_REGISTER_ZCRX_CTRL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +enum zcrx_ctrl_op { + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[8]; +}; + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 475eb39b00478b1898bc9080344dcd8e86c53c7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:13 +0000 Subject: io_uring/zcrx: add sync refill queue flushing Add an zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the kernel to flush / consume entries from the refill queue. Just as with the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address cases where the refill queue becomes full, and the user can't return buffers and needs to stash them. It's still a slow path, and the user should size refill queue appropriately, but it should be helpful for handling temporary traffic spikes and other unpredictable conditions. The interface is simpler comparing to ZCRX_REFILL as it doesn't need temporary refill entry arrays and gives natural batching, whereas ZCRX_REFILL requires even more user logic to be somewhat efficient. Also, add a structure for the operation. It's not currently used but can serve for future improvements like limiting the number of buffers to process, etc. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 +++++- io_uring/zcrx.c | 74 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0e1d353fab1d..db47fced2cc6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1082,13 +1082,21 @@ struct io_uring_zcrx_ifq_reg { }; enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + __ZCRX_CTRL_LAST, }; +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[8]; + __u64 __resv[2]; + + struct zcrx_ctrl_flush_rq zc_flush; }; #ifdef __cplusplus diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0b5f4320c7a9..08c103af69bc 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -941,6 +941,71 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, + struct io_zcrx_ifq *zcrx) +{ + unsigned int mask = zcrx->rq_entries - 1; + unsigned int i; + + guard(spinlock_bh)(&zcrx->rq_lock); + + nr = min(nr, io_zcrx_rqring_entries(zcrx)); + for (i = 0; i < nr; i++) { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); + struct net_iov *niov; + + if (!io_parse_rqe(rqe, zcrx, &niov)) + break; + netmem_array[i] = net_iov_to_netmem(niov); + } + + smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); + return i; +} + +#define ZCRX_FLUSH_BATCH 32 + +static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) { + netmem_ref netmem = netmems[i]; + struct net_iov *niov = netmem_to_net_iov(netmem); + + if (!io_zcrx_put_niov_uref(niov)) + continue; + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, + struct zcrx_ctrl *ctrl) +{ + struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; + netmem_ref netmems[ZCRX_FLUSH_BATCH]; + unsigned total = 0; + unsigned nr; + + if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) + return -EINVAL; + + do { + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); + + zcrx_return_buffers(netmems, nr); + total += nr; + + if (fatal_signal_pending(current)) + break; + cond_resched(); + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); + + return 0; +} + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct zcrx_ctrl ctrl; @@ -956,10 +1021,13 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); if (!zcrx) return -ENXIO; - if (ctrl.op >= __ZCRX_CTRL_LAST) - return -EOPNOTSUPP; - return -EINVAL; + switch (ctrl.op) { + case ZCRX_CTRL_FLUSH_RQ: + return zcrx_flush_rq(ctx, zcrx, &ctrl); + } + + return -EOPNOTSUPP; } static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, -- cgit v1.2.3 From d7af80b213e5675664b14f12240cb282e81773d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:16 +0000 Subject: io_uring/zcrx: export zcrx via a file Add an option to wrap a zcrx instance into a file and expose it to the user space. Currently, users can't do anything meaningful with the file, but it'll be used in a next patch to import it into another io_uring instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the IORING_REGISTER_ZCRX_CTRL registration opcode. Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 11 ++++++- io_uring/zcrx.c | 68 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index db47fced2cc6..4bedc0310a55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1083,6 +1083,7 @@ struct io_uring_zcrx_ifq_reg { enum zcrx_ctrl_op { ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, __ZCRX_CTRL_LAST, }; @@ -1091,12 +1092,20 @@ struct zcrx_ctrl_flush_rq { __u64 __resv[6]; }; +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ __u64 __resv[2]; - struct zcrx_ctrl_flush_rq zc_flush; + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; }; #ifdef __cplusplus diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e60c5c00a611..815992aff246 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) } } +static void zcrx_unregister(struct io_zcrx_ifq *ifq) +{ + if (refcount_dec_and_test(&ifq->user_refs)) { + io_close_queue(ifq); + io_zcrx_scrub(ifq); + } + io_put_zcrx_ifq(ifq); +} + struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, unsigned int id) { @@ -596,6 +606,55 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, return ifq ? &ifq->region : NULL; } +static int zcrx_box_release(struct inode *inode, struct file *file) +{ + struct io_zcrx_ifq *ifq = file->private_data; + + if (WARN_ON_ONCE(!ifq)) + return -EFAULT; + zcrx_unregister(ifq); + return 0; +} + +static const struct file_operations zcrx_box_fops = { + .owner = THIS_MODULE, + .release = zcrx_box_release, +}; + +static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, + struct zcrx_ctrl *ctrl, void __user *arg) +{ + struct zcrx_ctrl_export *ce = &ctrl->zc_export; + struct file *file; + int fd = -1; + + if (!mem_is_zero(ce, sizeof(*ce))) + return -EINVAL; + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + ce->zcrx_fd = fd; + if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { + put_unused_fd(fd); + return -EFAULT; + } + + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, + ifq, O_CLOEXEC, NULL); + if (IS_ERR(file)) { + put_unused_fd(fd); + zcrx_unregister(ifq); + return PTR_ERR(file); + } + + fd_install(fd, file); + return 0; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -742,12 +801,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) } if (!ifq) break; - - if (refcount_dec_and_test(&ifq->user_refs)) { - io_close_queue(ifq); - io_zcrx_scrub(ifq); - } - io_put_zcrx_ifq(ifq); + zcrx_unregister(ifq); } xa_destroy(&ctx->zcrx_ctxs); @@ -1028,6 +1082,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) switch (ctrl.op) { case ZCRX_CTRL_FLUSH_RQ: return zcrx_flush_rq(ctx, zcrx, &ctrl); + case ZCRX_CTRL_EXPORT: + return zcrx_export(ctx, zcrx, &ctrl, arg); } return -EOPNOTSUPP; -- cgit v1.2.3 From 00d91481279fb2df8c46d19090578afd523ca630 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 13 Nov 2025 10:46:18 +0000 Subject: io_uring/zcrx: share an ifq between rings Add a way to share an ifq from a src ring that is real (i.e. bound to a HW RX queue) with other rings. This is done by passing a new flag IORING_ZCRX_IFQ_REG_IMPORT in the registration struct io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq. Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 +++ io_uring/zcrx.c | 63 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4bedc0310a55..deb772222b6d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg { __u64 __resv2[2]; }; +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index da7e556c349e..b99cf2c6670a 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -660,6 +660,63 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, return 0; } +static int import_zcrx(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg, + struct io_uring_zcrx_ifq_reg *reg) +{ + struct io_zcrx_ifq *ifq; + struct file *file; + int fd, ret; + u32 id; + + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) + return -EINVAL; + + fd = reg->if_idx; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + + file = fd_file(f); + if (file->f_op != &zcrx_box_fops || !file->private_data) + return -EBADF; + + ifq = file->private_data; + refcount_inc(&ifq->refs); + refcount_inc(&ifq->user_refs); + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto err; + } + + reg->zcrx_id = id; + io_fill_zcrx_offsets(®->offsets); + if (copy_to_user(arg, reg, sizeof(*reg))) { + ret = -EFAULT; + goto err_xa_erase; + } + + scoped_guard(mutex, &ctx->mmap_lock) { + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err_xa_erase; + } + + return 0; +err_xa_erase: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +err: + zcrx_unregister(ifq); + return ret; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -685,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EINVAL; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) - return -EFAULT; if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.__resv2 || reg.zcrx_id) return -EINVAL; + if (reg.flags & ZCRX_REG_IMPORT) + return import_zcrx(ctx, arg, ®); + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) return -EINVAL; if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { -- cgit v1.2.3