path: root/io_uring/register.c
author     Jens Axboe <axboe@kernel.dk>  2024-10-22 22:47:00 +0300
committer  Jens Axboe <axboe@kernel.dk>  2024-10-29 22:43:28 +0300
commit     aa00f67adc2c0d6439f81b5a81ff181377c47a7e (patch)
tree       816f24bc161dcf481d895ef1a45dbcd495209802 /io_uring/register.c
parent     371b47da25e1f7a1a6323f84c776bd9fa079a490 (diff)
download   linux-aa00f67adc2c0d6439f81b5a81ff181377c47a7e.tar.xz
io_uring: add support for fixed wait regions
Generally applications have one or a few ways of waiting, yet they pass in a
struct io_uring_getevents_arg every time. This needs to get copied and, in
turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to register
a fixed set of wait regions that can simply be indexed when asking the
kernel to wait on events.

At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:

        struct io_uring_reg_wait *reg;

        reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

        /* set timeout and mark as set, sigmask/sigmask_sz as needed */
        reg->ts.tv_sec = 0;
        reg->ts.tv_nsec = 100000;
        reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The above
initializes index 0, but 63 other regions can be initialized, if needed.

Now, instead of doing:

        struct __kernel_timespec timeout = { .tv_nsec = 100000, };

        io_uring_submit_and_wait_timeout(ring, &cqe, nr, &timeout, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it can
simply reference the above region at offset 0 and do:

        io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy both
struct io_uring_getevents_arg (24b) and struct __kernel_timespec (16b) for
each invocation. Struct io_uring_reg_wait looks as follows:

        struct io_uring_reg_wait {
                struct __kernel_timespec        ts;
                __u32                           min_wait_usec;
                __u32                           flags;
                __u64                           sigmask;
                __u32                           sigmask_sz;
                __u32                           pad[3];
                __u64                           pad2[2];
        };

embedding the timeout itself in the region, rather than passing it as a
pointer as well. Note that the signal mask is still passed as a pointer,
both for compatibility reasons, but also because there doesn't seem to be a
lot of high frequency wait scenarios that involve setting and resetting the
signal mask for each wait.

The application is free to modify any region before a wait call, or it can
keep multiple regions with different settings to avoid needing to modify the
same one for each wait. Up to a page size of regions is mapped by default,
allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The registered region must be aligned to
the size of struct io_uring_reg_wait. It's valid to register fewer than 64
entries.

In network performance testing with zero-copy, this reduced the time spent
waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4% to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered, they
are persistent until the ring is torn down. The regions support minimum wait
timeout as well as the regular waits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
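[Editor's note] For reference (not part of this patch), below is a minimal sketch of the raw
registration path that io_register_cqwait_reg() in the diff services, for applications not
using the liburing io_uring_setup_reg_wait() helper shown above. The io_uring_cqwait_reg_arg
field names are taken from the kernel code in this patch and the opcode from the new
__io_uring_register() case; the setup_wait_regions() helper itself is hypothetical, it assumes
uapi headers from a kernel carrying this series, and error handling is trimmed:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical sketch: map one page of wait regions, register it with
 * IORING_REGISTER_CQWAIT_REG, and set up index 0 with a 100usec timeout
 * as in the example above. */
static struct io_uring_reg_wait *setup_wait_regions(int ring_fd,
                                                    unsigned nr_entries)
{
        size_t page_size = sysconf(_SC_PAGESIZE);
        struct io_uring_cqwait_reg_arg arg;
        struct io_uring_reg_wait *reg;

        /* page-aligned, page-sized buffer: offset 0 trivially satisfies
         * the "offset + len within one page" and alignment checks */
        reg = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (reg == MAP_FAILED)
                return NULL;

        memset(&arg, 0, sizeof(arg));           /* flags must be 0 */
        arg.struct_size = sizeof(*reg);         /* must match the kernel's idea of the size */
        arg.nr_entries = nr_entries;            /* 1 <= n, struct_size * n <= PAGE_SIZE */
        arg.user_addr = (unsigned long) reg;

        /* nr_args must be 1 for IORING_REGISTER_CQWAIT_REG */
        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_CQWAIT_REG, &arg, 1) < 0) {
                munmap(reg, page_size);
                return NULL;
        }

        /* region 0: 100usec timeout */
        reg->ts.tv_sec = 0;
        reg->ts.tv_nsec = 100000;
        reg->flags = IORING_REG_WAIT_TS;
        return reg;
}

Once registered, the regions stay pinned and mapped until the ring is torn down, so this is a
setup-time cost paid once per ring.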
Diffstat (limited to 'io_uring/register.c')
-rw-r--r--  io_uring/register.c  |  82
1 file changed, 82 insertions, 0 deletions
diff --git a/io_uring/register.c b/io_uring/register.c
index fc6c94d694b2..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,82 @@ out:
        return ret;
}
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+        unsigned short npages = 1;
+
+        if (!ctx->cq_wait_page)
+                return;
+
+        io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+        ctx->cq_wait_arg = NULL;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+        struct io_uring_cqwait_reg_arg arg;
+        struct io_uring_reg_wait *reg;
+        struct page **pages;
+        unsigned long len;
+        int nr_pages, poff;
+        int ret;
+
+        if (ctx->cq_wait_page || ctx->cq_wait_arg)
+                return -EBUSY;
+        if (copy_from_user(&arg, uarg, sizeof(arg)))
+                return -EFAULT;
+        if (!arg.nr_entries || arg.flags)
+                return -EINVAL;
+        if (arg.struct_size != sizeof(*reg))
+                return -EINVAL;
+        if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+                return -EOVERFLOW;
+        if (len > PAGE_SIZE)
+                return -EINVAL;
+        /* offset + len must fit within a page, and must be reg_wait aligned */
+        poff = arg.user_addr & ~PAGE_MASK;
+        if (len + poff > PAGE_SIZE)
+                return -EINVAL;
+        if (poff % arg.struct_size)
+                return -EINVAL;
+
+        pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+        if (IS_ERR(pages))
+                return PTR_ERR(pages);
+        ret = -EINVAL;
+        if (nr_pages != 1)
+                goto out_free;
+        if (ctx->user) {
+                ret = __io_account_mem(ctx->user, 1);
+                if (ret)
+                        goto out_free;
+        }
+
+        reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+        if (reg) {
+                ctx->cq_wait_index = arg.nr_entries - 1;
+                WRITE_ONCE(ctx->cq_wait_page, pages);
+                WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+                return 0;
+        }
+        ret = -ENOMEM;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+out_free:
+        io_pages_free(&pages, nr_pages);
+        return ret;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                        break;
                ret = io_register_resize_rings(ctx, arg);
                break;
+        case IORING_REGISTER_CQWAIT_REG:
+                ret = -EINVAL;
+                if (!arg || nr_args != 1)
+                        break;
+                ret = io_register_cqwait_reg(ctx, arg);
+                break;
        default:
                ret = -EINVAL;
                break;
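[Editor's note] A side note on the offset checks in io_register_cqwait_reg() above: the
registered array does not have to start on a page boundary, it only has to start at a multiple
of struct_size and stay within a single page. A hypothetical example those checks would accept,
reusing the arg variable from the registration sketch after the commit message (page_base is
assumed to be an unsigned char * to the start of a page-aligned buffer):

        /* hypothetical: 8 entries placed 2048 bytes into a page-aligned buffer */
        struct io_uring_reg_wait *reg = (void *)(page_base + 2048);

        arg.struct_size = sizeof(*reg);         /* 64 bytes */
        arg.nr_entries = 8;                     /* len = 8 * 64 = 512 bytes */
        arg.user_addr = (unsigned long) reg;    /* poff = 2048, 2048 % 64 == 0 */
        /* poff + len = 2560 <= PAGE_SIZE (4096), so the checks above pass */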