author | Jens Axboe <axboe@kernel.dk> | 2024-10-22 22:47:00 +0300
committer | Jens Axboe <axboe@kernel.dk> | 2024-10-29 22:43:28 +0300
commit | aa00f67adc2c0d6439f81b5a81ff181377c47a7e (patch)
tree | 816f24bc161dcf481d895ef1a45dbcd495209802 /io_uring/register.c
parent | 371b47da25e1f7a1a6323f84c776bd9fa079a490 (diff)
download | linux-aa00f67adc2c0d6439f81b5a81ff181377c47a7e.tar.xz
io_uring: add support for fixed wait regions
Generally applications have one or a few ways of waiting, yet they pass
in a struct io_uring_getevents_arg every time. That struct needs to get
copied and, in turn, the timeout value needs to get copied as well.
Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.
At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:
    struct io_uring_reg_wait *reg;

    reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

    /* set timeout and mark as set, sigmask/sigmask_sz as needed */
    reg->ts.tv_sec = 0;
    reg->ts.tv_nsec = 100000;
    reg->flags = IORING_REG_WAIT_TS;
where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:
    struct __kernel_timespec timeout = { .tv_nsec = 100000, };

    io_uring_submit_and_wait_timeout(ring, &cqe, nr, &timeout, NULL);
to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:
    io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);
to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timespec
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:
    struct io_uring_reg_wait {
            struct __kernel_timespec ts;
            __u32 min_wait_usec;
            __u32 flags;
            __u64 sigmask;
            __u32 sigmask_sz;
            __u32 pad[3];
            __u64 pad2[2];
    };
embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatibility reasons and because there don't seem
to be a lot of high frequency wait scenarios that involve setting and
resetting the signal mask for each wait.
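For illustration, a minimal sketch of pointing a region at a signal mask,
assuming the sigmask/sigmask_sz fields follow the same convention as
struct io_uring_getevents_arg (a user pointer plus a size of _NSIG / 8
bytes):

    sigset_t mask;

    sigemptyset(&mask);
    sigaddset(&mask, SIGINT);

    /* the mask is still read via this pointer at wait time */
    reg->sigmask = (unsigned long long) (uintptr_t) &mask;
    reg->sigmask_sz = _NSIG / 8;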
The application is free to modify any region before a wait call, or it
can keep multiple regions with different settings to avoid needing to
modify the same one for each wait call. Up to a page size of regions is
mapped by default, allowing PAGE_SIZE / 64 available regions for use.
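As a sketch, assuming the io_uring_setup_reg_wait() /
io_uring_submit_and_wait_reg() helpers shown above and that unused fields
are left zeroed, an application could keep a short and a long timeout side
by side and pick the index that fits the current wait:

    struct io_uring_reg_wait *reg;

    reg = io_uring_setup_reg_wait(ring, 2, &ret);

    /* index 0: short 100usec timeout for busy periods */
    reg[0].ts.tv_nsec = 100000;
    reg[0].flags = IORING_REG_WAIT_TS;

    /* index 1: longer 1msec timeout for idle periods */
    reg[1].ts.tv_nsec = 1000000;
    reg[1].flags = IORING_REG_WAIT_TS;

    /* busy path: wait with region 0 */
    io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

    /* idle path: wait with region 1 */
    io_uring_submit_and_wait_reg(ring, &cqe, nr, 1);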
The registered region must fit within a page. On a 4KB page size system,
that allows for 64 wait regions if a full page is used, as struct
io_uring_reg_wait is 64 bytes in size. The registered region must be
aligned to the size of struct io_uring_reg_wait. It's valid to register
fewer than 64 entries.
In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.
Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.
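As a sketch of the latter, assuming min_wait_usec follows io_uring's
existing min-timeout semantics (wait up to that long for the full
requested batch before falling back to the regular timeout), a region can
carry both values:

    /* wait up to 50usec for the full batch, 1msec overall */
    reg->min_wait_usec = 50;
    reg->ts.tv_sec = 0;
    reg->ts.tv_nsec = 1000000;
    reg->flags = IORING_REG_WAIT_TS;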
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'io_uring/register.c')
-rw-r--r-- | io_uring/register.c | 82 |
1 file changed, 82 insertions, 0 deletions
diff --git a/io_uring/register.c b/io_uring/register.c
index fc6c94d694b2..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,82 @@ out:
         return ret;
 }
 
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+        unsigned short npages = 1;
+
+        if (!ctx->cq_wait_page)
+                return;
+
+        io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+        ctx->cq_wait_arg = NULL;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+        struct io_uring_cqwait_reg_arg arg;
+        struct io_uring_reg_wait *reg;
+        struct page **pages;
+        unsigned long len;
+        int nr_pages, poff;
+        int ret;
+
+        if (ctx->cq_wait_page || ctx->cq_wait_arg)
+                return -EBUSY;
+        if (copy_from_user(&arg, uarg, sizeof(arg)))
+                return -EFAULT;
+        if (!arg.nr_entries || arg.flags)
+                return -EINVAL;
+        if (arg.struct_size != sizeof(*reg))
+                return -EINVAL;
+        if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+                return -EOVERFLOW;
+        if (len > PAGE_SIZE)
+                return -EINVAL;
+        /* offset + len must fit within a page, and must be reg_wait aligned */
+        poff = arg.user_addr & ~PAGE_MASK;
+        if (len + poff > PAGE_SIZE)
+                return -EINVAL;
+        if (poff % arg.struct_size)
+                return -EINVAL;
+
+        pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+        if (IS_ERR(pages))
+                return PTR_ERR(pages);
+        ret = -EINVAL;
+        if (nr_pages != 1)
+                goto out_free;
+        if (ctx->user) {
+                ret = __io_account_mem(ctx->user, 1);
+                if (ret)
+                        goto out_free;
+        }
+
+        reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+        if (reg) {
+                ctx->cq_wait_index = arg.nr_entries - 1;
+                WRITE_ONCE(ctx->cq_wait_page, pages);
+                WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+                return 0;
+        }
+        ret = -ENOMEM;
+        if (ctx->user)
+                __io_unaccount_mem(ctx->user, 1);
+out_free:
+        io_pages_free(&pages, nr_pages);
+        return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                                void __user *arg, unsigned nr_args)
         __releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                         break;
                 ret = io_register_resize_rings(ctx, arg);
                 break;
+        case IORING_REGISTER_CQWAIT_REG:
+                ret = -EINVAL;
+                if (!arg || nr_args != 1)
+                        break;
+                ret = io_register_cqwait_reg(ctx, arg);
+                break;
         default:
                 ret = -EINVAL;
                 break;
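For reference, a minimal userspace sketch of what the registration path
above expects. The field names (nr_entries, flags, struct_size, user_addr)
mirror what io_register_cqwait_reg() reads; the exact uapi layout of
struct io_uring_cqwait_reg_arg and the use of the raw io_uring_register(2)
syscall are assumptions, as neither is shown in this diff:

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int register_cqwait(int ring_fd, struct io_uring_reg_wait *regions,
                               unsigned nr_entries)
    {
            struct io_uring_cqwait_reg_arg arg = {
                    .nr_entries = nr_entries,
                    .flags = 0,
                    .struct_size = sizeof(*regions),
                    .user_addr = (uintptr_t) regions,
            };

            /* nr_args must be 1, matching the check in __io_uring_register() */
            return syscall(__NR_io_uring_register, ring_fd,
                           IORING_REGISTER_CQWAIT_REG, &arg, 1);
    }

The regions buffer must not cross a page boundary and its offset within
the page must be a multiple of sizeof(struct io_uring_reg_wait);
allocating it page-aligned, e.g. with aligned_alloc() and the page size,
satisfies both checks.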