From 81cf17cd3ab3e5441e876a8e9e9c38ae9920cecb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:01:45 -0600 Subject: io_uring/kbuf: rename struct io_uring_buf_reg 'pad' to'flags' In preparation for allowing flags to be set for registration, rename the padding and use it for that. Acked-by: Helge Deller Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 709de6d4feb2..c3f3ea997f3a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -640,7 +640,7 @@ struct io_uring_buf_reg { __u64 ring_addr; __u32 ring_entries; __u16 bgid; - __u16 pad; + __u16 flags; __u64 resv[3]; }; -- cgit v1.2.3 From c56e022c0a27142b7b59ae6bdf45f86bf4b298a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Mar 2023 11:07:19 -0600 Subject: io_uring: add support for user mapped provided buffer ring The ring mapped provided buffer rings rely on the application allocating the memory for the ring, and then the kernel will map it. This generally works fine, but runs into issues on some architectures where we need to be able to ensure that the kernel and application virtual address for the ring play nicely together. This at least impacts architectures that set SHM_COLOUR, but potentially also anyone setting SHMLBA. To use this variant of ring provided buffers, the application need not allocate any memory for the ring. Instead the kernel will do so, and the allocation must subsequently call mmap(2) on the ring with the offset set to: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get a virtual address for the buffer ring. Normally the application would allocate a suitable piece of memory (and correctly aligned) and simply pass that in via io_uring_buf_reg.ring_addr and the kernel would map it. Outside of the setup differences, the kernel allocate + user mapped provided buffer ring works exactly the same. Acked-by: Helge Deller Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 ++++++++ io_uring/io_uring.c | 13 +++++- io_uring/kbuf.c | 99 +++++++++++++++++++++++++++++++++---------- io_uring/kbuf.h | 4 ++ 4 files changed, 109 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index c3f3ea997f3a..1d59c816a5b8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -389,6 +389,9 @@ enum { #define IORING_OFF_SQ_RING 0ULL #define IORING_OFF_CQ_RING 0x8000000ULL #define IORING_OFF_SQES 0x10000000ULL +#define IORING_OFF_PBUF_RING 0x80000000ULL +#define IORING_OFF_PBUF_SHIFT 16 +#define IORING_OFF_MMAP_MASK 0xf8000000ULL /* * Filled with the offset for mmap(2) @@ -635,6 +638,20 @@ struct io_uring_buf_ring { }; }; +/* + * Flags for IORING_REGISTER_PBUF_RING. + * + * IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring. + * The application must not set a ring_addr in struct + * io_uring_buf_reg, instead it must subsequently call + * mmap(2) with the offset set as: + * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) + * to get a virtual mapping for the ring. + */ +enum { + IOU_PBUF_RING_MMAP = 1, +}; + /* argument for IORING_(UN)REGISTER_PBUF_RING */ struct io_uring_buf_reg { __u64 ring_addr; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b49b7ee12d60..d72aa92ce2d6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3289,7 +3289,7 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; - switch (offset) { + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; @@ -3297,6 +3297,17 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; + case IORING_OFF_PBUF_RING: { + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + mutex_lock(&ctx->uring_lock); + ptr = io_pbuf_get_address(ctx, bgid); + mutex_unlock(&ctx->uring_lock); + if (!ptr) + return ERR_PTR(-EINVAL); + break; + } default: return ERR_PTR(-EINVAL); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 4b2f4a0ee962..cd1d9dddf58e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return NULL; head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); @@ -214,15 +215,27 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_mapped && bl->buf_nr_pages) { - int j; - + if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (bl->is_mmap) { + if (bl->buf_ring) { + struct page *page; + + page = virt_to_head_page(bl->buf_ring); + if (put_page_testzero(page)) + free_compound_page(page); + bl->buf_ring = NULL; + } + bl->is_mmap = 0; + } else if (bl->buf_nr_pages) { + int j; + + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->is_mapped = 0; @@ -482,6 +495,25 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_nr_pages = nr_pages; bl->buf_ring = br; bl->is_mapped = 1; + bl->is_mmap = 0; + return 0; +} + +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; return 0; } @@ -496,12 +528,18 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) + if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + } else { + if (reg.ring_addr) + return -EINVAL; + } + if (!is_power_of_2(reg.ring_entries)) return -EINVAL; @@ -526,17 +564,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - ret = io_pin_pbuf_ring(®, bl); - if (ret) { - kfree(free_bl); - return ret; - } + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; + } + + kfree(free_bl); + return ret; } int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) @@ -564,3 +606,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } return 0; } + +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +{ + struct io_buffer_list *bl; + + bl = io_buffer_get_list(ctx, bgid); + if (!bl || !bl->is_mmap) + return NULL; + + return bl->buf_ring; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 61b9c7dade9d..d14345ef61fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -26,6 +26,8 @@ struct io_buffer_list { /* ring mapped provided buffers */ __u8 is_mapped; + /* ring mapped provided buffers, but mmap'ed by application */ + __u8 is_mmap; }; struct io_buffer { @@ -53,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) { /* -- cgit v1.2.3 From d322818ef4c752d79cd667474418691237aa9ccf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:34:48 +0100 Subject: io_uring: kill unused notif declarations There are two leftover structures from the notification registration mechanism that has never been released, kill them. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f05f65aebaf8b1b5bf28519a8fdb350e3e7c9ad0.1679924536.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1d59c816a5b8..f8d14d1c58d3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -571,19 +571,6 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; -struct io_uring_notification_slot { - __u64 tag; - __u64 resv[3]; -}; - -struct io_uring_notification_register { - __u32 nr_slots; - __u32 resv; - __u64 resv2; - __u64 data; - __u64 resv3; -}; - /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) -- cgit v1.2.3 From ea97f6c8558e83cb457c3b5f53351e4fd8519ab1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 18 Apr 2023 15:58:18 -0700 Subject: io_uring: add support for multishot timeouts A multishot timeout submission will repeatedly generate completions with the IORING_CQE_F_MORE cflag set. Depending on the value of the `off' field in the submission, these timeouts can either repeat indefinitely until cancelled (`off' = 0) or for a fixed number of times (`off' > 0). Only noseq timeouts (i.e. not dependent on the number of I/O completions) are supported. An indefinite timer will be cancelled if the CQ ever overflows. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20230418225817.1905027-1-davidhwei@meta.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/timeout.c | 57 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f8d14d1c58d3..0716cb17e436 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -250,6 +250,7 @@ enum io_uring_op { #define IORING_TIMEOUT_REALTIME (1U << 3) #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) +#define IORING_TIMEOUT_MULTISHOT (1U << 6) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5c6c6f720809..fc950177e2e1 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -17,6 +17,7 @@ struct io_timeout { struct file *file; u32 off; u32 target_seq; + u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; @@ -37,8 +38,9 @@ struct io_timeout_rem { static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; - return !timeout->off; + return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) @@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req) } } +static inline bool io_timeout_finish(struct io_timeout *timeout, + struct io_timeout_data *data) +{ + if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) + return true; + + if (!timeout->off || (timeout->repeats && --timeout->repeats)) + return false; + + return true; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); + +static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + + if (!io_timeout_finish(timeout, data)) { + bool filled; + filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME, + IORING_CQE_F_MORE, false); + if (filled) { + /* re-arm timer */ + spin_lock_irq(&ctx->timeout_lock); + list_add(&timeout->list, ctx->timeout_list.prev); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return; + } + } + + io_req_task_complete(req, ts); +} + static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->timeout_lock) { @@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) req_set_fail(req); io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; + req->io_task_work.func = io_timeout_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -470,16 +510,27 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) + IORING_TIMEOUT_ETIME_SUCCESS | + IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + /* multishot requests only make sense with rel values */ + if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) + return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; + /* + * for multishot reqs w/ fixed nr of repeats, repeats tracks the + * remaining nr + */ + timeout->repeats = 0; + if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) + timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; -- cgit v1.2.3