diff options
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r-- | io_uring/io_uring.c | 602 |
1 files changed, 2 insertions, 600 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb4f3ffa58c8..e395167999ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" +#include "kbuf.h" #include "xattr.h" #include "nop.h" @@ -171,42 +172,10 @@ struct io_rsrc_data { bool quiesce; }; -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. - */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -#define BGID_ARRAY 64 - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> @@ -226,15 +195,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -399,110 +359,6 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2296,96 +2152,6 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, req->imu); } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u16 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) - return NULL; - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). - */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -3098,258 +2864,6 @@ err: return ret; } -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; - p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. - */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -5218,8 +4732,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; @@ -5543,33 +5056,6 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; @@ -6953,89 +6439,6 @@ err: return ret; } -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; - struct page **pages; - int nr_pages; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(®, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return -ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -7701,7 +7104,6 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); |