diff options
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r-- | io_uring/io_uring.c | 219 |
1 files changed, 157 insertions, 62 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 33597284e1cb..29adfc6d6ec2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -71,6 +71,7 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/vmalloc.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS @@ -371,24 +372,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -436,7 +419,6 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; BUG_ON(!tctx); @@ -461,8 +443,6 @@ static void io_queue_iowq(struct io_kiocb *req) trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } static __cold void io_queue_deferred(struct io_ring_ctx *ctx) @@ -647,6 +627,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) * to care for a non-real case. */ if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); @@ -1673,7 +1654,7 @@ queue: spin_unlock(&ctx->completion_lock); io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { ret = -ENOMEM; io_req_complete_failed(req, ret); @@ -1740,17 +1721,24 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return !!req->file; } +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct io_op_def *def = &io_op_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; if (unlikely(!io_assign_file(req, issue_flags))) return -EBADF; - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1760,8 +1748,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) @@ -1808,8 +1800,6 @@ void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { fail: @@ -1907,15 +1897,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_complete_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1928,9 +1914,6 @@ static void io_queue_async(struct io_kiocb *req, int ret) case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) @@ -1944,9 +1927,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) - io_arm_ltimeout(req); - else + if (unlikely(ret)) io_queue_async(req, ret); } @@ -2513,23 +2494,118 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_mem_free(void *ptr) +static void io_pages_unmap(void *ptr, struct page ***pages, + unsigned short *npages) { - struct page *page; + bool do_vunmap = false; if (!ptr) return; - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); + if (*npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. + */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + +static void io_rings_free(struct io_ring_ctx *ctx) +{ + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); + ctx->rings = NULL; + ctx->sq_sqes = NULL; +} + +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > 10) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); } -static void *io_mem_alloc(size_t size) +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) { - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } - return (void *) __get_free_pages(gfp, get_order(size)); + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +static void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + if (nr_pages == 1) + goto fail; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } +fail: + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, @@ -2680,8 +2756,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mmdrop(ctx->mm_account); ctx->mm_account = NULL; } - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_rings_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3114,11 +3189,9 @@ static void *io_uring_validate_mmap_request(struct file *file, switch (offset) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - ptr = ctx->rings; - break; + return ctx->rings; case IORING_OFF_SQES: - ptr = ctx->sq_sqes; - break; + return ctx->sq_sqes; default: return ERR_PTR(-EINVAL); } @@ -3130,11 +3203,23 @@ static void *io_uring_validate_mmap_request(struct file *file, return ptr; } +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vma->vm_flags |= VM_DONTEXPAND; + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + #ifdef CONFIG_MMU static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { + struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; unsigned long pfn; void *ptr; @@ -3142,6 +3227,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) if (IS_ERR(ptr)) return PTR_ERR(ptr); + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + } + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } @@ -3422,6 +3517,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_rings *rings; size_t size, sq_array_offset; + void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3431,9 +3527,9 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); + if (IS_ERR(rings)) + return PTR_ERR(rings); ctx->rings = rings; ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); @@ -3447,18 +3543,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; + io_rings_free(ctx); return -EOVERFLOW; } - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); + if (IS_ERR(ptr)) { + io_rings_free(ctx); + return PTR_ERR(ptr); } + ctx->sq_sqes = ptr; return 0; } |