summaryrefslogtreecommitdiff
path: root/io_uring/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--io_uring/io_uring.c219
1 files changed, 157 insertions, 62 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 33597284e1cb..29adfc6d6ec2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -71,6 +71,7 @@
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/vmalloc.h>
#include <asm/shmparam.h>
#define CREATE_TRACE_POINTS
@@ -371,24 +372,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
return req->link;
}
-static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
-{
- if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
- return NULL;
- return __io_prep_linked_timeout(req);
-}
-
-static noinline void __io_arm_ltimeout(struct io_kiocb *req)
-{
- io_queue_linked_timeout(__io_prep_linked_timeout(req));
-}
-
-static inline void io_arm_ltimeout(struct io_kiocb *req)
-{
- if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
- __io_arm_ltimeout(req);
-}
-
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -436,7 +419,6 @@ static void io_prep_async_link(struct io_kiocb *req)
static void io_queue_iowq(struct io_kiocb *req)
{
- struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
BUG_ON(!tctx);
@@ -461,8 +443,6 @@ static void io_queue_iowq(struct io_kiocb *req)
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
- if (link)
- io_queue_linked_timeout(link);
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
@@ -647,6 +627,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
* to care for a non-real case.
*/
if (need_resched()) {
+ ctx->cqe_sentinel = ctx->cqe_cached;
io_cq_unlock_post(ctx);
mutex_unlock(&ctx->uring_lock);
cond_resched();
@@ -1673,7 +1654,7 @@ queue:
spin_unlock(&ctx->completion_lock);
io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
+ de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
ret = -ENOMEM;
io_req_complete_failed(req, ret);
@@ -1740,17 +1721,24 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
return !!req->file;
}
+#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT)
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
const struct cred *creds = NULL;
+ struct io_kiocb *link = NULL;
int ret;
if (unlikely(!io_assign_file(req, issue_flags)))
return -EBADF;
- if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
- creds = override_creds(req->creds);
+ if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) {
+ if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ creds = override_creds(req->creds);
+ if (req->flags & REQ_F_ARM_LTIMEOUT)
+ link = __io_prep_linked_timeout(req);
+ }
if (!def->audit_skip)
audit_uring_entry(req->opcode);
@@ -1760,8 +1748,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (!def->audit_skip)
audit_uring_exit(!ret, ret);
- if (creds)
- revert_creds(creds);
+ if (unlikely(creds || link)) {
+ if (creds)
+ revert_creds(creds);
+ if (link)
+ io_queue_linked_timeout(link);
+ }
if (ret == IOU_OK) {
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
@@ -1808,8 +1800,6 @@ void io_wq_submit_work(struct io_wq_work *work)
else
req_ref_get(req);
- io_arm_ltimeout(req);
-
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
fail:
@@ -1907,15 +1897,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
static void io_queue_async(struct io_kiocb *req, int ret)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout;
-
if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
io_req_complete_failed(req, ret);
return;
}
- linked_timeout = io_prep_linked_timeout(req);
-
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_kbuf_recycle(req, 0);
@@ -1928,9 +1914,6 @@ static void io_queue_async(struct io_kiocb *req, int ret)
case IO_APOLL_OK:
break;
}
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
}
static inline void io_queue_sqe(struct io_kiocb *req)
@@ -1944,9 +1927,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret))
- io_arm_ltimeout(req);
- else
+ if (unlikely(ret))
io_queue_async(req, ret);
}
@@ -2513,23 +2494,118 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_mem_free(void *ptr)
+static void io_pages_unmap(void *ptr, struct page ***pages,
+ unsigned short *npages)
{
- struct page *page;
+ bool do_vunmap = false;
if (!ptr)
return;
- page = virt_to_head_page(ptr);
- if (put_page_testzero(page))
- free_compound_page(page);
+ if (*npages) {
+ struct page **to_free = *pages;
+ int i;
+
+ /*
+ * Only did vmap for the non-compound multiple page case.
+ * For the compound page, we just need to put the head.
+ */
+ if (PageCompound(to_free[0]))
+ *npages = 1;
+ else if (*npages > 1)
+ do_vunmap = true;
+ for (i = 0; i < *npages; i++)
+ put_page(to_free[i]);
+ }
+ if (do_vunmap)
+ vunmap(ptr);
+ kvfree(*pages);
+ *pages = NULL;
+ *npages = 0;
+}
+
+static void io_rings_free(struct io_ring_ctx *ctx)
+{
+ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+ ctx->rings = NULL;
+ ctx->sq_sqes = NULL;
+}
+
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+ size_t size, gfp_t gfp)
+{
+ struct page *page;
+ int i, order;
+
+ order = get_order(size);
+ if (order > 10)
+ return ERR_PTR(-ENOMEM);
+ else if (order)
+ gfp |= __GFP_COMP;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < nr_pages; i++)
+ pages[i] = page + i;
+
+ return page_address(page);
}
-static void *io_mem_alloc(size_t size)
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+ gfp_t gfp)
{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+ void *ret;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(gfp);
+ if (!pages[i])
+ goto err;
+ }
- return (void *) __get_free_pages(gfp, get_order(size));
+ ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (ret)
+ return ret;
+err:
+ while (i--)
+ put_page(pages[i]);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages;
+ int nr_pages;
+ void *ret;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+ if (!IS_ERR(ret))
+ goto done;
+ if (nr_pages == 1)
+ goto fail;
+
+ ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+ if (!IS_ERR(ret)) {
+done:
+ *out_pages = pages;
+ *npages = nr_pages;
+ return ret;
+ }
+fail:
+ kvfree(pages);
+ *out_pages = NULL;
+ *npages = 0;
+ return ret;
}
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
@@ -2680,8 +2756,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
+ io_rings_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@@ -3114,11 +3189,9 @@ static void *io_uring_validate_mmap_request(struct file *file,
switch (offset) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
- ptr = ctx->rings;
- break;
+ return ctx->rings;
case IORING_OFF_SQES:
- ptr = ctx->sq_sqes;
- break;
+ return ctx->sq_sqes;
default:
return ERR_PTR(-EINVAL);
}
@@ -3130,11 +3203,23 @@ static void *io_uring_validate_mmap_request(struct file *file,
return ptr;
}
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages)
+{
+ unsigned long nr_pages = npages;
+
+ vma->vm_flags |= VM_DONTEXPAND;
+ return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
#ifdef CONFIG_MMU
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
+ long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned int npages;
unsigned long pfn;
void *ptr;
@@ -3142,6 +3227,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
if (IS_ERR(ptr))
return PTR_ERR(ptr);
+ switch (offset & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
+ case IORING_OFF_SQES:
+ return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+ ctx->n_sqe_pages);
+ }
+
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
@@ -3422,6 +3517,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
{
struct io_rings *rings;
size_t size, sq_array_offset;
+ void *ptr;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
@@ -3431,9 +3527,9 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
- rings = io_mem_alloc(size);
- if (!rings)
- return -ENOMEM;
+ rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
+ if (IS_ERR(rings))
+ return PTR_ERR(rings);
ctx->rings = rings;
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
@@ -3447,18 +3543,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
else
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
+ io_rings_free(ctx);
return -EOVERFLOW;
}
- ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
- return -ENOMEM;
+ ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
+ if (IS_ERR(ptr)) {
+ io_rings_free(ctx);
+ return PTR_ERR(ptr);
}
+ ctx->sq_sqes = ptr;
return 0;
}