diff options
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r-- | io_uring/io_uring.c | 156 |
1 files changed, 71 insertions, 85 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b0dfef93480..7370f763346f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -115,7 +115,7 @@ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -143,7 +143,8 @@ struct io_defer_entry { static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all); + bool cancel_all, + bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); @@ -156,7 +157,7 @@ static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, @@ -314,16 +315,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, - sizeof(struct async_poll)); + sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_msghdr)); + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_rw)); + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_uring_cmd_data)); + sizeof(struct io_uring_cmd_data), 0); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb)); + sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); if (ret) goto free_ref; @@ -350,7 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); - mutex_init(&ctx->resize_lock); + mutex_init(&ctx->mmap_lock); return ctx; @@ -361,7 +364,7 @@ err: io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); @@ -550,8 +553,9 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -562,6 +566,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) io_req_task_queue(de->req); kfree(de); } + spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -570,11 +575,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); + if (ctx->drain_active) io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } @@ -897,7 +899,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; @@ -1401,6 +1403,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1440,7 +1448,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); @@ -1640,19 +1653,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -1818,7 +1818,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1829,7 +1829,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } @@ -2633,36 +2633,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; - vunmap(ctx->rings); - vunmap(ctx->sq_sqes); - } - + io_free_region(ctx, &ctx->sq_region); + io_free_region(ctx, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -2734,7 +2708,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); io_free_region(ctx, &ctx->param_region); @@ -2896,7 +2870,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -3064,7 +3039,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all) + bool cancel_all, + bool is_sqpoll_thread) { struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; @@ -3094,7 +3070,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { + is_sqpoll_thread) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; @@ -3167,13 +3143,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) continue; loop |= io_uring_try_cancel_requests(node->ctx, current->io_uring, - cancel_all); + cancel_all, + false); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, current->io_uring, - cancel_all); + cancel_all, + true); } if (loop) { @@ -3480,9 +3458,10 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3493,15 +3472,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); - - if (IS_ERR(rings)) - return PTR_ERR(rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; @@ -3518,17 +3499,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_rings_free(ctx); - return PTR_ERR(ptr); + return ret; } - - ctx->sq_sqes = ptr; + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } @@ -3736,7 +3718,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | + IORING_FEAT_RW_ATTR; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3897,6 +3880,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != @@ -3937,6 +3922,7 @@ static int __init io_uring_init(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); |