Diffstat (limited to 'io_uring')
io_uring/advise.c    |  29
io_uring/fs.c        |  20
io_uring/io_uring.c  | 474
io_uring/io_uring.h  |  97
io_uring/msg_ring.c  |  31
io_uring/net.c       |  21
io_uring/notif.c     |   3
io_uring/opdef.c     | 340
io_uring/opdef.h     |  13
io_uring/openclose.c |  18
io_uring/poll.c      |   2
io_uring/rw.c        |  13
io_uring/splice.c    |   7
io_uring/sqpoll.c    |   3
io_uring/sqpoll.h    |   2
io_uring/statx.c     |   4
io_uring/sync.c      |  14
io_uring/xattr.c     |  14
18 files changed, 691 insertions, 414 deletions
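
The recurring pattern across these hunks: operations that can never complete without blocking stop bouncing back with -EAGAIN from their issue handler. Instead, prep marks the request REQ_F_FORCE_ASYNC, io_req_task_submit() routes such requests straight to io-wq via io_queue_iowq(), and the issue handler only asserts that it was not invoked with IO_URING_F_NONBLOCK, as the advise.c, fs.c, net.c and openclose.c hunks below show. What follows is a minimal userspace sketch of that prep-time decision, using fadvise as the example; struct io_kiocb, the flag values and the dispatch helpers here are simplified stand-ins, not the kernel definitions.

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's flag bits. */
#define REQ_F_FORCE_ASYNC   (1u << 0)
#define IO_URING_F_NONBLOCK (1u << 1)

struct io_kiocb {               /* stand-in request, not the kernel struct */
	unsigned int flags;
	int advice;             /* POSIX_FADV_* value */
};

/* Mirrors io_fadvise_force_async(): only advice hints are non-blocking. */
static bool fadvise_force_async(int advice)
{
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
		return false;
	default:
		return true;
	}
}

/* prep: decide once, up front, whether the op must run from a worker. */
static void fadvise_prep(struct io_kiocb *req)
{
	if (fadvise_force_async(req->advice))
		req->flags |= REQ_F_FORCE_ASYNC;
}

/* issue: no -EAGAIN bounce any more, only an assertion like WARN_ON_ONCE(). */
static void fadvise_issue(struct io_kiocb *req, unsigned int issue_flags)
{
	assert(!((issue_flags & IO_URING_F_NONBLOCK) &&
		 fadvise_force_async(req->advice)));
	/* the real handler would call vfs_fadvise() here */
}

/* Dispatch modelled on the io_req_task_submit() change. */
static void submit(struct io_kiocb *req)
{
	fadvise_prep(req);
	if (req->flags & REQ_F_FORCE_ASYNC)
		fadvise_issue(req, 0);                   /* worker: may block */
	else
		fadvise_issue(req, IO_URING_F_NONBLOCK); /* inline attempt */
}

int main(void)
{
	struct io_kiocb willneed = { .advice = POSIX_FADV_WILLNEED };
	struct io_kiocb normal   = { .advice = POSIX_FADV_NORMAL };

	submit(&willneed);      /* forced to the blocking path at prep time */
	submit(&normal);        /* safe to attempt inline */
	puts("force-async decided at prep, not retried at issue");
	return 0;
}

The payoff of the pattern is that a request known to block never wastes an inline issue attempt, and the nonblocking-retry logic lives in the core dispatcher instead of being repeated in every handler.
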
diff --git a/io_uring/advise.c b/io_uring/advise.c index 449c6f14649f..7085804c513c 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); + req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; @@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); @@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) #endif } +static bool io_fadvise_force_async(struct io_fadvise *fa) +{ + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + return false; + default: + return true; + } +} + int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); @@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); + if (io_fadvise_force_async(fa)) + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) diff --git a/io_uring/fs.c b/io_uring/fs.c index 7100c293c13a..f6a69a549fd4 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ren->newpath, ren->flags); @@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(un->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = do_rmdir(un->dfd, un->filename); @@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(mkd->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); 
int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); @@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); @@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, lnk->newpath, lnk->flags); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db623b3185c8..3b915deb4d08 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -151,7 +151,7 @@ static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static __cold void io_fallback_tw(struct io_uring_task *tctx); -static struct kmem_cache *req_cachep; +struct kmem_cache *req_cachep; struct sock *io_uring_get_socket(struct file *file) { @@ -230,6 +230,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + kasan_poison_object_data(req_cachep, req); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -245,17 +246,15 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = false; + bool locked = true; - percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); + if (WARN_ON_ONCE(!locked)) + return; + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) @@ -316,6 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); @@ -407,7 +407,7 @@ static inline void io_arm_ltimeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { @@ -572,6 +572,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) void 
__io_commit_cqring_flush(struct io_ring_ctx *ctx) { + if (ctx->poll_activated) + io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) { @@ -618,6 +620,25 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); + + /* + * As ->task_complete implies that the ring is single tasked, cq_wait + * may only be waited on by the current in io_cqring_wait(), but since + * it will re-check the wakeup conditions once we return we can safely + * skip waking it up. + */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { + smp_mb(); + __io_cqring_wake(ctx); + } +} + void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { @@ -645,7 +666,6 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) } } -/* Returns true if there are no backlogged entries after the flush */ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); @@ -693,7 +713,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cqring_do_overflow_flush(ctx); } -void __io_put_task(struct task_struct *task, int nr) +/* can be called by any task */ +static void io_put_task_remote(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -703,6 +724,21 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } +/* used by a task to put its own references */ +static void io_put_task_local(struct task_struct *task, int nr) +{ + task->io_uring->cached_refs += nr; +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + io_put_task_local(task, nr); + else + io_put_task_remote(task, nr); +} + void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -945,15 +981,15 @@ static void __io_req_complete_post(struct io_kiocb *req) req->link = NULL; } } + io_put_kbuf_comp(req); + io_dismantle_req(req); io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. 
*/ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -980,7 +1016,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); @@ -1076,7 +1112,7 @@ __cold void io_free_req(struct io_kiocb *req) io_req_put_rsrc(req); io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); spin_lock(&ctx->completion_lock); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); @@ -1130,7 +1166,7 @@ static unsigned int handle_tw_list(struct llist_node *node, { unsigned int count = 0; - while (node != last) { + while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1143,10 +1179,16 @@ static unsigned int handle_tw_list(struct llist_node *node, /* if not contended, grab and improve batching */ *locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } + } else if (!*locked) + *locked = mutex_trylock(&(*ctx)->uring_lock); req->io_task_work.func(req, locked); node = next; count++; + if (unlikely(need_resched())) { + ctx_flush_and_put(*ctx, locked); + *ctx = NULL; + cond_resched(); + } } return count; @@ -1190,23 +1232,29 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 1; - unsigned int count; + unsigned int loops = 0; + unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx); return; } - node = io_llist_xchg(&tctx->task_list, &fake); - count = handle_tw_list(node, &ctx, &uring_locked, NULL); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - while (node != &fake) { + do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &uring_locked, &fake); + + /* skip expensive cmpxchg if there are items in the list */ + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_submit_flush_completions(ctx); + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + } node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } + } while (node != &fake); ctx_flush_and_put(ctx, &uring_locked); @@ -1241,7 +1289,7 @@ static void io_req_local_work_add(struct io_kiocb *req) percpu_ref_put(&ctx->refs); return; } - /* need it for the following io_cqring_wake() */ + /* needed for the following wake up */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { @@ -1252,10 +1300,11 @@ static void io_req_local_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) io_eventfd_signal(ctx); - __io_cqring_wake(ctx); + + if (READ_ONCE(ctx->cq_waiting)) + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); percpu_ref_put(&ctx->refs); } @@ -1296,21 +1345,19 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; - struct 
llist_node fake; - struct llist_node *current_final = NULL; - int ret; - unsigned int loops = 1; + unsigned int loops = 0; + int ret = 0; - if (unlikely(ctx->submitter_task != current)) + if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; - - node = io_llist_xchg(&ctx->work_llist, &fake); - ret = 0; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - while (node != current_final) { + node = io_llist_xchg(&ctx->work_llist, NULL); + while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1319,26 +1366,20 @@ again: ret++; node = next; } + loops++; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); - if (node != &fake) { - loops++; - current_final = &fake; - node = io_llist_xchg(&ctx->work_llist, &fake); + if (!llist_empty(&ctx->work_llist)) goto again; - } - - if (*locked) + if (*locked) { io_submit_flush_completions(ctx); + if (!llist_empty(&ctx->work_llist)) + goto again; + } trace_io_uring_local_work_run(ctx, ret, loops); return ret; - } -int io_run_local_work(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { bool locked; int ret; @@ -1346,8 +1387,19 @@ int io_run_local_work(struct io_ring_ctx *ctx) if (llist_empty(&ctx->work_llist)) return 0; - __set_current_state(TASK_RUNNING); - locked = mutex_trylock(&ctx->uring_lock); + locked = true; + ret = __io_run_local_work(ctx, &locked); + /* shouldn't happen! */ + if (WARN_ON_ONCE(!locked)) + mutex_lock(&ctx->uring_lock); + return ret; +} + +static int io_run_local_work(struct io_ring_ctx *ctx) +{ + bool locked = mutex_trylock(&ctx->uring_lock); + int ret; + ret = __io_run_local_work(ctx, &locked); if (locked) mutex_unlock(&ctx->uring_lock); @@ -1365,10 +1417,12 @@ void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else + if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); + else if (req->flags & REQ_F_FORCE_ASYNC) + io_queue_iowq(req, locked); + else + io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1467,7 +1521,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post(ctx); + __io_cq_unlock_post_flush(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); @@ -1708,8 +1762,8 @@ unsigned int io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); + req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1719,20 +1773,21 @@ bool io_alloc_async_data(struct io_kiocb *req) int io_req_prep_async(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & 
REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->prep_async) + if (!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (!io_op_defs[req->opcode].manual_alloc) { + if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } - return def->prep_async(req); + return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -1796,7 +1851,7 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); @@ -1820,9 +1875,10 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, + unsigned int issue_flags) { - if (req->file || !io_op_defs[req->opcode].needs_file) + if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1835,11 +1891,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, issue_flags))) + if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) @@ -1889,7 +1945,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; @@ -1908,7 +1964,7 @@ fail: io_req_task_queue_fail(req, err); return; } - if (!io_assign_file(req, issue_flags)) { + if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; @@ -2104,7 +2160,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def; + const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -2122,7 +2178,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } - def = &io_op_defs[opcode]; + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2333,7 +2389,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. 
*/ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned head, mask = ctx->sq_entries - 1; unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -2351,14 +2407,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; - return &ctx->sq_sqes[head]; + *sqe = &ctx->sq_sqes[head]; + return true; } /* drop invalid entries */ ctx->cq_extra--; WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; + return false; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2379,11 +2436,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &req))) break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { + if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } @@ -2418,13 +2473,13 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; + ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !llist_empty(&ctx->work_llist)); + !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -2443,22 +2498,25 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - struct io_ring_ctx *ctx = iowq->ctx; + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. 
*/ - if (io_should_wake(iowq) || io_has_work(ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work_ctx(ctx) > 0) + if (!llist_empty(&ctx->work_llist)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx) > 0) + return 1; + } + if (io_run_task_work() > 0) return 1; if (task_sigpending(current)) return -EINTR; @@ -2467,35 +2525,23 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t *timeout) + struct io_wait_queue *iowq) { - int ret; - unsigned long check_cq; - - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(ctx); - if (ret || io_should_wake(iowq)) - return ret; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) - return -EBADR; - } - if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) return -ETIME; - - /* - * Run task_work after scheduling. If we got woken because of - * task_work being processed, run it now rather than let the caller - * do another wait loop. - */ - ret = io_run_task_work_sig(ctx); - return ret < 0 ? 
ret : 1; + return 0; } /* @@ -2508,23 +2554,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; - - do { - /* always run at least 1 task work to process local work */ - ret = io_run_task_work_ctx(ctx); - if (ret < 0) - return ret; - io_cqring_overflow_flush(ctx); - - /* if user messes with these they will just get an early return */ - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - } while (ret > 0); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); + io_run_task_work(); + io_cqring_overflow_flush(ctx); + /* if user messes with these they will just get an early return */ + if (__io_cqring_events_user(ctx) >= min_events) + return 0; if (sig) { #ifdef CONFIG_COMPAT @@ -2539,36 +2579,69 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.timeout = KTIME_MAX; + + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } trace_io_uring_cqring_wait(ctx, min_events); do { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - finish_wait(&ctx->cq_wait, &iowq.wq); - io_cqring_do_overflow_flush(ctx); + unsigned long check_cq; + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + WRITE_ONCE(ctx->cq_waiting, 1); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); + } + + ret = io_cqring_wait_schedule(ctx, &iowq); + __set_current_state(TASK_RUNNING); + WRITE_ONCE(ctx->cq_waiting, 0); + + if (ret < 0) + break; + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + io_run_task_work(); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_do_overflow_flush(ctx); + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - if (__io_cqring_events_user(ctx) >= min_events) + + if (io_should_wake(&iowq)) { + ret = 0; break; + } cond_resched(); - } while (ret > 0); + } while (1); - finish_wait(&ctx->cq_wait, &iowq.wq); + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; @@ -2683,14 +2756,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { + struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_kiocb *req = io_alloc_req(ctx); - + req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } @@ -2762,12 +2835,54 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) kfree(ctx); } +static __cold void io_activate_pollwq_cb(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + poll_wq_task_work); + + mutex_lock(&ctx->uring_lock); + ctx->poll_activated = true; + mutex_unlock(&ctx->uring_lock); + + /* + * Wake ups for some events between start of polling and activation + * might've been lost due to loose synchronisation. + */ + wake_up_all(&ctx->poll_wq); + percpu_ref_put(&ctx->refs); +} + +static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); + /* already activated or in progress */ + if (ctx->poll_activated || ctx->poll_wq_task_work.func) + goto out; + if (WARN_ON_ONCE(!ctx->task_complete)) + goto out; + if (!ctx->submitter_task) + goto out; + /* + * with ->submitter_task only the submitter task completes requests, we + * only need to sync with it, which is done by injecting a tw + */ + init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); + percpu_ref_get(&ctx->refs); + if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) + percpu_ref_put(&ctx->refs); +out: + spin_unlock(&ctx->completion_lock); +} + static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + if (unlikely(!ctx->poll_activated)) + io_activate_pollwq(ctx); + + poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -2790,7 +2905,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * pushes them to do the flush. 
*/ - if (io_cqring_events(ctx) || io_has_work(ctx)) + if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3053,10 +3168,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; + cond_resched(); } } - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); @@ -3328,11 +3445,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); @@ -3573,6 +3688,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->task_complete = true; /* + * lazy poll_wq activation relies on ->task_complete for synchronisation + * purposes, see io_activate_pollwq() + */ + if (!ctx->task_complete) + ctx->poll_activated = true; + + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events * polling again, they can rely on io_sq_thread to do polling @@ -3663,7 +3785,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3760,7 +3882,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_op_defs[i].not_supported) + if (!io_issue_defs[i].not_supported) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -3865,8 +3987,15 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + /* + * Lazy activation attempts would fail if it was polled before + * submitter_task is set. + */ + if (wq_has_sleeper(&ctx->poll_wq)) + io_activate_pollwq(ctx); + } if (ctx->restrictions.registered) ctx->restricted = 1; @@ -4177,17 +4306,36 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, struct io_ring_ctx *ctx; long ret = -EBADF; struct fd f; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; + if (use_registered_ring) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. 
+ */ + struct io_uring_task *tctx = current->io_uring; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(f.file)) - goto out_fput; + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (!io_is_uring_fops(f.file)) + goto out_fput; + } ctx = f.file->private_data; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab4b2a1c3b7e..2711865f1e19 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,6 +3,8 @@ #include <linux/errno.h> #include <linux/lockdep.h> +#include <linux/resume_user_mode.h> +#include <linux/kasan.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "io-wq.h" @@ -28,8 +30,6 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); -int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -72,7 +72,6 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); -void __io_put_task(struct task_struct *task, int nr); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); @@ -222,6 +221,13 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) +{ + if (wq_has_sleeper(&ctx->poll_wq)) + __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + /* requires smb_mb() prior, see wq_has_sleeper() */ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { @@ -270,6 +276,15 @@ static inline int io_run_task_work(void) */ if (test_thread_flag(TIF_NOTIFY_SIGNAL)) clear_notify_signal(); + /* + * PF_IO_WORKER never returns to userspace, so check here if we have + * notify work that needs processing. + */ + if (current->flags & PF_IO_WORKER && + test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -284,42 +299,6 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) -{ - int ret = 0; - int ret2; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - ret = io_run_local_work(ctx); - - /* want to run this after in case more is added */ - ret2 = io_run_task_work(); - - /* Try propagate error in favour of if tasks were run, - * but still make sure to run them if requested - */ - if (ret >= 0) - ret += ret2; - - return ret; -} - -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) -{ - bool locked; - int ret; - - if (llist_empty(&ctx->work_llist)) - return 0; - - locked = true; - ret = __io_run_local_work(ctx, &locked); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!locked)) - mutex_lock(&ctx->uring_lock); - return ret; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -345,19 +324,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; @@ -372,19 +343,31 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) return !ctx->submit_state.free_list.next; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +extern struct kmem_cache *req_cachep; + +static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; + struct io_kiocb *req; + + req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); + kasan_unpoison_object_data(req_cachep, req); + wq_stack_extract(&ctx->submit_state.free_list); + return req; } -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) { - struct io_wq_work_node *node; + if (unlikely(io_req_cache_empty(ctx))) { + if (!__io_alloc_req_refill(ctx)) + return false; + } + *req = io_extract_req(ctx); + return true; +} - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); +static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) +{ + return likely(ctx->submitter_task == current); } static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 15602a136821..8803c0979e2a 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -13,6 +13,11 @@ #include "filetable.h" #include "msg_ring.h" + +/* All valid masks for MSG_RING */ +#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ + IORING_MSG_RING_FLAGS_PASS) + struct io_msg { struct file *file; struct file *src_file; @@ -21,7 +26,10 @@ struct io_msg { u32 len; u32 cmd; u32 src_fd; - u32 dst_fd; + union { + u32 dst_fd; + u32 cqe_flags; + }; u32 flags; }; @@ -91,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head) if (current->flags & PF_EXITING) { ret = -EOWNERDEAD; } else { + u32 flags = 0; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + /* * If the target ring is using IOPOLL mode, then we need to be * holding the uring_lock for posting completions. 
Other ring @@ -99,7 +112,7 @@ static void io_msg_tw_complete(struct callback_head *head) */ if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&target_ctx->uring_lock); @@ -114,9 +127,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + u32 flags = 0; int ret; - if (msg->src_fd || msg->dst_fd || msg->flags) + if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) + return -EINVAL; + if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; @@ -124,15 +140,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_complete); + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; io_double_unlock_ctx(target_ctx); } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; } return ret; @@ -241,7 +260,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); - if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; diff --git a/io_uring/net.c b/io_uring/net.c index 90326b279965..cbd4b725f58c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -90,6 +90,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; shutdown->how = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -99,8 +100,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -182,7 +182,7 @@ static int io_setup_async_msg(struct io_kiocb *req, if (async_msg->msg.msg_name) async_msg->msg.msg_name = &async_msg->addr; /* if were using fast_iov, set it to the new one */ - if (!kmsg->free_iov) { + if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; } @@ -345,7 +345,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned flags; int min_ret = 0; @@ -379,7 +378,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = import_single_range(ITER_SOURCE, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); if 
(unlikely(ret)) return ret; @@ -775,10 +774,7 @@ retry_multishot: } } - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; - iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, kmsg->fast_iov, 1, - len); + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); } flags = sr->msg_flags; @@ -846,7 +842,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; - struct iovec iov; unsigned int cflags; unsigned flags; int ret, min_ret = 0; @@ -874,7 +869,7 @@ retry_multishot: sr->buf = buf; } - ret = import_single_range(ITER_DEST, sr->buf, len, &iov, &msg.msg_iter); + ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -1085,7 +1080,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1127,8 +1121,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) msg.sg_from_iter = io_sg_from_iter; } else { io_notif_set_extended(zc->notif); - ret = import_single_range(ITER_SOURCE, zc->buf, zc->len, &iov, - &msg.msg_iter); + ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); if (unlikely(ret)) return ret; ret = io_notif_account_mem(zc->notif, zc->len); diff --git a/io_uring/notif.c b/io_uring/notif.c index c4bb793ebf0e..09dfd0832d19 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) struct io_kiocb *notif; struct io_notif_data *nd; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, ¬if))) return NULL; - notif = io_alloc_req(ctx); notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3aa0d65c50e3..cca7c5b55208 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -46,11 +46,10 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -const struct io_op_def io_op_defs[] = { +const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, - .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -64,13 +63,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", .prep = io_prep_rw, .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -82,18 +76,12 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, - .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -106,11 +94,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -122,30 +107,24 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct 
io_async_rw), - .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, - .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, - .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -155,14 +134,9 @@ const struct io_op_def io_op_defs[] = { .pollout = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif @@ -174,29 +148,21 @@ const struct io_op_def io_op_defs[] = { .buffer_select = 1, .ioprio = 1, .manual_alloc = 1, - .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, - .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -206,7 +172,6 @@ const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ - .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -216,14 +181,11 @@ const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, - .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -231,46 +193,36 @@ const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .name = "CONNECT", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, - .prep_async = io_connect_prep_async, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { - .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, - .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { - .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, - .name = "STATX", .prep = io_statx_prep, .issue = io_statx, - .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .needs_file = 1, @@ -282,11 +234,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -298,21 +247,17 @@ const struct 
io_op_def io_op_defs[] = {
 		.ioprio = 1,
 		.iopoll = 1,
 		.iopoll_queue = 1,
-		.async_size = sizeof(struct io_async_rw),
-		.name = "WRITE",
 		.prep = io_prep_rw,
 		.issue = io_write,
-		.fail = io_rw_fail,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file = 1,
 		.audit_skip = 1,
-		.name = "FADVISE",
 		.prep = io_fadvise_prep,
 		.issue = io_fadvise,
 	},
 	[IORING_OP_MADVISE] = {
-		.name = "MADVISE",
+		.audit_skip = 1,
 		.prep = io_madvise_prep,
 		.issue = io_madvise,
 	},
@@ -323,13 +268,9 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip = 1,
 		.ioprio = 1,
 		.manual_alloc = 1,
-		.name = "SEND",
 #if defined(CONFIG_NET)
-		.async_size = sizeof(struct io_async_msghdr),
 		.prep = io_sendmsg_prep,
 		.issue = io_send,
-		.fail = io_sendrecv_fail,
-		.prep_async = io_send_prep_async,
 #else
 		.prep = io_eopnotsupp_prep,
 #endif
@@ -341,25 +282,20 @@ const struct io_op_def io_op_defs[] = {
 		.buffer_select = 1,
 		.audit_skip = 1,
 		.ioprio = 1,
-		.name = "RECV",
 #if defined(CONFIG_NET)
 		.prep = io_recvmsg_prep,
 		.issue = io_recv,
-		.fail = io_sendrecv_fail,
 #else
 		.prep = io_eopnotsupp_prep,
 #endif
 	},
 	[IORING_OP_OPENAT2] = {
-		.name = "OPENAT2",
 		.prep = io_openat2_prep,
 		.issue = io_openat2,
-		.cleanup = io_open_cleanup,
 	},
 	[IORING_OP_EPOLL_CTL] = {
 		.unbound_nonreg_file = 1,
 		.audit_skip = 1,
-		.name = "EPOLL",
 #if defined(CONFIG_EPOLL)
 		.prep = io_epoll_ctl_prep,
 		.issue = io_epoll_ctl,
@@ -372,21 +308,18 @@ const struct io_op_def io_op_defs[] = {
 		.hash_reg_file = 1,
 		.unbound_nonreg_file = 1,
 		.audit_skip = 1,
-		.name = "SPLICE",
 		.prep = io_splice_prep,
 		.issue = io_splice,
 	},
 	[IORING_OP_PROVIDE_BUFFERS] = {
 		.audit_skip = 1,
 		.iopoll = 1,
-		.name = "PROVIDE_BUFFERS",
 		.prep = io_provide_buffers_prep,
 		.issue = io_provide_buffers,
 	},
 	[IORING_OP_REMOVE_BUFFERS] = {
 		.audit_skip = 1,
 		.iopoll = 1,
-		.name = "REMOVE_BUFFERS",
 		.prep = io_remove_buffers_prep,
 		.issue = io_remove_buffers,
 	},
@@ -395,13 +328,11 @@ const struct io_op_def io_op_defs[] = {
 		.hash_reg_file = 1,
 		.unbound_nonreg_file = 1,
 		.audit_skip = 1,
-		.name = "TEE",
 		.prep = io_tee_prep,
 		.issue = io_tee,
 	},
 	[IORING_OP_SHUTDOWN] = {
 		.needs_file = 1,
-		.name = "SHUTDOWN",
 #if defined(CONFIG_NET)
 		.prep = io_shutdown_prep,
 		.issue = io_shutdown,
@@ -410,72 +341,51 @@ const struct io_op_def io_op_defs[] = {
 #endif
 	},
 	[IORING_OP_RENAMEAT] = {
-		.name = "RENAMEAT",
 		.prep = io_renameat_prep,
 		.issue = io_renameat,
-		.cleanup = io_renameat_cleanup,
 	},
 	[IORING_OP_UNLINKAT] = {
-		.name = "UNLINKAT",
 		.prep = io_unlinkat_prep,
 		.issue = io_unlinkat,
-		.cleanup = io_unlinkat_cleanup,
 	},
 	[IORING_OP_MKDIRAT] = {
-		.name = "MKDIRAT",
 		.prep = io_mkdirat_prep,
 		.issue = io_mkdirat,
-		.cleanup = io_mkdirat_cleanup,
 	},
 	[IORING_OP_SYMLINKAT] = {
-		.name = "SYMLINKAT",
 		.prep = io_symlinkat_prep,
 		.issue = io_symlinkat,
-		.cleanup = io_link_cleanup,
 	},
 	[IORING_OP_LINKAT] = {
-		.name = "LINKAT",
 		.prep = io_linkat_prep,
 		.issue = io_linkat,
-		.cleanup = io_link_cleanup,
 	},
 	[IORING_OP_MSG_RING] = {
 		.needs_file = 1,
 		.iopoll = 1,
-		.name = "MSG_RING",
 		.prep = io_msg_ring_prep,
 		.issue = io_msg_ring,
-		.cleanup = io_msg_ring_cleanup,
 	},
 	[IORING_OP_FSETXATTR] = {
 		.needs_file = 1,
-		.name = "FSETXATTR",
 		.prep = io_fsetxattr_prep,
 		.issue = io_fsetxattr,
-		.cleanup = io_xattr_cleanup,
 	},
 	[IORING_OP_SETXATTR] = {
-		.name = "SETXATTR",
 		.prep = io_setxattr_prep,
 		.issue = io_setxattr,
-		.cleanup = io_xattr_cleanup,
 	},
 	[IORING_OP_FGETXATTR] = {
 		.needs_file = 1,
-		.name = "FGETXATTR",
 		.prep = io_fgetxattr_prep,
 		.issue = io_fgetxattr,
-		.cleanup = io_xattr_cleanup,
 	},
 	[IORING_OP_GETXATTR] = {
-		.name = "GETXATTR",
 		.prep = io_getxattr_prep,
 		.issue = io_getxattr,
-		.cleanup = io_xattr_cleanup,
 	},
 	[IORING_OP_SOCKET] = {
 		.audit_skip = 1,
-		.name = "SOCKET",
 #if defined(CONFIG_NET)
 		.prep = io_socket_prep,
 		.issue = io_socket,
@@ -486,16 +396,12 @@ const struct io_op_def io_op_defs[] = {
 	[IORING_OP_URING_CMD] = {
 		.needs_file = 1,
 		.plug = 1,
-		.name = "URING_CMD",
 		.iopoll = 1,
 		.iopoll_queue = 1,
-		.async_size = uring_cmd_pdu_size(1),
 		.prep = io_uring_cmd_prep,
 		.issue = io_uring_cmd,
-		.prep_async = io_uring_cmd_prep_async,
 	},
 	[IORING_OP_SEND_ZC] = {
-		.name = "SEND_ZC",
 		.needs_file = 1,
 		.unbound_nonreg_file = 1,
 		.pollout = 1,
@@ -503,32 +409,243 @@ const struct io_op_def io_op_defs[] = {
 		.ioprio = 1,
 		.manual_alloc = 1,
 #if defined(CONFIG_NET)
-		.async_size = sizeof(struct io_async_msghdr),
 		.prep = io_send_zc_prep,
 		.issue = io_send_zc,
-		.prep_async = io_send_prep_async,
-		.cleanup = io_send_zc_cleanup,
-		.fail = io_sendrecv_fail,
 #else
 		.prep = io_eopnotsupp_prep,
 #endif
 	},
 	[IORING_OP_SENDMSG_ZC] = {
-		.name = "SENDMSG_ZC",
 		.needs_file = 1,
 		.unbound_nonreg_file = 1,
 		.pollout = 1,
 		.ioprio = 1,
 		.manual_alloc = 1,
 #if defined(CONFIG_NET)
-		.async_size = sizeof(struct io_async_msghdr),
 		.prep = io_send_zc_prep,
 		.issue = io_sendmsg_zc,
+#else
+		.prep = io_eopnotsupp_prep,
+#endif
+	},
+};
+
+
+const struct io_cold_def io_cold_defs[] = {
+	[IORING_OP_NOP] = {
+		.name = "NOP",
+	},
+	[IORING_OP_READV] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "READV",
+		.prep_async = io_readv_prep_async,
+		.cleanup = io_readv_writev_cleanup,
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_WRITEV] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "WRITEV",
+		.prep_async = io_writev_prep_async,
+		.cleanup = io_readv_writev_cleanup,
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_FSYNC] = {
+		.name = "FSYNC",
+	},
+	[IORING_OP_READ_FIXED] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "READ_FIXED",
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_WRITE_FIXED] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "WRITE_FIXED",
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_POLL_ADD] = {
+		.name = "POLL_ADD",
+	},
+	[IORING_OP_POLL_REMOVE] = {
+		.name = "POLL_REMOVE",
+	},
+	[IORING_OP_SYNC_FILE_RANGE] = {
+		.name = "SYNC_FILE_RANGE",
+	},
+	[IORING_OP_SENDMSG] = {
+		.name = "SENDMSG",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_msghdr),
+		.prep_async = io_sendmsg_prep_async,
+		.cleanup = io_sendmsg_recvmsg_cleanup,
+		.fail = io_sendrecv_fail,
+#endif
+	},
+	[IORING_OP_RECVMSG] = {
+		.name = "RECVMSG",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_msghdr),
+		.prep_async = io_recvmsg_prep_async,
+		.cleanup = io_sendmsg_recvmsg_cleanup,
+		.fail = io_sendrecv_fail,
+#endif
+	},
+	[IORING_OP_TIMEOUT] = {
+		.async_size = sizeof(struct io_timeout_data),
+		.name = "TIMEOUT",
+	},
+	[IORING_OP_TIMEOUT_REMOVE] = {
+		.name = "TIMEOUT_REMOVE",
+	},
+	[IORING_OP_ACCEPT] = {
+		.name = "ACCEPT",
+	},
+	[IORING_OP_ASYNC_CANCEL] = {
+		.name = "ASYNC_CANCEL",
+	},
+	[IORING_OP_LINK_TIMEOUT] = {
+		.async_size = sizeof(struct io_timeout_data),
+		.name = "LINK_TIMEOUT",
+	},
+	[IORING_OP_CONNECT] = {
+		.name = "CONNECT",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_connect),
+		.prep_async = io_connect_prep_async,
+#endif
+	},
+	[IORING_OP_FALLOCATE] = {
+		.name = "FALLOCATE",
+	},
+	[IORING_OP_OPENAT] = {
+		.name = "OPENAT",
+		.cleanup = io_open_cleanup,
+	},
+	[IORING_OP_CLOSE] = {
+		.name = "CLOSE",
+	},
+	[IORING_OP_FILES_UPDATE] = {
+		.name = "FILES_UPDATE",
+	},
+	[IORING_OP_STATX] = {
+		.name = "STATX",
+		.cleanup = io_statx_cleanup,
+	},
+	[IORING_OP_READ] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "READ",
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_WRITE] = {
+		.async_size = sizeof(struct io_async_rw),
+		.name = "WRITE",
+		.fail = io_rw_fail,
+	},
+	[IORING_OP_FADVISE] = {
+		.name = "FADVISE",
+	},
+	[IORING_OP_MADVISE] = {
+		.name = "MADVISE",
+	},
+	[IORING_OP_SEND] = {
+		.name = "SEND",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_msghdr),
+		.fail = io_sendrecv_fail,
+		.prep_async = io_send_prep_async,
+#endif
+	},
+	[IORING_OP_RECV] = {
+		.name = "RECV",
+#if defined(CONFIG_NET)
+		.fail = io_sendrecv_fail,
+#endif
+	},
+	[IORING_OP_OPENAT2] = {
+		.name = "OPENAT2",
+		.cleanup = io_open_cleanup,
+	},
+	[IORING_OP_EPOLL_CTL] = {
+		.name = "EPOLL",
+	},
+	[IORING_OP_SPLICE] = {
+		.name = "SPLICE",
+	},
+	[IORING_OP_PROVIDE_BUFFERS] = {
+		.name = "PROVIDE_BUFFERS",
+	},
+	[IORING_OP_REMOVE_BUFFERS] = {
+		.name = "REMOVE_BUFFERS",
+	},
+	[IORING_OP_TEE] = {
+		.name = "TEE",
+	},
+	[IORING_OP_SHUTDOWN] = {
+		.name = "SHUTDOWN",
+	},
+	[IORING_OP_RENAMEAT] = {
+		.name = "RENAMEAT",
+		.cleanup = io_renameat_cleanup,
+	},
+	[IORING_OP_UNLINKAT] = {
+		.name = "UNLINKAT",
+		.cleanup = io_unlinkat_cleanup,
+	},
+	[IORING_OP_MKDIRAT] = {
+		.name = "MKDIRAT",
+		.cleanup = io_mkdirat_cleanup,
+	},
+	[IORING_OP_SYMLINKAT] = {
+		.name = "SYMLINKAT",
+		.cleanup = io_link_cleanup,
+	},
+	[IORING_OP_LINKAT] = {
+		.name = "LINKAT",
+		.cleanup = io_link_cleanup,
+	},
+	[IORING_OP_MSG_RING] = {
+		.name = "MSG_RING",
+		.cleanup = io_msg_ring_cleanup,
+	},
+	[IORING_OP_FSETXATTR] = {
+		.name = "FSETXATTR",
+		.cleanup = io_xattr_cleanup,
+	},
+	[IORING_OP_SETXATTR] = {
+		.name = "SETXATTR",
+		.cleanup = io_xattr_cleanup,
+	},
+	[IORING_OP_FGETXATTR] = {
+		.name = "FGETXATTR",
+		.cleanup = io_xattr_cleanup,
+	},
+	[IORING_OP_GETXATTR] = {
+		.name = "GETXATTR",
+		.cleanup = io_xattr_cleanup,
+	},
+	[IORING_OP_SOCKET] = {
+		.name = "SOCKET",
+	},
+	[IORING_OP_URING_CMD] = {
+		.name = "URING_CMD",
+		.async_size = uring_cmd_pdu_size(1),
+		.prep_async = io_uring_cmd_prep_async,
+	},
+	[IORING_OP_SEND_ZC] = {
+		.name = "SEND_ZC",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_msghdr),
+		.prep_async = io_send_prep_async,
+		.cleanup = io_send_zc_cleanup,
+		.fail = io_sendrecv_fail,
+#endif
+	},
+	[IORING_OP_SENDMSG_ZC] = {
+		.name = "SENDMSG_ZC",
+#if defined(CONFIG_NET)
+		.async_size = sizeof(struct io_async_msghdr),
 		.prep_async = io_sendmsg_prep_async,
 		.cleanup = io_send_zc_cleanup,
 		.fail = io_sendrecv_fail,
-#else
-		.prep = io_eopnotsupp_prep,
#endif
 	},
 };
@@ -536,7 +653,7 @@ const struct io_op_def io_op_defs[] = {
 const char *io_uring_get_opcode(u8 opcode)
 {
 	if (opcode < IORING_OP_LAST)
-		return io_op_defs[opcode].name;
+		return io_cold_defs[opcode].name;
 	return "INVALID";
 }
@@ -544,12 +661,13 @@
 void __init io_uring_optable_init(void)
 {
 	int i;
-	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
+	BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST);
+	BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST);
-	for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
-		BUG_ON(!io_op_defs[i].prep);
-		if (io_op_defs[i].prep != io_eopnotsupp_prep)
-			BUG_ON(!io_op_defs[i].issue);
-		WARN_ON_ONCE(!io_op_defs[i].name);
+	for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) {
+		BUG_ON(!io_issue_defs[i].prep);
+		if (io_issue_defs[i].prep != io_eopnotsupp_prep)
+			BUG_ON(!io_issue_defs[i].issue);
+		WARN_ON_ONCE(!io_cold_defs[i].name);
 	}
 }
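[Aside, not part of the patch] The opdef.c changes above split the per-opcode metadata into a table read on every submission (prep/issue pointers and flag bits) and a table only read on slow paths (name, cleanup, prep_async, fail). The sketch below is a minimal userspace model of that layout idea, using invented struct and opcode names rather than the kernel's io_issue_def/io_cold_def definitions.

/*
 * Illustrative sketch only: two parallel tables indexed by opcode, a small
 * "hot" one consulted on every issue and a "cold" one for names/cleanup.
 * Names and fields are simplified stand-ins, not the kernel structures.
 */
#include <stdio.h>

enum { OP_NOP, OP_READ, OP_LAST };

struct hot_def {                 /* touched on every submission */
	unsigned needs_file : 1;
	int (*issue)(int arg);
};

struct cold_def {                /* only needed for tracing/cleanup */
	const char *name;
};

static int issue_nop(int arg)  { (void)arg; return 0; }
static int issue_read(int arg) { return arg * 2; }

static const struct hot_def hot_defs[] = {
	[OP_NOP]  = { .needs_file = 0, .issue = issue_nop },
	[OP_READ] = { .needs_file = 1, .issue = issue_read },
};

static const struct cold_def cold_defs[] = {
	[OP_NOP]  = { .name = "NOP" },
	[OP_READ] = { .name = "READ" },
};

int main(void)
{
	for (int op = 0; op < OP_LAST; op++) {
		int res = hot_defs[op].issue(21);          /* hot path */
		printf("%s -> %d (needs_file=%u)\n",       /* cold path */
		       cold_defs[op].name, res, hot_defs[op].needs_file);
	}
	return 0;
}

Keeping the hot table entries small means more of them fit in a cache line while walking the dispatch path; the rarely used fields no longer pad out every entry.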
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index df7e13d9bfba..c22c8696e749 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -2,7 +2,7 @@
 #ifndef IOU_OP_DEF_H
 #define IOU_OP_DEF_H
-struct io_op_def {
+struct io_issue_def {
 	/* needs req->file assigned */
 	unsigned needs_file : 1;
 	/* should block plug */
@@ -29,19 +29,24 @@ struct io_op_def {
 	unsigned iopoll_queue : 1;
 	/* opcode specific path will handle ->async_data allocation if needed */
 	unsigned manual_alloc : 1;
+
+	int (*issue)(struct io_kiocb *, unsigned int);
+	int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
+};
+
+struct io_cold_def {
 	/* size of async data needed, if any */
 	unsigned short async_size;
 	const char *name;
-	int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
-	int (*issue)(struct io_kiocb *, unsigned int);
 	int (*prep_async)(struct io_kiocb *);
 	void (*cleanup)(struct io_kiocb *);
 	void (*fail)(struct io_kiocb *);
 };
-extern const struct io_op_def io_op_defs[];
+extern const struct io_issue_def io_issue_defs[];
+extern const struct io_cold_def io_cold_defs[];
 void io_uring_optable_init(void);
 #endif
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 67178e4bb282..a1b98c81a52d 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -31,6 +31,15 @@ struct io_close {
 	u32 file_slot;
 };
+static bool io_openat_force_async(struct io_open *open)
+{
+	/*
+	 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
+	 * it'll always -EAGAIN
+	 */
+	return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
+}
+
 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
@@ -61,6 +70,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	open->nofile = rlimit(RLIMIT_NOFILE);
 	req->flags |= REQ_F_NEED_CLEANUP;
+	if (io_openat_force_async(open))
+		req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -108,12 +119,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 	nonblock_set = op.open_flag & O_NONBLOCK;
 	resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
 	if (issue_flags & IO_URING_F_NONBLOCK) {
-		/*
-		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
-		 * it'll always -EAGAIN
-		 */
-		if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
-			return -EAGAIN;
+		WARN_ON_ONCE(io_openat_force_async(open));
 		op.lookup_flags |= LOOKUP_CACHED;
 		op.open_flag |= O_NONBLOCK;
 	}
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 2ac1366adbd7..8339a92b4510 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -678,7 +678,7 @@ alloc_apoll:
 int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 {
-	const struct io_op_def *def = &io_op_defs[req->opcode];
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	struct async_poll *apoll;
 	struct io_poll_table ipt;
 	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
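[Aside, not part of the patch] The openclose.c hunks above show the pattern used throughout this series: decide at prep time whether an operation can ever complete without blocking, tag it with REQ_F_FORCE_ASYNC if not, and turn the old "-EAGAIN on the non-blocking path" fallback into a WARN_ON_ONCE. Below is a rough userspace sketch of that shape; the flag names and request struct are made up for the example and do not mirror the kernel types.

/*
 * Illustrative sketch: prep classifies the request, issue asserts instead of
 * bouncing. Invented names throughout.
 */
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>

#define REQ_FORCE_ASYNC (1u << 0)   /* "always punt to a worker" */
#define ISSUE_NONBLOCK  (1u << 0)   /* "we are on the non-blocking fast path" */

struct request {
	int open_flags;
	unsigned flags;
};

/* Same idea as io_openat_force_async(): these opens always block. */
static int open_needs_blocking(const struct request *req)
{
	return req->open_flags & (O_TRUNC | O_CREAT | O_TMPFILE);
}

static void prep_open(struct request *req, int open_flags)
{
	req->open_flags = open_flags;
	req->flags = 0;
	if (open_needs_blocking(req))
		req->flags |= REQ_FORCE_ASYNC;
}

static void issue_open(const struct request *req, unsigned issue_flags)
{
	/* Once prep has classified the request, reaching here non-blocking
	 * with a forced-async request is a logic bug, not a retry case. */
	assert(!((issue_flags & ISSUE_NONBLOCK) && (req->flags & REQ_FORCE_ASYNC)));
	printf("issuing open, flags=%#x\n", req->open_flags);
}

int main(void)
{
	struct request req;

	prep_open(&req, O_RDONLY);
	issue_open(&req, ISSUE_NONBLOCK);   /* fine: may complete non-blocking */

	prep_open(&req, O_CREAT | O_WRONLY);
	issue_open(&req, 0);                /* forced async: blocking context */
	return 0;
}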
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 9c3ddd46a1ad..4c233910e200 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -391,7 +391,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 		rw->len = sqe_len;
 	}
-	ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
+	ret = import_ubuf(ddir, buf, sqe_len, iter);
 	if (ret)
 		return ERR_PTR(ret);
 	return NULL;
@@ -410,7 +410,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req,
 				  unsigned int issue_flags)
 {
 	*iovec = __io_import_iovec(rw, req, s, issue_flags);
-	if (unlikely(IS_ERR(*iovec)))
+	if (IS_ERR(*iovec))
 		return PTR_ERR(*iovec);
 	iov_iter_save_state(&s->iter, &s->iter_state);
@@ -450,7 +450,10 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 	struct iovec iovec;
 	ssize_t nr;
-	if (!iov_iter_is_bvec(iter)) {
+	if (iter_is_ubuf(iter)) {
+		iovec.iov_base = iter->ubuf + iter->iov_offset;
+		iovec.iov_len = iov_iter_count(iter);
+	} else if (!iov_iter_is_bvec(iter)) {
 		iovec = iov_iter_iovec(iter);
 	} else {
 		iovec.iov_base = u64_to_user_ptr(rw->addr);
@@ -495,7 +498,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 	io->free_iovec = iovec;
 	io->bytes_done = 0;
 	/* can only be fixed buffers, no need to do anything */
-	if (iov_iter_is_bvec(iter))
+	if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
 		return;
 	if (!iovec) {
 		unsigned iov_off = 0;
@@ -516,7 +519,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     struct io_rw_state *s, bool force)
 {
-	if (!force && !io_op_defs[req->opcode].prep_async)
+	if (!force && !io_cold_defs[req->opcode].prep_async)
 		return 0;
 	if (!req_has_async_data(req)) {
 		struct io_async_rw *iorw;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 53e4232d0866..2a4bbb719531 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req,
 	if (unlikely(sp->flags & ~valid_flags))
 		return -EINVAL;
 	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *in;
 	long ret = 0;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	if (sp->flags & SPLICE_F_FD_IN_FIXED)
 		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *in;
 	long ret = 0;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	if (sp->flags & SPLICE_F_FD_IN_FIXED)
 		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 559652380672..0119d3f1a556 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -312,7 +312,7 @@ static int io_sq_thread(void *data)
 	do_exit(0);
 }
-int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
+void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 {
 	DEFINE_WAIT(wait);
@@ -327,7 +327,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 	} while (!signal_pending(current));
 	finish_wait(&ctx->sqo_sq_wait, &wait);
-	return 0;
 }
 __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index 0c3fbcd1f583..e1b8d508d22d 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -26,4 +26,4 @@ void io_sq_thread_stop(struct io_sq_data *sqd);
 void io_sq_thread_park(struct io_sq_data *sqd);
 void io_sq_thread_unpark(struct io_sq_data *sqd);
 void io_put_sq_data(struct io_sq_data *sqd);
-int io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
+void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
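[Aside, not part of the patch] The rw.c hunks above switch plain-buffer reads and writes from a one-element iovec (import_single_range) to a bare user-buffer iterator (import_ubuf), so consumers like loop_rw_iter() gain one extra branch instead of carrying a fast_iov array. The sketch below is a simplified userspace model of that "ubuf vs iovec" distinction; it is not the kernel's iov_iter API.

/*
 * Illustrative sketch: a plain user buffer described by (pointer, length)
 * versus an iovec-backed iterator, with one branch to pick the next segment.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

enum iter_kind { ITER_UBUF, ITER_IOVEC };

struct simple_iter {
	enum iter_kind kind;
	void *ubuf;              /* ITER_UBUF: base pointer */
	const struct iovec *iov; /* ITER_IOVEC: vector */
	size_t count;            /* bytes left (ubuf) / entries (iovec) */
};

/* Pick the next contiguous segment to hand to a read/write call. */
static struct iovec next_segment(const struct simple_iter *it)
{
	struct iovec seg;

	if (it->kind == ITER_UBUF) {
		seg.iov_base = it->ubuf;
		seg.iov_len = it->count;
	} else {
		seg = it->iov[0];
	}
	return seg;
}

int main(void)
{
	char buf[16] = "hello";
	struct iovec vec[1] = { { .iov_base = buf, .iov_len = sizeof(buf) } };
	struct simple_iter a = { .kind = ITER_UBUF, .ubuf = buf, .count = strlen(buf) };
	struct simple_iter b = { .kind = ITER_IOVEC, .iov = vec, .count = 1 };

	printf("ubuf segment: %zu bytes\n", next_segment(&a).iov_len);
	printf("iovec segment: %zu bytes\n", next_segment(&b).iov_len);
	return 0;
}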
diff --git a/io_uring/statx.c b/io_uring/statx.c
index d8fc933d3f59..abb874209caa 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	int ret;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
 	io_req_set_res(req, ret, 0);
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 64e87ea2b8fb..255f68c37e55 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->len);
 	sync->flags = READ_ONCE(sqe->sync_range_flags);
+	req->flags |= REQ_F_FORCE_ASYNC;
+
 	return 0;
 }
@@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 	/* sync_file_range always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
 	io_req_set_res(req, ret, 0);
@@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->len);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 	/* fsync always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
 				sync->flags & IORING_FSYNC_DATASYNC);
@@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->addr);
 	sync->mode = READ_ONCE(sqe->len);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 	/* fallocate always requiring blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
+
 	ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
 	if (ret >= 0)
 		fsnotify_modify(req->file);
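[Aside, not part of the patch] For statx, sync_file_range, fsync, and fallocate there is no non-blocking variant at all, so the prep handlers above set the force-async flag unconditionally and the submitter hands the request straight to a worker rather than attempting it inline. The sketch below models that handoff with a plain pthread worker; the struct and names are invented for illustration and are not the io-wq machinery.

/*
 * Illustrative sketch: a request that always needs a blocking context is
 * executed on a worker thread instead of being tried inline first.
 * Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct fsync_req {
	int fd;
	int result;
};

static void *fsync_worker(void *arg)
{
	struct fsync_req *req = arg;

	req->result = fsync(req->fd);   /* always issued in a blocking context */
	return NULL;
}

int main(void)
{
	struct fsync_req req = { .fd = STDOUT_FILENO, .result = -1 };
	pthread_t worker;

	/* "prep": nothing to probe, fsync is always punted to the worker. */
	pthread_create(&worker, NULL, fsync_worker, &req);
	pthread_join(worker, NULL);
	printf("fsync result: %d\n", req.result);
	return 0;
}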
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 6201a9f442c6..e1c810e0b85a 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req,
 	}
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	ret = do_getxattr(mnt_idmap(req->file->f_path.mnt),
 			req->file->f_path.dentry,
@@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct path path;
 	int ret;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 retry:
 	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
@@ -174,6 +173,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
 	}
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
@@ -222,8 +222,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
 	int ret;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 	ret = __io_setxattr(req, issue_flags, &req->file->f_path);
 	io_xattr_finish(req, ret);
@@ -237,8 +236,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct path path;
 	int ret;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
retry:
 	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
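[Aside, not part of the patch] With the split tables in place, io_uring_get_opcode() now resolves opcode names from the cold table with a bounds check, as shown in the opdef.c hunk earlier. The snippet below is a simplified userspace stand-in for that lookup shape, with made-up opcode values rather than the real IORING_OP_* constants.

/*
 * Illustrative sketch: bounds-checked opcode-to-name lookup, returning a
 * fixed "INVALID" string for out-of-range values.
 */
#include <stdio.h>

enum { OP_NOP, OP_READV, OP_WRITEV, OP_LAST };

static const char * const op_names[OP_LAST] = {
	[OP_NOP]    = "NOP",
	[OP_READV]  = "READV",
	[OP_WRITEV] = "WRITEV",
};

static const char *opcode_name(unsigned char opcode)
{
	if (opcode < OP_LAST)
		return op_names[opcode];
	return "INVALID";
}

int main(void)
{
	printf("%s %s\n", opcode_name(OP_READV), opcode_name(200));
	return 0;
}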