Diffstat (limited to 'io_uring')
-rw-r--r--   io_uring/cancel.c    |  60
-rw-r--r--   io_uring/cancel.h    |   3
-rw-r--r--   io_uring/fdinfo.c    |  18
-rw-r--r--   io_uring/io-wq.c     |  70
-rw-r--r--   io_uring/io-wq.h     |   2
-rw-r--r--   io_uring/io_uring.c  | 312
-rw-r--r--   io_uring/io_uring.h  |  79
-rw-r--r--   io_uring/kbuf.c      |   6
-rw-r--r--   io_uring/net.c       |   8
-rw-r--r--   io_uring/openclose.c |   6
-rw-r--r--   io_uring/poll.c      |  21
-rw-r--r--   io_uring/rsrc.c      |  14
-rw-r--r--   io_uring/rsrc.h      |   3
-rw-r--r--   io_uring/rw.c        |  82
-rw-r--r--   io_uring/splice.c    |   4
-rw-r--r--   io_uring/sqpoll.c    |  15
-rw-r--r--   io_uring/sqpoll.h    |   1
-rw-r--r--   io_uring/timeout.c   |  20
-rw-r--r--   io_uring/uring_cmd.c |  33
19 files changed, 430 insertions, 327 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 58c46c852bdd..7b23607cf4af 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -22,35 +22,56 @@ struct io_cancel { u64 addr; u32 flags; s32 fd; + u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ + IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) -static bool io_cancel_cb(struct io_wq_work *work, void *data) +/* + * Returns true if the request matches the criteria outlined by 'cd'. + */ +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; + bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; if (req->ctx != cd->ctx) return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) + match_user_data = true; + + if (cd->flags & IORING_ASYNC_CANCEL_ANY) + goto check_seq; + if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; - } else { - if (req->cqe.user_data != cd->data) + } + if (cd->flags & IORING_ASYNC_CANCEL_OP) { + if (req->opcode != cd->opcode) return false; } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (match_user_data && req->cqe.user_data != cd->data) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { +check_seq: if (cd->seq == req->work.cancel_seq) return false; req->work.cancel_seq = cd->seq; } + return true; } +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; + + return io_cancel_req_match(req, cd); +} + static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { @@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) + if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); @@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } + if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->opcode = READ_ONCE(sqe->len); + } return 0; } @@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, + .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; @@ -238,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_sync_cancel_reg sc; struct fd f = { }; DEFINE_WAIT(wait); - int ret; + int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; - if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) - return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad); i++) + if (sc.pad[i]) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) + if (sc.pad2[i]) + return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; + cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if 
((cd.flags & IORING_ASYNC_CANCEL_FD) && diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 6a59ee484d0c..fc98622e6166 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -8,11 +8,11 @@ struct io_cancel_data { u64 data; struct file *file; }; + u8 opcode; u32 flags; int seq; }; - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); @@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 76c279b13aee..300455b4bc12 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -46,9 +46,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { + struct io_ring_ctx *ctx = f->private_data; struct io_sq_data *sq = NULL; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; @@ -203,14 +207,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } - -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} #endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 399e9a15c38d..62f345587df5 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -232,17 +232,25 @@ static void io_worker_exit(struct io_worker *worker) do_exit(0); } -static inline bool io_acct_run_queue(struct io_wq_acct *acct) +static inline bool __io_acct_run_queue(struct io_wq_acct *acct) { - bool ret = false; + return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) && + !wq_list_empty(&acct->work_list); +} +/* + * If there's work to do, returns true with acct->lock acquired. If not, + * returns false with no lock held. + */ +static inline bool io_acct_run_queue(struct io_wq_acct *acct) + __acquires(&acct->lock) +{ raw_spin_lock(&acct->lock); - if (!wq_list_empty(&acct->work_list) && - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - ret = true; - raw_spin_unlock(&acct->lock); + if (__io_acct_run_queue(acct)) + return true; - return ret; + raw_spin_unlock(&acct->lock); + return false; } /* @@ -268,11 +276,14 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, io_worker_release(worker); continue; } - if (wake_up_process(worker->task)) { - io_worker_release(worker); - return true; - } + /* + * If the worker is already running, it's either already + * starting work or finishing work. In either case, if it does + * to go sleep, we'll kick off a new task for this work anyway. 
+ */ + wake_up_process(worker->task); io_worker_release(worker); + return true; } return false; @@ -397,6 +408,7 @@ static void io_wq_dec_running(struct io_worker *worker) if (!io_acct_run_queue(acct)) return; + raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); @@ -521,9 +533,13 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_worker_handle_work(struct io_worker *worker) +/* + * Called with acct->lock held, drops it before returning + */ +static void io_worker_handle_work(struct io_wq_acct *acct, + struct io_worker *worker) + __releases(&acct->lock) { - struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); @@ -537,7 +553,6 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ - raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { @@ -591,6 +606,10 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); + + if (!__io_acct_run_queue(acct)) + break; + raw_spin_lock(&acct->lock); } while (1); } @@ -611,8 +630,13 @@ static int io_wq_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); + + /* + * If we have work to do, io_acct_run_queue() returns with + * the acct->lock held. If not, it will drop it. + */ while (io_acct_run_queue(acct)) - io_worker_handle_work(worker); + io_worker_handle_work(acct, worker); raw_spin_lock(&wq->lock); /* @@ -645,8 +669,8 @@ static int io_wq_worker(void *data) } } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_worker_handle_work(worker); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct)) + io_worker_handle_work(acct, worker); io_worker_exit(worker); return 0; @@ -909,13 +933,10 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wq->lock); rcu_read_lock(); do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wq->lock); - if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; @@ -1285,13 +1306,16 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) return __io_wq_cpu_online(wq, cpu, false); } -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + if (!tctx || !tctx->io_wq) + return -EINVAL; + rcu_read_lock(); if (mask) - cpumask_copy(wq->cpu_mask, mask); + cpumask_copy(tctx->io_wq->cpu_mask, mask); else - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); return 0; diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 31228426d192..06d9ca90c577 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -50,7 +50,7 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work 
*work) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e8096d502a7c..e7675355048d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,8 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); -static void io_move_task_work_from_local(struct io_ring_ctx *ctx); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct kmem_cache *req_cachep; @@ -229,7 +227,6 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); - kasan_poison_object_data(req_cachep, req); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -292,13 +289,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) goto err; - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; @@ -337,7 +327,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: - kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); @@ -626,7 +615,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->task_complete) + if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } @@ -639,19 +628,14 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - - if (ctx->task_complete) { - /* - * ->task_complete implies that only current might be waiting - * for CQEs, and obviously, we currently don't. No one is - * waiting, wakeups are futile, skip them. 
- */ - io_commit_cqring_flush(ctx); - } else { - spin_unlock(&ctx->completion_lock); - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); + if (!ctx->task_complete) { + if (!ctx->lockless_cq) + spin_unlock(&ctx->completion_lock); + /* IOPOLL rings only need to wake up if it's also SQPOLL */ + if (!ctx->syscall_iopoll) + io_cqring_wake(ctx); } + io_commit_cqring_flush(ctx); } static void io_cq_unlock_post(struct io_ring_ctx *ctx) @@ -659,8 +643,8 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_commit_cqring_flush(ctx); io_cqring_wake(ctx); + io_commit_cqring_flush(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -693,10 +677,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); + struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!cqe) + if (!io_get_cqe_overflow(ctx, &cqe, true)) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); @@ -815,15 +799,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -bool io_req_cqe_overflow(struct io_kiocb *req) +void io_req_cqe_overflow(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE32_INIT)) { - req->extra1 = 0; - req->extra2 = 0; - } - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->extra1, req->extra2); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, req->big_cqe.extra2); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } /* @@ -831,7 +812,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -843,7 +824,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) - return NULL; + return false; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -851,7 +832,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (!len) - return NULL; + return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; @@ -860,12 +841,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return &rings->cqes[off]; + return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, @@ -880,8 +856,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * submission (by quite a lot). Increment the overflow count in * the ring. 
*/ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { + if (likely(io_get_cqe(ctx, &cqe))) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); WRITE_ONCE(cqe->user_data, user_data); @@ -905,7 +880,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &state->cqes[i]; + struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { if (ctx->task_complete) { @@ -941,19 +916,22 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, - bool allow_overflow) +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + return __io_post_aux_cqe(ctx, user_data, res, cflags, false); lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ @@ -964,10 +942,10 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, * however it's main job is to prevent unbounded posted completions, * and in that it works just as well. */ - if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return false; - cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; + cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; cqe->user_data = user_data; cqe->res = res; cqe->flags = cflags; @@ -980,8 +958,10 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - io_fill_cqe_req(ctx, req); + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!io_fill_cqe_req(ctx, req)) + io_req_cqe_overflow(req); + } /* * If we're the last reference to this request, add to our locked @@ -999,8 +979,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_put_kbuf_comp(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); rsrc_node = req->rsrc_node; /* @@ -1062,7 +1041,8 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->cqe.res = 0; + memset(&req->cqe, 0, sizeof(req->cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -1507,7 +1487,8 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { do { @@ -1534,8 +1515,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct 
io_wq_work_node *node) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); io_req_put_rsrc_locked(req, ctx); @@ -1545,7 +1525,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) } while (node); } -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) +void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; @@ -1560,7 +1540,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) comp_list); if (!(req->flags & REQ_F_CQE_SKIP) && - unlikely(!__io_fill_cqe_req(ctx, req))) { + unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->task_complete) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); @@ -1616,7 +1596,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; - int ret = 0; unsigned long check_cq; if (!io_allowed_run_tw(ctx)) @@ -1642,6 +1621,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; do { + int ret = 0; + /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -1670,13 +1651,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; } ret = io_do_iopoll(ctx, !min); - if (ret < 0) + if (unlikely(ret < 0)) + return ret; + + if (task_sigpending(current)) + return -EINTR; + if (need_resched()) break; + nr_events += ret; - ret = 0; - } while (nr_events < min && !need_resched()); + } while (nr_events < min); - return ret; + return 0; } void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -1948,6 +1934,14 @@ fail: ret = io_issue_sqe(req, issue_flags); if (ret != -EAGAIN) break; + + /* + * If REQ_F_NOWAIT is set, then don't wait or retry with + * poll. -EAGAIN is final for that case. + */ + if (req->flags & REQ_F_NOWAIT) + break; + /* * We can get EAGAIN for iopolled IO even though we're * forcing a sync submission from here, since we can't @@ -2353,8 +2347,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; + unsigned mask = ctx->sq_entries - 1; + unsigned head = ctx->cached_sq_head++ & mask; + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { + head = READ_ONCE(ctx->sq_array[head]); + if (unlikely(head >= ctx->sq_entries)) { + /* drop invalid entries */ + spin_lock(&ctx->completion_lock); + ctx->cq_extra--; + spin_unlock(&ctx->completion_lock); + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); + return false; + } + } /* * The cached sq head (or cq tail) serves two purposes: @@ -2364,20 +2371,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
*/ - head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) { - /* double index for 128-byte SQEs, twice as long */ - if (ctx->flags & IORING_SETUP_SQE128) - head <<= 1; - *sqe = &ctx->sq_sqes[head]; - return true; - } - /* drop invalid entries */ - ctx->cq_extra--; - WRITE_ONCE(ctx->rings->sq_dropped, - READ_ONCE(ctx->rings->sq_dropped) + 1); - return false; + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; + *sqe = &ctx->sq_sqes[head]; + return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2476,19 +2475,30 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) if (!llist_empty(&ctx->work_llist)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx) > 0) - return 1; + return 0; } if (io_run_task_work() > 0) - return 1; + return 0; if (task_sigpending(current)) return -EINTR; return 0; } +static bool current_pending_io(void) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) + return false; + return percpu_counter_read_positive(&tctx->inflight); +} + /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { + int io_wait, ret; + if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) @@ -2499,11 +2509,22 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; + + /* + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. + */ + io_wait = current->in_iowait; + if (current_pending_io()) + current->in_iowait = 1; + ret = 0; if (iowq->timeout == KTIME_MAX) schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - return -ETIME; - return 0; + ret = -ETIME; + current->in_iowait = io_wait; + return ret; } /* @@ -2613,14 +2634,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, static void io_mem_free(void *ptr) { - struct page *page; - if (!ptr) return; - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); + folio_put(virt_to_folio(ptr)); } static void io_pages_free(struct page ***pages, int npages) @@ -2735,6 +2752,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return SIZE_MAX; #endif + if (ctx->flags & IORING_SETUP_NO_SQARRAY) { + if (sq_offset) + *sq_offset = SIZE_MAX; + return off; + } + if (sq_offset) *sq_offset = off; @@ -2877,7 +2900,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -3418,8 +3440,6 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); - struct vm_unmapped_area_info info; void *ptr; /* @@ -3434,32 +3454,29 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, if (IS_ERR(ptr)) return -ENOMEM; - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); + /* + * Some architectures have strong cache 
aliasing requirements. + * For such architectures we need a coherent mapping which aliases + * kernel memory *and* userspace memory. To achieve that: + * - use a NULL file pointer to reference physical memory, and + * - use the kernel virtual address of the shared io_uring context + * (instead of the userspace-provided address, which has to be 0UL + * anyway). + * - use the same pgoff which the get_unmapped_area() uses to + * calculate the page colouring. + * For architectures without such aliasing requirements, the + * architecture will return any suitable mapping because addr is 0. + */ + filp = NULL; + flags |= MAP_SHARED; + pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR - info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL); + addr = (uintptr_t) ptr; + pgoff = addr >> PAGE_SHIFT; #else - info.align_mask = PAGE_MASK & (SHMLBA - 1UL); + addr = 0UL; #endif - info.align_offset = (unsigned long) ptr; - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - addr = vm_unmapped_area(&info); - if (offset_in_page(addr)) { - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = mmap_end; - addr = vm_unmapped_area(&info); - } - - return addr; + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ @@ -3712,7 +3729,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return PTR_ERR(rings); ctx->rings = rings; - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; @@ -3841,6 +3859,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->lockless_cq = true; + /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() @@ -3859,7 +3880,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->syscall_iopoll = 1; ctx->compat = in_compat_syscall(); - if (!capable(CAP_IPC_LOCK)) + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* @@ -3920,7 +3941,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; p->sq_off.resv1 = 0; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) p->sq_off.user_addr = 0; @@ -4009,7 +4031,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | + IORING_SETUP_NO_SQARRAY)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -4172,16 +4195,28 @@ 
static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } +static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, + cpumask_var_t new_mask) +{ + int ret; + + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_wq_cpu_affinity(current->io_uring, new_mask); + } else { + mutex_unlock(&ctx->uring_lock); + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); + mutex_lock(&ctx->uring_lock); + } + + return ret; +} + static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { - struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; int ret; - if (!tctx || !tctx->io_wq) - return -EINVAL; - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; @@ -4202,19 +4237,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, return -EFAULT; } - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - return io_wq_cpu_affinity(tctx->io_wq, NULL); + return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, @@ -4590,8 +4620,20 @@ static int __init io_uring_init(void) io_uring_optable_init(); - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + /* + * Allow user copy in the per-command field, which starts after the + * file in io_kiocb and until the opcode field. The openat2 handling + * requires copying in user memory into the io_kiocb object in that + * range, and HARDENED_USERCOPY will complain if we haven't + * correctly annotated this range. 
+ */ + req_cachep = kmem_cache_create_usercopy("io_kiocb", + sizeof(struct io_kiocb), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, + offsetof(struct io_kiocb, cmd.data), + sizeof_field(struct io_kiocb, cmd.data), NULL); + return 0; }; __initcall(io_uring_init); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d3606d30cf6f..547c30582fb8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -38,14 +38,13 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); -bool io_req_cqe_overflow(struct io_kiocb *req); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, - bool allow_overflow); +bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -73,7 +72,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +void __io_submit_flush_completions(struct io_ring_ctx *ctx); int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); @@ -110,31 +109,31 @@ static inline void io_req_task_work_add(struct io_kiocb *req) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, - bool overflow) +static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, + struct io_uring_cqe **ret, + bool overflow) { io_lockdep_assert_cq_locked(ctx); - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return cqe; + if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + return false; } - - return __io_get_cqe(ctx, overflow); + *ret = ctx->cqe_cached; + ctx->cached_cq_tail++; + ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return true; } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) { - return io_get_cqe_overflow(ctx, false); + return io_get_cqe_overflow(ctx, ret, false); } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) { struct io_uring_cqe *cqe; @@ -143,39 +142,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, * submission (by quite a lot). Increment the overflow count in * the ring. 
*/ - cqe = io_get_cqe(ctx); - if (unlikely(!cqe)) + if (unlikely(!io_get_cqe(ctx, &cqe))) return false; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); + memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } return true; } -static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (likely(__io_fill_cqe_req(ctx, req))) - return true; - return io_req_cqe_overflow(req); -} - static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -196,10 +178,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void io_put_file(struct file *file) +static inline void io_put_file(struct io_kiocb *req) { - if (file) - fput(file); + if (!(req->flags & REQ_F_FIXED_FILE) && req->file) + fput(req->file); } static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, @@ -354,7 +336,6 @@ static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); - kasan_unpoison_object_data(req_cachep, req); wq_stack_extract(&ctx->submit_state.free_list); return req; } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2f0181521c98..556f4df25b0f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -218,11 +218,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { - struct page *page; - - page = virt_to_head_page(bl->buf_ring); - if (put_page_testzero(page)) - free_compound_page(page); + folio_put(virt_to_folio(bl->buf_ring)); bl->buf_ring = NULL; bl->is_mmap = 0; } else if (bl->buf_nr_pages) { diff --git a/io_uring/net.c b/io_uring/net.c index eb1f51ddcb23..3d07bf79c1e0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -641,8 +641,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE, true)) { + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE)) { io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || @@ -1366,8 +1366,8 @@ retry: if (ret < 0) return ret; - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, - IORING_CQE_F_MORE, true)) + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + ret, IORING_CQE_F_MORE)) goto retry; return -ECANCELED; diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 10ca57f5bd24..e3fae26e025d 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -35,9 +35,11 @@ static bool io_openat_force_async(struct io_open *open) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN + * it'll 
always -EAGAIN. Note that we test for __O_TMPFILE because + * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force + * async for. */ - return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE); + return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE); } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/poll.c b/io_uring/poll.c index d4597efe14a7..4c360ba8793a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req, ts->locked, mask, - IORING_CQE_F_MORE, false)) { + if (!io_fill_cqe_req_aux(req, ts->locked, mask, + IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } @@ -824,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - *out_bucket = hb; - return req; + if (io_cancel_req_match(req, cd)) { + *out_bucket = hb; + return req; + } } spin_unlock(&hb->lock); } @@ -855,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, struct io_hash_bucket *bucket; struct io_kiocb *req; - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | + IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd, table, &bucket); else req = io_poll_find(ctx, false, cd, table, &bucket); @@ -972,8 +969,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5e8fdd9b8ca6..d9c853d10587 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -33,6 +33,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +static const struct io_mapped_ubuf dummy_ubuf = { + /* set invalid range, so io_import_fixed() fails meeting it */ + .ubuf = -1UL, + .ubuf_end = 0, +}; + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -132,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; - if (imu != ctx->dummy_ubuf) { + if (imu != &dummy_ubuf) { for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) @@ -459,14 +465,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, break; i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = ctx->dummy_ubuf; + 
ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; @@ -1077,7 +1083,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, int ret, nr_pages, i; struct folio *folio = NULL; - *pimu = ctx->dummy_ubuf; + *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0a8a95e9b99e..8625181fb87a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -54,10 +54,9 @@ struct io_mapped_ubuf { u64 ubuf_end; unsigned int nr_bvecs; unsigned long acct_pages; - struct bio_vec bvec[]; + struct bio_vec bvec[] __counted_by(nr_bvecs); }; -void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index 1bce2208b65c..c8c822fa7980 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) } else { rw->kiocb.ki_ioprio = get_current_ioprio(); } + rw->kiocb.dio_complete = NULL; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); @@ -220,17 +221,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void kiocb_end_write(struct io_kiocb *req) +static void io_req_end_write(struct io_kiocb *req) { - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); + kiocb_end_write(&rw->kiocb); } } @@ -243,7 +239,7 @@ static void io_req_io_end(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); + io_req_end_write(req); fsnotify_modify(req->file); } else { fsnotify_access(req->file); @@ -285,6 +281,15 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct kiocb *kiocb = &rw->kiocb; + + if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { + long res = kiocb->dio_complete(rw->kiocb.private); + + io_req_set_res(req, io_fixup_rw_res(req, res), 0); + } + io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { @@ -300,9 +305,11 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, io_fixup_rw_res(req, res), 0); + if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, io_fixup_rw_res(req, res), 0); + } req->io_task_work.func = io_req_rw_complete; __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } @@ -313,7 +320,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; @@ -902,19 +909,18 @@ int io_write(struct 
io_kiocb *req, unsigned int issue_flags) return ret; } + if (req->flags & REQ_F_ISREG) + kiocb_start_write(kiocb); + kiocb->ki_flags |= IOCB_WRITE; + /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. + * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler + * groks deferring the completion to task context. This isn't + * necessary and useful for polled IO as that can always complete + * directly. */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; + if (!(kiocb->ki_flags & IOCB_HIPRI)) + kiocb->ki_flags |= IOCB_DIO_CALLER_COMP; if (likely(req->file->f_op->write_iter)) ret2 = call_write_iter(req->file, kiocb, &s->iter); @@ -961,7 +967,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); return ret ? ret : -EAGAIN; } done: @@ -972,7 +978,7 @@ copy_iov: ret = io_setup_async_rw(req, iovec, s, false); if (!ret) { if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); + io_req_end_write(req); return -EAGAIN; } return ret; @@ -983,13 +989,6 @@ copy_iov: return ret; } -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - void io_rw_fail(struct io_kiocb *req) { int res; @@ -1060,24 +1059,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - req->cqe.flags = io_put_kbuf(req, 0); - if (unlikely(!__io_fill_cqe_req(ctx, req))) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } } - if (unlikely(!nr_events)) return 0; - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); pos = start ? 
start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); + + if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) + return 0; + ctx->submit_state.compl_reqs.first = pos; + __io_submit_flush_completions(ctx); return nr_events; } diff --git a/io_uring/splice.c b/io_uring/splice.c index 2a4bbb719531..7c4469e9540e 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); @@ -112,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 5e329e3cd470..ee2d2c687fda 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -421,3 +421,18 @@ err: io_sq_thread_finish(ctx); return ret; } + +__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, + cpumask_var_t mask) +{ + struct io_sq_data *sqd = ctx->sq_data; + int ret = -EINVAL; + + if (sqd) { + io_sq_thread_park(sqd); + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + io_sq_thread_unpark(sqd); + } + + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index e1b8d508d22d..8df37e8c9149 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -27,3 +27,4 @@ void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index fb0547b35dcd..7fd7dbb211d6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { bool filled; - filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, - false); + filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, + IORING_CQE_F_MORE); if (filled) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); @@ -268,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, list_for_each_entry(timeout, &ctx->timeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; + if (io_cancel_req_match(tmp, cd)) { + req = tmp; + break; } - req = tmp; - break; } if (!req) return ERR_PTR(-ENOENT); @@ -409,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_cancel_data cd = { .data = user_data, }; + struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; @@ -473,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data 
cd = { .data = tr->addr, }; + struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; spin_lock(&ctx->completion_lock); ret = io_timeout_cancel(ctx, &cd); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 476c7877ce58..537795fddc87 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -7,6 +7,7 @@ #include <linux/nospec.h> #include <uapi/linux/io_uring.h> +#include <uapi/asm-generic/ioctls.h> #include "io_uring.h" #include "rsrc.h" @@ -42,9 +43,8 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; } /* @@ -164,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return io_import_fixed(rw, iter, req->imu, ubuf, len); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->sqe->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); |
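
The final hunk above exports io_uring_cmd_sock(), letting IORING_OP_URING_CMD answer SIOCINQ/SIOCOUTQ queries on sockets, with the byte count carried back in cqe->res. Below is a minimal userspace sketch of driving it; it assumes liburing, headers that already define SOCKET_URING_OP_SIOCINQ, and a kernel where socket_file_ops routes ->uring_cmd to this handler. The helper name sock_bytes_readable() is illustrative and not part of the patch.

/*
 * Illustrative only: ask how many bytes are queued for reading on a
 * connected socket via the SOCKET_URING_OP_SIOCINQ command added by
 * io_uring_cmd_sock() above. Error handling is kept minimal.
 */
#include <errno.h>
#include <liburing.h>

static int sock_bytes_readable(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;

	/* IORING_OP_URING_CMD with no payload; the command is picked via cmd_op */
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, sockfd, NULL, 0, 0);
	sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;

	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -EIO;

	ret = cqe->res;		/* >= 0: queued bytes, < 0: negated errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

SOCKET_URING_OP_SIOCOUTQ is used the same way and reports the amount of unsent data instead.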