Diffstat (limited to 'io_uring')
 io_uring/cancel.c    |  60
 io_uring/cancel.h    |   3
 io_uring/fdinfo.c    |  18
 io_uring/io-wq.c     |  70
 io_uring/io-wq.h     |   2
 io_uring/io_uring.c  | 312
 io_uring/io_uring.h  |  79
 io_uring/kbuf.c      |   6
 io_uring/net.c       |   8
 io_uring/openclose.c |   6
 io_uring/poll.c      |  21
 io_uring/rsrc.c      |  14
 io_uring/rsrc.h      |   3
 io_uring/rw.c        |  82
 io_uring/splice.c    |   4
 io_uring/sqpoll.c    |  15
 io_uring/sqpoll.h    |   1
 io_uring/timeout.c   |  20
 io_uring/uring_cmd.c |  33
 19 files changed, 430 insertions(+), 327 deletions(-)
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 58c46c852bdd..7b23607cf4af 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -22,35 +22,56 @@ struct io_cancel {
u64 addr;
u32 flags;
s32 fd;
+ u8 opcode;
};
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
- IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
+ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
+ IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)
-static bool io_cancel_cb(struct io_wq_work *work, void *data)
+/*
+ * Returns true if the request matches the criteria outlined by 'cd'.
+ */
+bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_cancel_data *cd = data;
+ bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA;
if (req->ctx != cd->ctx)
return false;
- if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
- ;
- } else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
+
+ if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP)))
+ match_user_data = true;
+
+ if (cd->flags & IORING_ASYNC_CANCEL_ANY)
+ goto check_seq;
+ if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
- } else {
- if (req->cqe.user_data != cd->data)
+ }
+ if (cd->flags & IORING_ASYNC_CANCEL_OP) {
+ if (req->opcode != cd->opcode)
return false;
}
- if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
+ if (match_user_data && req->cqe.user_data != cd->data)
+ return false;
+ if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
+check_seq:
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
+
return true;
}
+static bool io_cancel_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_cancel_data *cd = data;
+
+ return io_cancel_req_match(req, cd);
+}
+
static int io_async_cancel_one(struct io_uring_task *tctx,
struct io_cancel_data *cd)
{
@@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
- if (sqe->off || sqe->len || sqe->splice_fd_in)
+ if (sqe->off || sqe->splice_fd_in)
return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr);
@@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd);
}
+ if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
+ if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
+ return -EINVAL;
+ cancel->opcode = READ_ONCE(sqe->len);
+ }
return 0;
}
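As an aside on the prep hunk above: the opcode to match is carried in sqe->len when IORING_ASYNC_CANCEL_OP is set (combining it with IORING_ASYNC_CANCEL_ANY is rejected). A minimal userspace sketch of driving this with raw SQE fields — assuming a liburing-managed 'ring' for io_uring_get_sqe()/io_uring_submit(), error handling omitted — might look like:

    /* hedged sketch: cancel every pending request of a given opcode */
    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_ASYNC_CANCEL;
    sqe->cancel_flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
    sqe->len = IORING_OP_READV;     /* opcode to match, read by the prep hunk above */
    sqe->user_data = 0xc0ffee;      /* tags the cancel request's own CQE */
    io_uring_submit(&ring);

With IORING_ASYNC_CANCEL_OP set and IORING_ASYNC_CANCEL_USERDATA clear, io_cancel_req_match() does not compare user_data, so the ALL variant above matches every pending READV.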
@@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
.ctx = req->ctx,
.data = cancel->addr,
.flags = cancel->flags,
+ .opcode = cancel->opcode,
.seq = atomic_inc_return(&req->ctx->cancel_seq),
};
struct io_uring_task *tctx = req->task->io_uring;
@@ -238,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
DEFINE_WAIT(wait);
- int ret;
+ int ret, i;
if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL;
- if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
- return -EINVAL;
+ for (i = 0; i < ARRAY_SIZE(sc.pad); i++)
+ if (sc.pad[i])
+ return -EINVAL;
+ for (i = 0; i < ARRAY_SIZE(sc.pad2); i++)
+ if (sc.pad2[i])
+ return -EINVAL;
cd.data = sc.addr;
cd.flags = sc.flags;
+ cd.opcode = sc.opcode;
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 6a59ee484d0c..fc98622e6166 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -8,11 +8,11 @@ struct io_cancel_data {
u64 data;
struct file *file;
};
+ u8 opcode;
u32 flags;
int seq;
};
-
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
@@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
+bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 76c279b13aee..300455b4bc12 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -46,9 +46,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
- struct seq_file *m)
+/*
+ * Caller holds a reference to the file already, we don't need to do
+ * anything else to get an extra reference.
+ */
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
+ struct io_ring_ctx *ctx = f->private_data;
struct io_sq_data *sq = NULL;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
@@ -203,14 +207,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
-
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct io_ring_ctx *ctx = f->private_data;
-
- if (percpu_ref_tryget(&ctx->refs)) {
- __io_uring_show_fdinfo(ctx, m);
- percpu_ref_put(&ctx->refs);
- }
-}
#endif
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 399e9a15c38d..62f345587df5 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -232,17 +232,25 @@ static void io_worker_exit(struct io_worker *worker)
do_exit(0);
}
-static inline bool io_acct_run_queue(struct io_wq_acct *acct)
+static inline bool __io_acct_run_queue(struct io_wq_acct *acct)
{
- bool ret = false;
+ return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) &&
+ !wq_list_empty(&acct->work_list);
+}
+/*
+ * If there's work to do, returns true with acct->lock acquired. If not,
+ * returns false with no lock held.
+ */
+static inline bool io_acct_run_queue(struct io_wq_acct *acct)
+ __acquires(&acct->lock)
+{
raw_spin_lock(&acct->lock);
- if (!wq_list_empty(&acct->work_list) &&
- !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- ret = true;
- raw_spin_unlock(&acct->lock);
+ if (__io_acct_run_queue(acct))
+ return true;
- return ret;
+ raw_spin_unlock(&acct->lock);
+ return false;
}
/*
@@ -268,11 +276,14 @@ static bool io_wq_activate_free_worker(struct io_wq *wq,
io_worker_release(worker);
continue;
}
- if (wake_up_process(worker->task)) {
- io_worker_release(worker);
- return true;
- }
+ /*
+ * If the worker is already running, it's either already
+ * starting work or finishing work. In either case, if it does
+ * go to sleep, we'll kick off a new task for this work anyway.
+ */
+ wake_up_process(worker->task);
io_worker_release(worker);
+ return true;
}
return false;
@@ -397,6 +408,7 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!io_acct_run_queue(acct))
return;
+ raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running);
atomic_inc(&wq->worker_refs);
io_queue_worker_create(worker, acct, create_worker_cb);
@@ -521,9 +533,13 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_unlock(&worker->lock);
}
-static void io_worker_handle_work(struct io_worker *worker)
+/*
+ * Called with acct->lock held, drops it before returning
+ */
+static void io_worker_handle_work(struct io_wq_acct *acct,
+ struct io_worker *worker)
+ __releases(&acct->lock)
{
- struct io_wq_acct *acct = io_wq_get_acct(worker);
struct io_wq *wq = worker->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
@@ -537,7 +553,6 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
raw_spin_unlock(&acct->lock);
if (work) {
@@ -591,6 +606,10 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
+
+ if (!__io_acct_run_queue(acct))
+ break;
+ raw_spin_lock(&acct->lock);
} while (1);
}
@@ -611,8 +630,13 @@ static int io_wq_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * If we have work to do, io_acct_run_queue() returns with
+ * the acct->lock held. If not, it will drop it.
+ */
while (io_acct_run_queue(acct))
- io_worker_handle_work(worker);
+ io_worker_handle_work(acct, worker);
raw_spin_lock(&wq->lock);
/*
@@ -645,8 +669,8 @@ static int io_wq_worker(void *data)
}
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- io_worker_handle_work(worker);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct))
+ io_worker_handle_work(acct, worker);
io_worker_exit(worker);
return 0;
@@ -909,13 +933,10 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&acct->lock);
- raw_spin_lock(&wq->lock);
rcu_read_lock();
do_create = !io_wq_activate_free_worker(wq, acct);
rcu_read_unlock();
- raw_spin_unlock(&wq->lock);
-
if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))) {
bool did_create;
@@ -1285,13 +1306,16 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
return __io_wq_cpu_online(wq, cpu, false);
}
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
+int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
rcu_read_lock();
if (mask)
- cpumask_copy(wq->cpu_mask, mask);
+ cpumask_copy(tctx->io_wq->cpu_mask, mask);
else
- cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+ cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
rcu_read_unlock();
return 0;
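The io_acct_run_queue()/io_worker_handle_work() changes above hand a held acct->lock from the check function to the handler. A stripped-down sketch of that conditional-locking shape — hypothetical 'work_acct' type, only the __acquires()/__releases() convention is taken from the patch — for readers unfamiliar with the annotation:

    struct work_acct {                      /* hypothetical stand-in */
        raw_spinlock_t   lock;
        struct list_head work_list;
    };

    /* returns true with acct->lock held, false with it released */
    static bool acct_has_work(struct work_acct *acct)
        __acquires(&acct->lock)
    {
        raw_spin_lock(&acct->lock);
        if (!list_empty(&acct->work_list))
            return true;
        raw_spin_unlock(&acct->lock);
        return false;
    }

    /* called with acct->lock held, drops it before returning */
    static void acct_handle_work(struct work_acct *acct)
        __releases(&acct->lock)
    {
        /* ... pop and process acct->work_list entries ... */
        raw_spin_unlock(&acct->lock);
    }

The worker loop then reads naturally, as in io_wq_worker(): while (acct_has_work(acct)) acct_handle_work(acct); and sparse can check that the lock is balanced across the pair.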
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 31228426d192..06d9ca90c577 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -50,7 +50,7 @@ void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e8096d502a7c..e7675355048d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -147,8 +147,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_queue_sqe(struct io_kiocb *req);
-static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
struct kmem_cache *req_cachep;
@@ -229,7 +227,6 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
- kasan_poison_object_data(req_cachep, req);
}
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -292,13 +289,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
goto err;
-
- ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
- if (!ctx->dummy_ubuf)
- goto err;
- /* set invalid range, so io_import_fixed() fails meeting it */
- ctx->dummy_ubuf->ubuf = -1UL;
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
0, GFP_KERNEL))
goto err;
@@ -337,7 +327,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
- kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl);
@@ -626,7 +615,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
- if (!ctx->task_complete)
+ if (!ctx->lockless_cq)
spin_lock(&ctx->completion_lock);
}
@@ -639,19 +628,14 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
io_commit_cqring(ctx);
-
- if (ctx->task_complete) {
- /*
- * ->task_complete implies that only current might be waiting
- * for CQEs, and obviously, we currently don't. No one is
- * waiting, wakeups are futile, skip them.
- */
- io_commit_cqring_flush(ctx);
- } else {
- spin_unlock(&ctx->completion_lock);
- io_commit_cqring_flush(ctx);
- io_cqring_wake(ctx);
+ if (!ctx->task_complete) {
+ if (!ctx->lockless_cq)
+ spin_unlock(&ctx->completion_lock);
+ /* IOPOLL rings only need to wake up if it's also SQPOLL */
+ if (!ctx->syscall_iopoll)
+ io_cqring_wake(ctx);
}
+ io_commit_cqring_flush(ctx);
}
static void io_cq_unlock_post(struct io_ring_ctx *ctx)
@@ -659,8 +643,8 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
{
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
- io_commit_cqring_flush(ctx);
io_cqring_wake(ctx);
+ io_commit_cqring_flush(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -693,10 +677,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
- struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
+ struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
- if (!cqe)
+ if (!io_get_cqe_overflow(ctx, &cqe, true))
break;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
@@ -815,15 +799,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-bool io_req_cqe_overflow(struct io_kiocb *req)
+void io_req_cqe_overflow(struct io_kiocb *req)
{
- if (!(req->flags & REQ_F_CQE32_INIT)) {
- req->extra1 = 0;
- req->extra2 = 0;
- }
- return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- req->extra1, req->extra2);
+ io_cqring_event_overflow(req->ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ req->big_cqe.extra1, req->big_cqe.extra2);
+ memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
/*
@@ -831,7 +812,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -843,7 +824,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
* Force overflow the completion.
*/
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
- return NULL;
+ return false;
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
@@ -851,7 +832,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
if (!len)
- return NULL;
+ return false;
if (ctx->flags & IORING_SETUP_CQE32) {
off <<= 1;
@@ -860,12 +841,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
ctx->cqe_cached = &rings->cqes[off];
ctx->cqe_sentinel = ctx->cqe_cached + len;
-
- ctx->cached_cq_tail++;
- ctx->cqe_cached++;
- if (ctx->flags & IORING_SETUP_CQE32)
- ctx->cqe_cached++;
- return &rings->cqes[off];
+ return true;
}
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
@@ -880,8 +856,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
+ if (likely(io_get_cqe(ctx, &cqe))) {
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
WRITE_ONCE(cqe->user_data, user_data);
@@ -905,7 +880,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
lockdep_assert_held(&ctx->uring_lock);
for (i = 0; i < state->cqes_count; i++) {
- struct io_uring_cqe *cqe = &state->cqes[i];
+ struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
if (ctx->task_complete) {
@@ -941,19 +916,22 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}
-bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
- bool allow_overflow)
+/*
+ * A helper for multishot requests posting additional CQEs.
+ * Should only be used from a task_work including IO_URING_F_MULTISHOT.
+ */
+bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 user_data = req->cqe.user_data;
struct io_uring_cqe *cqe;
if (!defer)
- return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
+ return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
lockdep_assert_held(&ctx->uring_lock);
- if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) {
+ if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
__io_cq_lock(ctx);
__io_flush_post_cqes(ctx);
/* no need to flush - flush is deferred */
@@ -964,10 +942,10 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
* however its main job is to prevent unbounded posted completions,
* and in that it works just as well.
*/
- if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
+ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
return false;
- cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
+ cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
cqe->user_data = user_data;
cqe->res = res;
cqe->flags = cflags;
@@ -980,8 +958,10 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
struct io_rsrc_node *rsrc_node = NULL;
io_cq_lock(ctx);
- if (!(req->flags & REQ_F_CQE_SKIP))
- io_fill_cqe_req(ctx, req);
+ if (!(req->flags & REQ_F_CQE_SKIP)) {
+ if (!io_fill_cqe_req(ctx, req))
+ io_req_cqe_overflow(req);
+ }
/*
* If we're the last reference to this request, add to our locked
@@ -999,8 +979,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
io_put_kbuf_comp(req);
if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
- if (!(req->flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
+ io_put_file(req);
rsrc_node = req->rsrc_node;
/*
@@ -1062,7 +1041,8 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
- req->cqe.res = 0;
+ memset(&req->cqe, 0, sizeof(req->cqe));
+ memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -1507,7 +1487,8 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
-void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
{
do {
@@ -1534,8 +1515,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
}
- if (!(req->flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
+ io_put_file(req);
io_req_put_rsrc_locked(req, ctx);
@@ -1545,7 +1525,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
} while (node);
}
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
+void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
@@ -1560,7 +1540,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP) &&
- unlikely(!__io_fill_cqe_req(ctx, req))) {
+ unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->task_complete) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
@@ -1616,7 +1596,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
- int ret = 0;
unsigned long check_cq;
if (!io_allowed_run_tw(ctx))
@@ -1642,6 +1621,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
return 0;
do {
+ int ret = 0;
+
/*
* If a submit got punted to a workqueue, we can have the
* application entering polling for a command before it gets
@@ -1670,13 +1651,18 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
break;
}
ret = io_do_iopoll(ctx, !min);
- if (ret < 0)
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (task_sigpending(current))
+ return -EINTR;
+ if (need_resched())
break;
+
nr_events += ret;
- ret = 0;
- } while (nr_events < min && !need_resched());
+ } while (nr_events < min);
- return ret;
+ return 0;
}
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
@@ -1948,6 +1934,14 @@ fail:
ret = io_issue_sqe(req, issue_flags);
if (ret != -EAGAIN)
break;
+
+ /*
+ * If REQ_F_NOWAIT is set, then don't wait or retry with
+ * poll. -EAGAIN is final for that case.
+ */
+ if (req->flags & REQ_F_NOWAIT)
+ break;
+
/*
* We can get EAGAIN for iopolled IO even though we're
* forcing a sync submission from here, since we can't
@@ -2353,8 +2347,21 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
*/
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{
- unsigned head, mask = ctx->sq_entries - 1;
- unsigned sq_idx = ctx->cached_sq_head++ & mask;
+ unsigned mask = ctx->sq_entries - 1;
+ unsigned head = ctx->cached_sq_head++ & mask;
+
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
+ head = READ_ONCE(ctx->sq_array[head]);
+ if (unlikely(head >= ctx->sq_entries)) {
+ /* drop invalid entries */
+ spin_lock(&ctx->completion_lock);
+ ctx->cq_extra--;
+ spin_unlock(&ctx->completion_lock);
+ WRITE_ONCE(ctx->rings->sq_dropped,
+ READ_ONCE(ctx->rings->sq_dropped) + 1);
+ return false;
+ }
+ }
/*
* The cached sq head (or cq tail) serves two purposes:
@@ -2364,20 +2371,12 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = READ_ONCE(ctx->sq_array[sq_idx]);
- if (likely(head < ctx->sq_entries)) {
- /* double index for 128-byte SQEs, twice as long */
- if (ctx->flags & IORING_SETUP_SQE128)
- head <<= 1;
- *sqe = &ctx->sq_sqes[head];
- return true;
- }
- /* drop invalid entries */
- ctx->cq_extra--;
- WRITE_ONCE(ctx->rings->sq_dropped,
- READ_ONCE(ctx->rings->sq_dropped) + 1);
- return false;
+ /* double index for 128-byte SQEs, twice as long */
+ if (ctx->flags & IORING_SETUP_SQE128)
+ head <<= 1;
+ *sqe = &ctx->sq_sqes[head];
+ return true;
}
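The rewritten io_get_sqe() above only dereferences the SQ index array when IORING_SETUP_NO_SQARRAY is not set; with the flag, the cached head indexes sq_sqes directly and userspace never fills an index array. A rough sketch of the submission side under that flag — ring setup/mmap omitted, sq_tail/sq_ring_mask/sqes assumed to point into the mapped SQ ring, raw syscall used instead of liburing:

    /* hedged sketch: submitting with IORING_SETUP_NO_SQARRAY, no sq_array writes */
    unsigned mask = *sq_ring_mask;
    unsigned tail = *sq_tail;               /* only the submitter writes the tail */
    struct io_uring_sqe *sqe = &sqes[tail & mask];

    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_NOP;
    sqe->user_data = 42;

    /* publish the new tail with release semantics; no index array to fill */
    atomic_store_explicit((_Atomic unsigned *)sq_tail, tail + 1,
                          memory_order_release);
    syscall(__NR_io_uring_enter, ring_fd, 1, 0, 0, NULL, 0);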
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
@@ -2476,19 +2475,30 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
if (io_run_local_work(ctx) > 0)
- return 1;
+ return 0;
}
if (io_run_task_work() > 0)
- return 1;
+ return 0;
if (task_sigpending(current))
return -EINTR;
return 0;
}
+static bool current_pending_io(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx)
+ return false;
+ return percpu_counter_read_positive(&tctx->inflight);
+}
+
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
+ int io_wait, ret;
+
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
if (unlikely(!llist_empty(&ctx->work_llist)))
@@ -2499,11 +2509,22 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
return -EINTR;
if (unlikely(io_should_wake(iowq)))
return 0;
+
+ /*
+ * Mark us as being in io_wait if we have pending requests, so cpufreq
+ * can take into account that the task is waiting for IO - turns out
+ * to be important for low QD IO.
+ */
+ io_wait = current->in_iowait;
+ if (current_pending_io())
+ current->in_iowait = 1;
+ ret = 0;
if (iowq->timeout == KTIME_MAX)
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 0;
+ ret = -ETIME;
+ current->in_iowait = io_wait;
+ return ret;
}
/*
@@ -2613,14 +2634,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
static void io_mem_free(void *ptr)
{
- struct page *page;
-
if (!ptr)
return;
- page = virt_to_head_page(ptr);
- if (put_page_testzero(page))
- free_compound_page(page);
+ folio_put(virt_to_folio(ptr));
}
static void io_pages_free(struct page ***pages, int npages)
@@ -2735,6 +2752,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return SIZE_MAX;
#endif
+ if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+ if (sq_offset)
+ *sq_offset = SIZE_MAX;
+ return off;
+ }
+
if (sq_offset)
*sq_offset = off;
@@ -2877,7 +2900,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->dummy_ubuf);
kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
@@ -3418,8 +3440,6 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
- struct vm_unmapped_area_info info;
void *ptr;
/*
@@ -3434,32 +3454,29 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
if (IS_ERR(ptr))
return -ENOMEM;
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+ /*
+ * Some architectures have strong cache aliasing requirements.
+ * For such architectures we need a coherent mapping which aliases
+ * kernel memory *and* userspace memory. To achieve that:
+ * - use a NULL file pointer to reference physical memory, and
+ * - use the kernel virtual address of the shared io_uring context
+ * (instead of the userspace-provided address, which has to be 0UL
+ * anyway).
+ * - use the same pgoff which the get_unmapped_area() uses to
+ * calculate the page colouring.
+ * For architectures without such aliasing requirements, the
+ * architecture will return any suitable mapping because addr is 0.
+ */
+ filp = NULL;
+ flags |= MAP_SHARED;
+ pgoff = 0; /* has been translated to ptr above */
#ifdef SHM_COLOUR
- info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
+ addr = (uintptr_t) ptr;
+ pgoff = addr >> PAGE_SHIFT;
#else
- info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
+ addr = 0UL;
#endif
- info.align_offset = (unsigned long) ptr;
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- addr = vm_unmapped_area(&info);
- if (offset_in_page(addr)) {
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = mmap_end;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
#else /* !CONFIG_MMU */
@@ -3712,7 +3729,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return PTR_ERR(rings);
ctx->rings = rings;
- ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
rings->cq_ring_mask = p->cq_entries - 1;
rings->sq_ring_entries = p->sq_entries;
@@ -3841,6 +3859,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
!(ctx->flags & IORING_SETUP_SQPOLL))
ctx->task_complete = true;
+ if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
+ ctx->lockless_cq = true;
+
/*
* lazy poll_wq activation relies on ->task_complete for synchronisation
* purposes, see io_activate_pollwq()
@@ -3859,7 +3880,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->syscall_iopoll = 1;
ctx->compat = in_compat_syscall();
- if (!capable(CAP_IPC_LOCK))
+ if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
/*
@@ -3920,7 +3941,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
p->sq_off.flags = offsetof(struct io_rings, sq_flags);
p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
- p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
p->sq_off.resv1 = 0;
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
p->sq_off.user_addr = 0;
@@ -4009,7 +4031,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
- IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
+ IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
+ IORING_SETUP_NO_SQARRAY))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -4172,16 +4195,28 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
return 0;
}
+static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
+ cpumask_var_t new_mask)
+{
+ int ret;
+
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ ret = io_wq_cpu_affinity(current->io_uring, new_mask);
+ } else {
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return ret;
+}
+
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len)
{
- struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
int ret;
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
@@ -4202,19 +4237,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
return -EFAULT;
}
- ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
+ ret = __io_register_iowq_aff(ctx, new_mask);
free_cpumask_var(new_mask);
return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
- return io_wq_cpu_affinity(tctx->io_wq, NULL);
+ return __io_register_iowq_aff(ctx, NULL);
}
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
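For context, this is the path reached from userspace via io_uring_register(IORING_REGISTER_IOWQ_AFF); with the __io_register_iowq_aff() split above it now also covers SQPOLL rings by parking the SQ thread (see the sqpoll.c hunk further down). A hedged example using liburing's helper, assumed available as io_uring_register_iowq_aff():

    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(2, &mask);
    CPU_SET(3, &mask);
    /* pin the ring's io-wq workers (or, for SQPOLL, the SQ thread's io-wq) to CPUs 2-3 */
    int ret = io_uring_register_iowq_aff(&ring, sizeof(mask), &mask);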
@@ -4590,8 +4620,20 @@ static int __init io_uring_init(void)
io_uring_optable_init();
- req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
- SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+ /*
+ * Allow user copy in the per-command field, which starts after the
+ * file in io_kiocb and until the opcode field. The openat2 handling
+ * requires copying in user memory into the io_kiocb object in that
+ * range, and HARDENED_USERCOPY will complain if we haven't
+ * correctly annotated this range.
+ */
+ req_cachep = kmem_cache_create_usercopy("io_kiocb",
+ sizeof(struct io_kiocb), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
+ offsetof(struct io_kiocb, cmd.data),
+ sizeof_field(struct io_kiocb, cmd.data), NULL);
+
return 0;
};
__initcall(io_uring_init);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d3606d30cf6f..547c30582fb8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -38,14 +38,13 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED,
};
-struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
-bool io_req_cqe_overflow(struct io_kiocb *req);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
-bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
- bool allow_overflow);
+bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
@@ -73,7 +72,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
-void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
+void __io_submit_flush_completions(struct io_ring_ctx *ctx);
int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
@@ -110,31 +109,31 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
-static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
- bool overflow)
+static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
+ struct io_uring_cqe **ret,
+ bool overflow)
{
io_lockdep_assert_cq_locked(ctx);
- if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
- struct io_uring_cqe *cqe = ctx->cqe_cached;
-
- ctx->cached_cq_tail++;
- ctx->cqe_cached++;
- if (ctx->flags & IORING_SETUP_CQE32)
- ctx->cqe_cached++;
- return cqe;
+ if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
+ if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+ return false;
}
-
- return __io_get_cqe(ctx, overflow);
+ *ret = ctx->cqe_cached;
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached++;
+ if (ctx->flags & IORING_SETUP_CQE32)
+ ctx->cqe_cached++;
+ return true;
}
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{
- return io_get_cqe_overflow(ctx, false);
+ return io_get_cqe_overflow(ctx, ret, false);
}
-static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
@@ -143,39 +142,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
- cqe = io_get_cqe(ctx);
- if (unlikely(!cqe))
+ if (unlikely(!io_get_cqe(ctx, &cqe)))
return false;
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
- (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
+ if (trace_io_uring_complete_enabled())
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ req->big_cqe.extra1, req->big_cqe.extra2);
memcpy(cqe, &req->cqe, sizeof(*cqe));
-
if (ctx->flags & IORING_SETUP_CQE32) {
- u64 extra1 = 0, extra2 = 0;
-
- if (req->flags & REQ_F_CQE32_INIT) {
- extra1 = req->extra1;
- extra2 = req->extra2;
- }
-
- WRITE_ONCE(cqe->big_cqe[0], extra1);
- WRITE_ONCE(cqe->big_cqe[1], extra2);
+ memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
+ memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
return true;
}
-static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- if (likely(__io_fill_cqe_req(ctx, req)))
- return true;
- return io_req_cqe_overflow(req);
-}
-
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@@ -196,10 +178,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA;
}
-static inline void io_put_file(struct file *file)
+static inline void io_put_file(struct io_kiocb *req)
{
- if (file)
- fput(file);
+ if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
+ fput(req->file);
}
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
@@ -354,7 +336,6 @@ static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
struct io_kiocb *req;
req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
- kasan_unpoison_object_data(req_cachep, req);
wq_stack_extract(&ctx->submit_state.free_list);
return req;
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 2f0181521c98..556f4df25b0f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -218,11 +218,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
- struct page *page;
-
- page = virt_to_head_page(bl->buf_ring);
- if (put_page_testzero(page))
- free_compound_page(page);
+ folio_put(virt_to_folio(bl->buf_ring));
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
diff --git a/io_uring/net.c b/io_uring/net.c
index eb1f51ddcb23..3d07bf79c1e0 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -641,8 +641,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
}
if (!mshot_finished) {
- if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- *ret, cflags | IORING_CQE_F_MORE, true)) {
+ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
+ *ret, cflags | IORING_CQE_F_MORE)) {
io_recv_prep_retry(req);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
@@ -1366,8 +1366,8 @@ retry:
if (ret < 0)
return ret;
- if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret,
- IORING_CQE_F_MORE, true))
+ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
+ ret, IORING_CQE_F_MORE))
goto retry;
return -ECANCELED;
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 10ca57f5bd24..e3fae26e025d 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -35,9 +35,11 @@ static bool io_openat_force_async(struct io_open *open)
{
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
- * it'll always -EAGAIN
+ * it'll always -EAGAIN. Note that we test for __O_TMPFILE because
+ * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
+ * async for.
*/
- return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
+ return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
}
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index d4597efe14a7..4c360ba8793a 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
__poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events);
- if (!io_aux_cqe(req, ts->locked, mask,
- IORING_CQE_F_MORE, false)) {
+ if (!io_fill_cqe_req_aux(req, ts->locked, mask,
+ IORING_CQE_F_MORE)) {
io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES;
}
@@ -824,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
- if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
- req->file != cd->file)
- continue;
- if (cd->seq == req->work.cancel_seq)
- continue;
- req->work.cancel_seq = cd->seq;
- *out_bucket = hb;
- return req;
+ if (io_cancel_req_match(req, cd)) {
+ *out_bucket = hb;
+ return req;
+ }
}
spin_unlock(&hb->lock);
}
@@ -855,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
struct io_hash_bucket *bucket;
struct io_kiocb *req;
- if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
+ IORING_ASYNC_CANCEL_ANY))
req = io_poll_file_find(ctx, cd, table, &bucket);
else
req = io_poll_find(ctx, false, cd, table, &bucket);
@@ -972,8 +969,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
- struct io_cancel_data cd = { .data = poll_update->old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
+ struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
struct io_hash_bucket *bucket;
struct io_kiocb *preq;
int ret2, ret = 0;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5e8fdd9b8ca6..d9c853d10587 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -33,6 +33,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
+static const struct io_mapped_ubuf dummy_ubuf = {
+ /* set invalid range, so io_import_fixed() fails meeting it */
+ .ubuf = -1UL,
+ .ubuf_end = 0,
+};
+
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -132,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot;
unsigned int i;
- if (imu != ctx->dummy_ubuf) {
+ if (imu != &dummy_ubuf) {
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
@@ -459,14 +465,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
break;
i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
- if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
+ if (ctx->user_bufs[i] != &dummy_ubuf) {
err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->user_bufs[i]);
if (unlikely(err)) {
io_buffer_unmap(ctx, &imu);
break;
}
- ctx->user_bufs[i] = ctx->dummy_ubuf;
+ ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
}
ctx->user_bufs[i] = imu;
@@ -1077,7 +1083,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
int ret, nr_pages, i;
struct folio *folio = NULL;
- *pimu = ctx->dummy_ubuf;
+ *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
return 0;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 0a8a95e9b99e..8625181fb87a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -54,10 +54,9 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
- struct bio_vec bvec[];
+ struct bio_vec bvec[] __counted_by(nr_bvecs);
};
-void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1bce2208b65c..c8c822fa7980 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} else {
rw->kiocb.ki_ioprio = get_current_ioprio();
}
+ rw->kiocb.dio_complete = NULL;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
@@ -220,17 +221,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
}
#endif
-static void kiocb_end_write(struct io_kiocb *req)
+static void io_req_end_write(struct io_kiocb *req)
{
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
if (req->flags & REQ_F_ISREG) {
- struct super_block *sb = file_inode(req->file)->i_sb;
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- __sb_writers_acquired(sb, SB_FREEZE_WRITE);
- sb_end_write(sb);
+ kiocb_end_write(&rw->kiocb);
}
}
@@ -243,7 +239,7 @@ static void io_req_io_end(struct io_kiocb *req)
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
if (rw->kiocb.ki_flags & IOCB_WRITE) {
- kiocb_end_write(req);
+ io_req_end_write(req);
fsnotify_modify(req->file);
} else {
fsnotify_access(req->file);
@@ -285,6 +281,15 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct kiocb *kiocb = &rw->kiocb;
+
+ if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
+ long res = kiocb->dio_complete(rw->kiocb.private);
+
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
+
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
@@ -300,9 +305,11 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
- if (__io_complete_rw_common(req, res))
- return;
- io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
+ if (__io_complete_rw_common(req, res))
+ return;
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
req->io_task_work.func = io_req_rw_complete;
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
@@ -313,7 +320,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
@@ -902,19 +909,18 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+ if (req->flags & REQ_F_ISREG)
+ kiocb_start_write(kiocb);
+ kiocb->ki_flags |= IOCB_WRITE;
+
/*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
+ * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
+ * groks deferring the completion to task context. This isn't
+ * necessary or useful for polled IO as that can always complete
+ * directly.
*/
- if (req->flags & REQ_F_ISREG) {
- sb_start_write(file_inode(req->file)->i_sb);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
+ if (!(kiocb->ki_flags & IOCB_HIPRI))
+ kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
if (likely(req->file->f_op->write_iter))
ret2 = call_write_iter(req->file, kiocb, &s->iter);
@@ -961,7 +967,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
io->bytes_done += ret2;
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
return ret ? ret : -EAGAIN;
}
done:
@@ -972,7 +978,7 @@ copy_iov:
ret = io_setup_async_rw(req, iovec, s, false);
if (!ret) {
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
return -EAGAIN;
}
return ret;
@@ -983,13 +989,6 @@ copy_iov:
return ret;
}
-static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
-{
- io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_cqring_wake(ctx);
-}
-
void io_rw_fail(struct io_kiocb *req)
{
int res;
@@ -1060,24 +1059,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed))
break;
nr_events++;
- if (unlikely(req->flags & REQ_F_CQE_SKIP))
- continue;
-
req->cqe.flags = io_put_kbuf(req, 0);
- if (unlikely(!__io_fill_cqe_req(ctx, req))) {
- spin_lock(&ctx->completion_lock);
- io_req_cqe_overflow(req);
- spin_unlock(&ctx->completion_lock);
- }
}
-
if (unlikely(!nr_events))
return 0;
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
pos = start ? start->next : ctx->iopoll_list.first;
wq_list_cut(&ctx->iopoll_list, prev, start);
- io_free_batch_list(ctx, pos);
+
+ if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
+ return 0;
+ ctx->submit_state.compl_reqs.first = pos;
+ __io_submit_flush_completions(ctx);
return nr_events;
}
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 2a4bbb719531..7c4469e9540e 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
+ fput(in);
done:
if (ret != sp->len)
req_set_fail(req);
@@ -112,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
+ fput(in);
done:
if (ret != sp->len)
req_set_fail(req);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 5e329e3cd470..ee2d2c687fda 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -421,3 +421,18 @@ err:
io_sq_thread_finish(ctx);
return ret;
}
+
+__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
+ cpumask_var_t mask)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+ int ret = -EINVAL;
+
+ if (sqd) {
+ io_sq_thread_park(sqd);
+ ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
+ io_sq_thread_unpark(sqd);
+ }
+
+ return ret;
+}
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index e1b8d508d22d..8df37e8c9149 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -27,3 +27,4 @@ void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
+int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index fb0547b35dcd..7fd7dbb211d6 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
if (!io_timeout_finish(timeout, data)) {
bool filled;
- filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE,
- false);
+ filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME,
+ IORING_CQE_F_MORE);
if (filled) {
/* re-arm timer */
spin_lock_irq(&ctx->timeout_lock);
@@ -268,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
list_for_each_entry(timeout, &ctx->timeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
- if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
- cd->data != tmp->cqe.user_data)
- continue;
- if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
- if (cd->seq == tmp->work.cancel_seq)
- continue;
- tmp->work.cancel_seq = cd->seq;
+ if (io_cancel_req_match(tmp, cd)) {
+ req = tmp;
+ break;
}
- req = tmp;
- break;
}
if (!req)
return ERR_PTR(-ENOENT);
@@ -409,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
- struct io_cancel_data cd = { .data = user_data, };
+ struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data;
@@ -473,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret;
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
- struct io_cancel_data cd = { .data = tr->addr, };
+ struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, };
spin_lock(&ctx->completion_lock);
ret = io_timeout_cancel(ctx, &cd);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 476c7877ce58..537795fddc87 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -7,6 +7,7 @@
#include <linux/nospec.h>
#include <uapi/linux/io_uring.h>
+#include <uapi/asm-generic/ioctls.h>
#include "io_uring.h"
#include "rsrc.h"
@@ -42,9 +43,8 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2)
{
- req->extra1 = extra1;
- req->extra2 = extra2;
- req->flags |= REQ_F_CQE32_INIT;
+ req->big_cqe.extra1 = extra1;
+ req->big_cqe.extra2 = extra2;
}
/*
@@ -164,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
return io_import_fixed(rw, iter, req->imu, ubuf, len);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
+
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct socket *sock = cmd->file->private_data;
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ switch (cmd->sqe->cmd_op) {
+ case SOCKET_URING_OP_SIOCINQ:
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
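
io_uring_cmd_sock() is exported so the socket layer can service SOCKET_URING_OP_SIOCINQ/SIOCOUTQ issued as IORING_OP_URING_CMD on a socket fd, with the queued byte count returned in cqe->res. A rough userspace sketch with raw SQE fields — ring setup, socket creation, and CQE reaping assumed, liburing prep helpers not used:

    /* hedged sketch: ask how many bytes are queued for reading on sockfd */
    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_URING_CMD;
    sqe->fd = sockfd;
    sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;
    sqe->user_data = 7;
    io_uring_submit(&ring);
    /* on completion, cqe->res holds the SIOCINQ byte count, or -errno */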