summary refs log tree commit diff
diff options
context:
space:
mode:
author Jens Axboe <axboe@kernel.dk> 2026-03-14 17:57:15 +0300
committer Jens Axboe <axboe@kernel.dk> 2026-03-14 17:57:15 +0300
commit 0e46cb553f35517f9ac946de0f1a2cfab7c156b6 (patch)
tree b1fc9361271dbe9bf38f537488b7051ddd4e3ea5
parent cb9487333652b2cfb4f10ef596fc5b675241cae9 (diff)
parent c2c185be5c85d37215397c8e8781abf0a69bec1f (diff)
download linux-0e46cb553f35517f9ac946de0f1a2cfab7c156b6.tar.xz
Merge branch 'io_uring-7.0' into for-7.1/io_uring
Merge upstream io_uring fixes to avoid conflicts in later patches.

* io_uring-7.0:
  io_uring/kbuf: check if target buffer list is still legacy on recycle
  io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
  io_uring/eventfd: use ctx->rings_rcu for flags checking
  io_uring: ensure ctx->rings is stable for task work flags manipulation
  io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
  io_uring/register: fix comment about task_no_new_privs
-rw-r--r-- include/linux/io_uring_types.h 1
-rw-r--r-- io_uring/bpf_filter.c 2
-rw-r--r-- io_uring/eventfd.c 10
-rw-r--r-- io_uring/io_uring.c 4
-rw-r--r-- io_uring/kbuf.c 13
-rw-r--r-- io_uring/register.c 15
-rw-r--r-- io_uring/tw.c 22
7 files changed, 56 insertions, 11 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3e4a82a6f817..dd1420bfcb73 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -388,6 +388,7 @@ struct io_ring_ctx {
* regularly bounce b/w CPUs.
*/
struct {
+ struct io_rings __rcu *rings_rcu;
struct llist_head work_llist;
struct llist_head retry_llist;
unsigned long check_cq;
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 6a98750e38b0..c0037632b7af 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
do {
if (filter == &dummy_filter)
return -EACCES;
- ret = bpf_prog_run(filter->prog, &bpf_ctx);
+ ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
if (!ret)
return -EACCES;
filter = filter->next;
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index cbea1c289485..7482a7dc6b38 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
bool skip = false;
struct io_ev_fd *ev_fd;
-
- if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return;
+ struct io_rings *rings;
guard(rcu)();
+
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (!rings)
+ return;
+ if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+ return;
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index aa95703165f1..9a37035e76c0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
* well as 2 contiguous entries.
*/
if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
- !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+ (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1)
return io_init_fail_req(req, -EINVAL);
/*
* A 128b operation on a mixed SQ uses two entries, so we have
@@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
io_free_region(ctx->user, &ctx->sq_region);
io_free_region(ctx->user, &ctx->ring_region);
ctx->rings = NULL;
+ RCU_INIT_POINTER(ctx->rings_rcu, NULL);
ctx->sq_sqes = NULL;
}
@@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (ret)
return ret;
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
+ rcu_assign_pointer(ctx->rings_rcu, rings);
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 2ffa95b1c601..26813b0f1dfd 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
- list_add(&buf->list, &bl->buf_list);
- bl->nbufs++;
+ /*
+ * If the buffer list was upgraded to a ring-based one, or removed,
+ * while the request was in-flight in io-wq, drop it.
+ */
+ if (bl && !(bl->flags & IOBL_BUF_RING)) {
+ list_add(&buf->list, &bl->buf_list);
+ bl->nbufs++;
+ } else {
+ kfree(buf);
+ }
req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
io_ring_submit_unlock(ctx, issue_flags);
return true;
diff --git a/io_uring/register.c b/io_uring/register.c
index 6015a3e9ce69..0148735f7711 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
return -EPERM;
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
- * is true and we're not CAP_SYS_ADMIN.
+ * is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
- * is true and we're not CAP_SYS_ADMIN.
+ * is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -633,7 +633,15 @@ overflow:
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
+ /*
+ * Just mark any flag we may have missed and that the application
+ * should act on unconditionally. Worst case it'll be an extra
+ * syscall.
+ */
+ atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
ctx->rings = n.rings;
+ rcu_assign_pointer(ctx->rings_rcu, n.rings);
+
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
@@ -642,6 +650,9 @@ overflow:
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock);
+ /* Wait for concurrent io_ctx_mark_taskrun() */
+ if (to_free == &o)
+ synchronize_rcu_expedited();
io_register_free_rings(ctx, to_free);
if (ctx->sq_data)
diff --git a/io_uring/tw.c b/io_uring/tw.c
index 1ee2b8ab07c8..2f2b4ac4b126 100644
--- a/io_uring/tw.c
+++ b/io_uring/tw.c
@@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
+/*
+ * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the
+ * RCU protected rings pointer to be safe against concurrent ring resizing.
+ */
+static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx)
+{
+ lockdep_assert_in_rcu_read_lock();
+
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) {
+ struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
+
+ atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags);
+ }
+}
+
void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
if (!head) {
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ io_ctx_mark_taskrun(ctx);
if (ctx->has_evfd)
io_eventfd_signal(ctx, false);
}
@@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req)
if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
+ /*
+ * Doesn't need to use ->rings_rcu, as resizing isn't supported for
+ * !DEFER_TASKRUN.
+ */
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);