diff options
| author | Jens Axboe <axboe@kernel.dk> | 2026-03-14 17:57:15 +0300 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2026-03-14 17:57:15 +0300 |
| commit | 0e46cb553f35517f9ac946de0f1a2cfab7c156b6 (patch) | |
| tree | b1fc9361271dbe9bf38f537488b7051ddd4e3ea5 | |
| parent | cb9487333652b2cfb4f10ef596fc5b675241cae9 (diff) | |
| parent | c2c185be5c85d37215397c8e8781abf0a69bec1f (diff) | |
| download | linux-0e46cb553f35517f9ac946de0f1a2cfab7c156b6.tar.xz | |
Merge branch 'io_uring-7.0' into for-7.1/io_uring
Merge upstream io_uring fixes to avoid conflicts in later patches.
* io_uring-7.0:
io_uring/kbuf: check if target buffer list is still legacy on recycle
io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
io_uring/eventfd: use ctx->rings_rcu for flags checking
io_uring: ensure ctx->rings is stable for task work flags manipulation
io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
io_uring/register: fix comment about task_no_new_privs
| -rw-r--r-- | include/linux/io_uring_types.h | 1 | ||||
| -rw-r--r-- | io_uring/bpf_filter.c | 2 | ||||
| -rw-r--r-- | io_uring/eventfd.c | 10 | ||||
| -rw-r--r-- | io_uring/io_uring.c | 4 | ||||
| -rw-r--r-- | io_uring/kbuf.c | 13 | ||||
| -rw-r--r-- | io_uring/register.c | 15 | ||||
| -rw-r--r-- | io_uring/tw.c | 22 |
7 files changed, 56 insertions, 11 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..dd1420bfcb73 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -388,6 +388,7 @@ struct io_ring_ctx { * regularly bounce b/w CPUs. */ struct { + struct io_rings __rcu *rings_rcu; struct llist_head work_llist; struct llist_head retry_llist; unsigned long check_cq; diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 6a98750e38b0..c0037632b7af 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, do { if (filter == &dummy_filter) return -EACCES; - ret = bpf_prog_run(filter->prog, &bpf_ctx); + ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx); if (!ret) return -EACCES; filter = filter->next; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index cbea1c289485..7482a7dc6b38 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { bool skip = false; struct io_ev_fd *ev_fd; - - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return; + struct io_rings *rings; guard(rcu)(); + + rings = rcu_dereference(ctx->rings_rcu); + if (!rings) + return; + if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; ev_fd = rcu_dereference(ctx->io_ev_fd); /* * Check again if ev_fd exists in case an io_eventfd_unregister call diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index aa95703165f1..9a37035e76c0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, * well as 2 contiguous entries. */ if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || - !(ctx->cached_sq_head & (ctx->sq_entries - 1))) + (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1) return io_init_fail_req(req, -EINVAL); /* * A 128b operation on a mixed SQ uses two entries, so we have @@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx) io_free_region(ctx->user, &ctx->sq_region); io_free_region(ctx->user, &ctx->ring_region); ctx->rings = NULL; + RCU_INIT_POINTER(ctx->rings_rcu, NULL); ctx->sq_sqes = NULL; } @@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (ret) return ret; ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); + rcu_assign_pointer(ctx->rings_rcu, rings); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2ffa95b1c601..26813b0f1dfd 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - bl->nbufs++; + /* + * If the buffer list was upgraded to a ring-based one, or removed, + * while the request was in-flight in io-wq, drop it. + */ + if (bl && !(bl->flags & IOBL_BUF_RING)) { + list_add(&buf->list, &bl->buf_list); + bl->nbufs++; + } else { + kfree(buf); + } req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; io_ring_submit_unlock(ctx, issue_flags); return true; diff --git a/io_uring/register.c b/io_uring/register.c index 6015a3e9ce69..0148735f7711 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) return -EPERM; /* * Similar to seccomp, disallow setting a filter if task_no_new_privs - * is true and we're not CAP_SYS_ADMIN. + * is false and we're not CAP_SYS_ADMIN. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) @@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) /* * Similar to seccomp, disallow setting a filter if task_no_new_privs - * is true and we're not CAP_SYS_ADMIN. + * is false and we're not CAP_SYS_ADMIN. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) @@ -633,7 +633,15 @@ overflow: ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; + /* + * Just mark any flag we may have missed and that the application + * should act on unconditionally. Worst case it'll be an extra + * syscall. + */ + atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags); ctx->rings = n.rings; + rcu_assign_pointer(ctx->rings_rcu, n.rings); + ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); @@ -642,6 +650,9 @@ overflow: out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); + /* Wait for concurrent io_ctx_mark_taskrun() */ + if (to_free == &o) + synchronize_rcu_expedited(); io_register_free_rings(ctx, to_free); if (ctx->sq_data) diff --git a/io_uring/tw.c b/io_uring/tw.c index 1ee2b8ab07c8..2f2b4ac4b126 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } +/* + * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the + * RCU protected rings pointer to be safe against concurrent ring resizing. + */ +static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx) +{ + lockdep_assert_in_rcu_read_lock(); + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) { + struct io_rings *rings = rcu_dereference(ctx->rings_rcu); + + atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags); + } +} + void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; @@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) */ if (!head) { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + io_ctx_mark_taskrun(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, false); } @@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req) if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; + /* + * Doesn't need to use ->rings_rcu, as resizing isn't supported for + * !DEFER_TASKRUN. + */ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); |
