From a46435537a844d0f7b4b620baf962cad136422de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Feb 2026 11:36:09 -0700 Subject: io_uring/cmd_net: use READ_ONCE() for ->addr3 read Any SQE read should use READ_ONCE(), to ensure the result is read once and only once. Doesn't really matter for this case, but it's better to keep these 100% consistent and always use READ_ONCE() for the prep side of SQE handling. Fixes: 5d24321e4c15 ("io_uring: Introduce getsockname io_uring cmd") Signed-off-by: Jens Axboe --- io_uring/cmd_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 57ddaf874611..125a81c520a6 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -146,7 +146,7 @@ static int io_uring_cmd_getsockname(struct socket *sock, return -EINVAL; uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ulen = u64_to_user_ptr(sqe->addr3); + ulen = u64_to_user_ptr(READ_ONCE(sqe->addr3)); peer = READ_ONCE(sqe->optlen); if (peer > 1) return -EINVAL; -- cgit v1.2.3 From 85f6c439a69afe4fa8a688512e586971e97e273a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Feb 2026 10:35:57 +0000 Subject: io_uring/timeout: READ_ONCE sqe->addr We should use READ_ONCE when reading from a SQE, make sure timeout gets a stable timespec address. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/timeout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 84dda24f3eb2..cb61d4862fc6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -462,7 +462,7 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) tr->ltimeout = true; if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2)))) return -EFAULT; if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) return -EINVAL; @@ -557,7 +557,7 @@ static int __io_timeout_prep(struct io_kiocb *req, data->req = req; data->flags = flags; - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) + if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr)))) return -EFAULT; if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) -- cgit v1.2.3 From 3d17d76d1ffb139a7492317b196ee03c8eabc9dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 27 Feb 2026 09:07:45 -0800 Subject: io_uring/zcrx: don't set rx_page_size when not requested The rx_buf_len parameter was recently added to the Rx zero-copy implementation. The expectation is that when not set system will maintain previous behavior and use the default buffer size (PAGE_SIZE). This works correctly at the iouring level, but we don't preserve the same "zero means default" semantics when registering the memory provider on the netdev. mp_param.rx_page_size is unconditionally set to PAGE_SIZE. This causes __net_mp_open_rxq() to check for QCFG_RX_PAGE_SIZE support in the driver, and return -EOPNOTSUPP for drivers that don't advertise it -- even though the user never asked for large buffers. Only set mp_param.rx_page_size when rx_buf_len was explicitly provided, so that the default page size path works on all zcrx-capable drivers. 
mlx5 and fbnic only support 4kB pages in the current release. Fixes: 795663b4d160 ("io_uring/zcrx: implement large rx buffer support") Signed-off-by: Jakub Kicinski Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 97984a73a95d..19b287d21f4b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -837,7 +837,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto netdev_put_unlock; - mp_param.rx_page_size = 1U << ifq->niov_shift; + if (reg.rx_buf_len) + mp_param.rx_page_size = 1U << ifq->niov_shift; mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); -- cgit v1.2.3 From c36e28becd0586ac98318fd335e5e91d19cd2623 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2026 14:32:04 +0000 Subject: io_uring/net: reject SEND_VECTORIZED when unsupported IORING_SEND_VECTORIZED with registered buffers is not implemented but could be. Don't silently ignore the flag in this case but reject it with an error. It only affects sendzc as normal sends don't support registered buffers. 
Fixes: 6f02527729bd3 ("io_uring/net: Allow to do vectorized send") Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 8576c6cb2236..d27adbe3f20b 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -375,6 +375,8 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_namelen = addr_len; } if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + if (sr->flags & IORING_SEND_VECTORIZED) + return -EINVAL; req->flags |= REQ_F_IMPORT_BUFFER; return 0; } -- cgit v1.2.3 From 531bb98a030cc1073bd7ed9a502c0a3a781e92ee Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 4 Mar 2026 12:37:43 +0000 Subject: io_uring/zcrx: use READ_ONCE with user shared RQEs Refill queue entries are shared with the user space, use READ_ONCE when reading them. Fixes: 34a3e60821ab9 ("io_uring/zcrx: implement zerocopy receive pp memory provider") Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 19b287d21f4b..0461edebb042 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -927,11 +927,12 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, struct io_zcrx_ifq *ifq, struct net_iov **ret_niov) { + __u64 off = READ_ONCE(rqe->off); unsigned niov_idx, area_idx; struct io_zcrx_area *area; - area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; - niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; + area_idx = off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; if (unlikely(rqe->__pad || area_idx)) return false; -- cgit v1.2.3 From 3306a589e598b50a5bbdfe837371670b507043c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 9 Mar 2026 15:34:41 +0100 Subject: 
io_uring/register: fix comment about task_no_new_privs The actual code is right, but the comment is the wrong way around. Fixes: ed82f35b926b ("io_uring: allow registration of per-task restrictions") Signed-off-by: Jann Horn Signed-off-by: Jens Axboe --- io_uring/register.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/register.c b/io_uring/register.c index 594b1f2ce875..a839b22fd392 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) return -EPERM; /* * Similar to seccomp, disallow setting a filter if task_no_new_privs - * is true and we're not CAP_SYS_ADMIN. + * is false and we're not CAP_SYS_ADMIN. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) @@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) /* * Similar to seccomp, disallow setting a filter if task_no_new_privs - * is true and we're not CAP_SYS_ADMIN. + * is false and we're not CAP_SYS_ADMIN. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) -- cgit v1.2.3 From 785d4625d3e05bb0ac536ff4fd74d096cfe51714 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:20:14 -0600 Subject: io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration Since the caller, __io_uring_run_bpf_filters(), doesn't prevent migration, it should use the migration disabling variant for running the BPF program. 
Fixes: d42eb05e60fe ("io_uring: add support for BPF filtering for opcode restrictions") Signed-off-by: Jens Axboe --- io_uring/bpf_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 28a23e92ee81..9cc44764e0ac 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, do { if (filter == &dummy_filter) return -EACCES; - ret = bpf_prog_run(filter->prog, &bpf_ctx); + ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx); if (!ret) return -EACCES; filter = filter->next; -- cgit v1.2.3 From 96189080265e6bb5dde3a4afbaf947af493e3f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:21:37 -0600 Subject: io_uring: ensure ctx->rings is stable for task work flags manipulation If DEFER_TASKRUN | SETUP_TASKRUN is used and task work is added while the ring is being resized, it's possible for the OR'ing of IORING_SQ_TASKRUN to happen in the small window of swapping into the new rings and the old rings being freed. Prevent this by adding a 2nd ->rings pointer, ->rings_rcu, which is protected by RCU. The task work flags manipulation is inside RCU already, and if the resize ring freeing is done post an RCU synchronize, then there's no need to add locking to the fast path of task work additions. Note: this is only done for DEFER_TASKRUN, as that's the only setup mode that supports ring resizing. If this ever changes, then they too need to use the io_ctx_mark_taskrun() helper. 
Link: https://lore.kernel.org/io-uring/20260309062759.482210-1-naup96721@gmail.com/ Cc: stable@vger.kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Reported-by: Hao-Yu Yang Suggested-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 2 ++ io_uring/register.c | 11 +++++++++++ io_uring/tw.c | 22 ++++++++++++++++++++-- 4 files changed, 34 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..dd1420bfcb73 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -388,6 +388,7 @@ struct io_ring_ctx { * regularly bounce b/w CPUs. */ struct { + struct io_rings __rcu *rings_rcu; struct llist_head work_llist; struct llist_head retry_llist; unsigned long check_cq; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ccab8562d273..20fdc442e014 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx) io_free_region(ctx->user, &ctx->sq_region); io_free_region(ctx->user, &ctx->ring_region); ctx->rings = NULL; + RCU_INIT_POINTER(ctx->rings_rcu, NULL); ctx->sq_sqes = NULL; } @@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (ret) return ret; ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); + rcu_assign_pointer(ctx->rings_rcu, rings); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); diff --git a/io_uring/register.c b/io_uring/register.c index a839b22fd392..5f3eb018fb32 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -633,7 +633,15 @@ overflow: ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; + /* + * Just mark any flag we may have missed and that the application + * should act on unconditionally. Worst case it'll be an extra + * syscall. 
+ */ + atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags); ctx->rings = n.rings; + rcu_assign_pointer(ctx->rings_rcu, n.rings); + ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); @@ -642,6 +650,9 @@ overflow: out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); + /* Wait for concurrent io_ctx_mark_taskrun() */ + if (to_free == &o) + synchronize_rcu_expedited(); io_register_free_rings(ctx, to_free); if (ctx->sq_data) diff --git a/io_uring/tw.c b/io_uring/tw.c index 1ee2b8ab07c8..2f2b4ac4b126 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } +/* + * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the + * RCU protected rings pointer to be safe against concurrent ring resizing. + */ +static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx) +{ + lockdep_assert_in_rcu_read_lock(); + + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) { + struct io_rings *rings = rcu_dereference(ctx->rings_rcu); + + atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags); + } +} + void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; @@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) */ if (!head) { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + io_ctx_mark_taskrun(ctx); if (ctx->has_evfd) io_eventfd_signal(ctx, false); } @@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req) if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; + /* + * Doesn't need to use ->rings_rcu, as resizing isn't supported for + * !DEFER_TASKRUN. 
+ */ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); -- cgit v1.2.3 From 177c69432161f6e4bab07ccacf8a1748a6898a6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Mar 2026 14:35:49 -0600 Subject: io_uring/eventfd: use ctx->rings_rcu for flags checking Similarly to what commit e78f7b70e837 did for local task work additions, use ->rings_rcu under RCU rather than dereference ->rings directly. See that commit for more details. Cc: stable@vger.kernel.org Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 78f8ab7db104..ab789e1ebe91 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { bool skip = false; struct io_ev_fd *ev_fd; - - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return; + struct io_rings *rings; guard(rcu)(); + + rings = rcu_dereference(ctx->rings_rcu); + if (!rings) + return; + if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; ev_fd = rcu_dereference(ctx->io_ev_fd); /* * Check again if ev_fd exists in case an io_eventfd_unregister call -- cgit v1.2.3 From 6f02c6b196036dbb6defb4647d8707d29b7fe95b Mon Sep 17 00:00:00 2001 From: Tom Ryan Date: Mon, 9 Mar 2026 22:20:02 -0700 Subject: io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops When IORING_SETUP_SQE_MIXED is used without IORING_SETUP_NO_SQARRAY, the boundary check for 128-byte SQE operations in io_init_req() validated the logical SQ head position rather than the physical SQE index. The existing check: !(ctx->cached_sq_head & (ctx->sq_entries - 1)) ensures the logical position isn't at the end of the ring, which is correct for NO_SQARRAY rings where physical == logical. 
However, when sq_array is present, an unprivileged user can remap any logical position to an arbitrary physical index via sq_array. Setting sq_array[N] = sq_entries - 1 places a 128-byte operation at the last physical SQE slot, causing the 128-byte memcpy in io_uring_cmd_sqe_copy() to read 64 bytes past the end of the SQE array. Replace the cached_sq_head alignment check with a direct validation of the physical SQE index, which correctly handles both sq_array and NO_SQARRAY cases. Fixes: 1cba30bf9fdd ("io_uring: add support for IORING_SETUP_SQE_MIXED") Signed-off-by: Tom Ryan Link: https://patch.msgid.link/20260310052003.72871-1-ryan36005@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 20fdc442e014..20ec8fdafcae 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, * well as 2 contiguous entries. */ if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || - !(ctx->cached_sq_head & (ctx->sq_entries - 1))) + (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1) return io_init_fail_req(req, -EINVAL); /* * A 128b operation on a mixed SQ uses two entries, so we have -- cgit v1.2.3 From c2c185be5c85d37215397c8e8781abf0a69bec1f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Mar 2026 08:59:25 -0600 Subject: io_uring/kbuf: check if target buffer list is still legacy on recycle There's a gap between when the buffer was grabbed and when it potentially gets recycled, where if the list is empty, someone could've upgraded it to a ring provided type. This can happen if the request is forced via io-wq. The legacy recycling is missing checking if the buffer_list still exists, and if it's of the correct type. Add those checks. 
Cc: stable@vger.kernel.org Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers") Reported-by: Keenan Dong Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index dae5b4ab3819..e7f444953dfb 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -111,9 +111,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - bl->nbufs++; + /* + * If the buffer list was upgraded to a ring-based one, or removed, + * while the request was in-flight in io-wq, drop it. + */ + if (bl && !(bl->flags & IOBL_BUF_RING)) { + list_add(&buf->list, &bl->buf_list); + bl->nbufs++; + } else { + kfree(buf); + } req->flags &= ~REQ_F_BUFFER_SELECTED; + req->kbuf = NULL; io_ring_submit_unlock(ctx, issue_flags); return true; -- cgit v1.2.3