summary | refs | log | tree | commit | diff
path: root/include/linux
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 10:23:59 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 10:23:59 +0300
commit 9b40ba14edcdf70240af8114092a76f75f070774 (patch)
tree 7fd6958aa2c7ab93d86d2afed222b37c67e1c21b /include/linux
parent d29fd593e6836c96c6fd6df2b0cc6a47dda21b74 (diff)
parent d9b710f683dc68b5c0b7dd0c6c64aeb5d27a1ac4 (diff)
download linux-9b40ba14edcdf70240af8114092a76f75f070774.tar.xz
Merge tag 'for-7.2/io_uring-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe: - Rework the task_work infrastructure. Both the local (DEFER_TASKRUN) and the normal (tctx) task_work lists were llist based, which is LIFO ordered, and hence each run had to do an O(n) list reversal pass first to restore queue order. Additionally, to cap the amount of task_work run, each method needed a retry list as well. Add a lockless MPSC FIFO queue (based on Dmitry Vyukov's intrusive MPSC algorithm) and switch both task_work lists to it. It performs better than llists, and we can then also ditch the retry lists, since entries are popped one at a time. On top of those changes, run the tctx fallback task_work directly and remove the now-unused per-ctx fallback machinery entirely. - zcrx user notifications. Add a mechanism for zcrx to communicate conditions back to userspace via a dedicated CQE, with the initial users being notification on running out of buffers and on a frag copy fallback, plus shared-memory notification statistics. Alongside that, a series of zcrx reliability and cleanup fixes: more reliable scrubbing, poisoning pointers on unregistration, dropping an extra ifq close, adding a ctx back-pointer, reordering fd allocation in the export path, and killing a dead 'sock' member. - Allow using io_uring registered buffers for plain SEND and RECV, not just for the zero-copy send path. This enables targets like ublk's NBD backend to push/pull IO data directly to/from a registered buffer over a plain send/recv on a TCP socket. - Registered buffer improvements: account huge pages correctly, bump the io_mapped_ubuf length field to size_t, and raise the previous 1GB registered buffer size limit. - Restrict the ctx access exposed to io_uring BPF struct_ops programs by handing them an opaque type rather than the full io_ring_ctx, and add a separate MAINTAINERS entry for the bpf-ops code. - Allow opcode filtering on IORING_OP_CONNECT.
- Validate ring-provided buffer addresses with access_ok(), and align the legacy buffer add limit with MAX_BIDS_PER_BGID. - Various other cleanups and minor fixes, including avoiding msghdr async data on connect/bind, dropping async_size for OP_LISTEN, making the POLL_FIRST receive side checks consistent, re-checking IO_WQ_BIT_EXIT for each linked work item, and using trace_call__##name() at guarded tracepoint call sites. * tag 'for-7.2/io_uring-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (31 commits) io_uring/bpf-ops: add a separate maintainer entry io_uring/net: make POLL_FIRST receive side checks consistent io_uring: remove the per-ctx fallback task_work machinery io_uring: run the tctx task_work fallback directly io_uring: switch normal task_work to a mpscq io_uring: switch local task_work to a mpscq io_uring/mpscq: add lockless multi-producer, single-consumer FIFO queue io_uring: grab RCU read lock marking task run io_uring/zcrx: kill dead 'sock' member in struct io_zcrx_args io_uring/kbuf: validate ring provided buffer addresses with access_ok() io_uring/net: support registered buffer for plain send and recv io_uring/nop: Drop a wrong comment in struct io_nop io_uring/net: Remove async_size for OP_LISTEN io_uring/net: Avoid msghdr on op_connect/op_bind async data io_uring/bpf-ops: restrict ctx access to BPF io_uring/io-wq: re-check IO_WQ_BIT_EXIT for each linked work item io_uring/kbuf: align legacy buffer add limit with MAX_BIDS_PER_BGID io_uring/zcrx: add shared-memory notification statistics io_uring/zcrx: notify user on frag copy fallback io_uring/zcrx: notify user when out of buffers ...
Diffstat (limited to 'include/linux')
-rw-r--r-- include/linux/io_uring_types.h 46
1 files changed, 38 insertions, 8 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 244392026c6d..6415a3353ee0 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,6 +55,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+/*
+ * Lockless multi-producer, single-consumer FIFO queue, see
+ * io_uring/mpscq.h for the implementation and rules. Defined here so
+ * that it can be embedded in io_ring_ctx. This is the producer side
+ * only - the consumer cursor is kept separately, on a cacheline that
+ * isn't dirtied by the producers.
+ */
+struct mpscq {
+ struct llist_node *tail; /* producers */
+ struct llist_node stub;
+};
+
struct io_wq_work {
struct io_wq_work_node list;
atomic_t flags;
@@ -119,6 +131,11 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct task_struct *task;
struct io_wq *io_wq;
+ /*
+ * Consumer cursor for ->task_list. Only popped by the task itself,
+ * or by ->fallback_work once the task can no longer run task_work.
+ */
+ struct llist_node *task_head;
struct file *registered_rings[IO_RINGFD_REG_MAX];
struct xarray xa;
@@ -127,8 +144,13 @@ struct io_uring_task {
atomic_t inflight_tracked;
struct percpu_counter inflight;
+ /* drains ->task_list once the task can no longer run task_work */
+ struct work_struct fallback_work;
+
struct { /* task_work */
- struct llist_head task_list;
+ struct mpscq task_list;
+ /* BIT(0) guards adding tw only once */
+ unsigned long tw_pending;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
@@ -290,6 +312,8 @@ enum {
IO_RING_F_IOWQ_LIMITS_SET = BIT(12),
};
+struct iou_ctx {};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -346,6 +370,14 @@ struct io_ring_ctx {
bool poll_multi_queue;
struct list_head iopoll_list;
+ /*
+ * Consumer cursor for ->work_list, protected by ->uring_lock.
+ * Deliberately kept away from the producer side of the queue,
+ * as it's written for every popped entry, and the producer
+ * cacheline is contended enough as it is.
+ */
+ struct llist_node *work_head;
+
struct io_file_table file_table;
struct io_rsrc_data buf_table;
struct io_alloc_cache node_cache;
@@ -366,7 +398,7 @@ struct io_ring_ctx {
struct io_alloc_cache rw_cache;
struct io_alloc_cache cmd_cache;
- int (*loop_step)(struct io_ring_ctx *ctx,
+ int (*loop_step)(struct iou_ctx *,
struct iou_loop_params *);
/*
@@ -403,8 +435,7 @@ struct io_ring_ctx {
*/
struct {
struct io_rings __rcu *rings_rcu;
- struct llist_head work_llist;
- struct llist_head retry_llist;
+ struct mpscq work_list;
unsigned long check_cq;
atomic_t cq_wait_nr;
atomic_t cq_timeouts;
@@ -446,6 +477,9 @@ struct io_ring_ctx {
/* Stores zcrx object pointers of type struct io_zcrx_ifq */
struct xarray zcrx_ctxs;
+ /* Used for accounting references on pages in registered buffers */
+ struct xarray hpage_acct;
+
u32 pers_next;
struct xarray personalities;
@@ -464,8 +498,6 @@ struct io_ring_ctx {
struct mutex tctx_lock;
/* ctx exit and cancelation */
- struct llist_head fallback_llist;
- struct delayed_work fallback_work;
struct work_struct exit_work;
struct completion ref_comp;
@@ -725,8 +757,6 @@ struct io_kiocb {
*/
u16 buf_index;
- unsigned nr_tw;
-
/* REQ_F_* flags */
io_req_flags_t flags;