summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 02:22:30 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-04-14 02:22:30 +0300
commit 23acda7c221a76ff711d65f4ca90029d43b249a0 (patch)
tree 3e7745c9210489864e153990c06833d7d47a3dcd /include
parent 7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (diff)
parent c5e9f6a96bf7379da87df1b852b90527e242b56f (diff)
download linux-23acda7c221a76ff711d65f4ca90029d43b249a0.tar.xz
Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:

 - Add a callback driven main loop for io_uring, and BPF struct_ops on top to allow implementing custom event loop logic

 - Decouple IOPOLL from being a ring-wide all-or-nothing setting, allowing IOPOLL use cases to also issue certain white listed non-polled opcodes

 - Timeout improvements. Migrate internal timeout storage from timespec64 to ktime_t for simpler arithmetic and avoid copying of timespec data

 - Zero-copy receive (zcrx) updates:

     - Add a device-less mode (ZCRX_REG_NODEV) for testing and experimentation where data flows through the copy fallback path

     - Fix two-step unregistration regression, DMA length calculations, xarray mark usage, and a potential 32-bit overflow in id shifting

     - Refactoring toward multi-area support: dedicated refill queue struct, consolidated DMA syncing, netmem array refilling format, and guard-based locking

 - Zero-copy transmit (zctx) cleanup:

     - Unify io_send_zc() and io_sendmsg_zc() into a single function

     - Add vectorized registered buffer send for IORING_OP_SEND_ZC

     - Add separate notification user_data via sqe->addr3 so notification and completion CQEs can be distinguished without extra reference counting

 - Switch struct io_ring_ctx internal bitfields to explicit flag bits with atomic-safe accessors, and annotate the known harmless races on those flags

 - Various optimizations caching ctx and other request fields in local variables to avoid repeated loads, and cleanups for tctx setup, ring fd registration, and read path early returns

* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
  io_uring: unify getting ctx from passed in file descriptor
  io_uring/register: don't get a reference to the registered ring fd
  io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
  io_uring/tctx: have io_uring_alloc_task_context() return tctx
  io_uring/timeout: use 'ctx' consistently
  io_uring/rw: clean up __io_read() obsolete comment and early returns
  io_uring/zcrx: use correct mmap off constants
  io_uring/zcrx: use dma_len for chunk size calculation
  io_uring/zcrx: don't clear not allocated niovs
  io_uring/zcrx: don't use mark0 for allocating xarray
  io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
  io_uring/zcrx: reject REG_NODEV with large rx_buf_size
  io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
  io_uring/rsrc: use io_cache_free() to free node
  io_uring/zcrx: rename zcrx [un]register functions
  io_uring/zcrx: check ctrl op payload struct sizes
  io_uring/zcrx: cache fallback availability in zcrx ctx
  io_uring/zcrx: warn on a repeated area append
  io_uring/zcrx: consolidate dma syncing
  io_uring/zcrx: netmem array as refiling format
  ...
Diffstat (limited to 'include')
-rw-r--r-- include/linux/io_uring_types.h 47
-rw-r--r-- include/uapi/linux/io_uring.h 101
-rw-r--r-- include/uapi/linux/io_uring/zcrx.h 115
3 files changed, 155 insertions, 108 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 214fdbd49052..244392026c6d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -8,6 +8,9 @@
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>
+struct iou_loop_params;
+struct io_uring_bpf_ops;
+
enum {
/*
* A hint to not wake right away but delay until there are enough of
@@ -41,6 +44,8 @@ enum io_uring_cmd_flags {
IO_URING_F_COMPAT = (1 << 12),
};
+struct iou_loop_params;
+
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -268,24 +273,30 @@ struct io_alloc_cache {
unsigned int init_clear;
};
+enum {
+ IO_RING_F_DRAIN_NEXT = BIT(0),
+ IO_RING_F_OP_RESTRICTED = BIT(1),
+ IO_RING_F_REG_RESTRICTED = BIT(2),
+ IO_RING_F_OFF_TIMEOUT_USED = BIT(3),
+ IO_RING_F_DRAIN_ACTIVE = BIT(4),
+ IO_RING_F_HAS_EVFD = BIT(5),
+ /* all CQEs should be posted only by the submitter task */
+ IO_RING_F_TASK_COMPLETE = BIT(6),
+ IO_RING_F_LOCKLESS_CQ = BIT(7),
+ IO_RING_F_SYSCALL_IOPOLL = BIT(8),
+ IO_RING_F_POLL_ACTIVATED = BIT(9),
+ IO_RING_F_DRAIN_DISABLED = BIT(10),
+ IO_RING_F_COMPAT = BIT(11),
+ IO_RING_F_IOWQ_LIMITS_SET = BIT(12),
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
+ /* ring setup flags */
unsigned int flags;
- unsigned int drain_next: 1;
- unsigned int op_restricted: 1;
- unsigned int reg_restricted: 1;
- unsigned int off_timeout_used: 1;
- unsigned int drain_active: 1;
- unsigned int has_evfd: 1;
- /* all CQEs should be posted only by the submitter task */
- unsigned int task_complete: 1;
- unsigned int lockless_cq: 1;
- unsigned int syscall_iopoll: 1;
- unsigned int poll_activated: 1;
- unsigned int drain_disabled: 1;
- unsigned int compat: 1;
- unsigned int iowq_limits_set : 1;
+ /* internal state flags (IO_RING_F_*), mostly read-only */
+ unsigned int int_flags;
struct task_struct *submitter_task;
struct io_rings *rings;
@@ -355,6 +366,9 @@ struct io_ring_ctx {
struct io_alloc_cache rw_cache;
struct io_alloc_cache cmd_cache;
+ int (*loop_step)(struct io_ring_ctx *ctx,
+ struct iou_loop_params *);
+
/*
* Any cancelable uring_cmd is added to this list in
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -477,6 +491,8 @@ struct io_ring_ctx {
DECLARE_HASHTABLE(napi_ht, 4);
#endif
+ struct io_uring_bpf_ops *bpf_ops;
+
/*
* Protection for resize vs mmap races - both the mmap and resize
* side will need to grab this lock, to prevent either side from
@@ -545,6 +561,7 @@ enum {
REQ_F_HAS_METADATA_BIT,
REQ_F_IMPORT_BUFFER_BIT,
REQ_F_SQE_COPIED_BIT,
+ REQ_F_IOPOLL_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -638,6 +655,8 @@ enum {
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
/* ->sqe_copy() has been called, if necessary */
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
+ /* request must be iopolled to completion (set in ->issue()) */
+ REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
};
struct io_tw_req {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ff16141c8a5..17ac1b785440 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -10,6 +10,8 @@
#include <linux/fs.h>
#include <linux/types.h>
+#include <linux/io_uring/zcrx.h>
+
/*
* this file is shared with liburing and that has to autodetect
* if linux/time_types.h is available or not, it can
@@ -341,6 +343,10 @@ enum io_uring_op {
/*
* sqe->timeout_flags
+ *
+ * IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout
+ * value in nanoseconds instead of
+ * pointing to a timespec.
*/
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_UPDATE (1U << 1)
@@ -349,6 +355,7 @@ enum io_uring_op {
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
+#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
@@ -1050,100 +1057,6 @@ struct io_timespec {
__u64 tv_nsec;
};
-/* Zero copy receive refill queue entry */
-struct io_uring_zcrx_rqe {
- __u64 off;
- __u32 len;
- __u32 __pad;
-};
-
-struct io_uring_zcrx_cqe {
- __u64 off;
- __u64 __pad;
-};
-
-/* The bit from which area id is encoded into offsets */
-#define IORING_ZCRX_AREA_SHIFT 48
-#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
-
-struct io_uring_zcrx_offsets {
- __u32 head;
- __u32 tail;
- __u32 rqes;
- __u32 __resv2;
- __u64 __resv[2];
-};
-
-enum io_uring_zcrx_area_flags {
- IORING_ZCRX_AREA_DMABUF = 1,
-};
-
-struct io_uring_zcrx_area_reg {
- __u64 addr;
- __u64 len;
- __u64 rq_area_token;
- __u32 flags;
- __u32 dmabuf_fd;
- __u64 __resv2[2];
-};
-
-enum zcrx_reg_flags {
- ZCRX_REG_IMPORT = 1,
-};
-
-enum zcrx_features {
- /*
- * The user can ask for the desired rx page size by passing the
- * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
- */
- ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
-};
-
-/*
- * Argument for IORING_REGISTER_ZCRX_IFQ
- */
-struct io_uring_zcrx_ifq_reg {
- __u32 if_idx;
- __u32 if_rxq;
- __u32 rq_entries;
- __u32 flags;
-
- __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
- __u64 region_ptr; /* struct io_uring_region_desc * */
-
- struct io_uring_zcrx_offsets offsets;
- __u32 zcrx_id;
- __u32 rx_buf_len;
- __u64 __resv[3];
-};
-
-enum zcrx_ctrl_op {
- ZCRX_CTRL_FLUSH_RQ,
- ZCRX_CTRL_EXPORT,
-
- __ZCRX_CTRL_LAST,
-};
-
-struct zcrx_ctrl_flush_rq {
- __u64 __resv[6];
-};
-
-struct zcrx_ctrl_export {
- __u32 zcrx_fd;
- __u32 __resv1[11];
-};
-
-struct zcrx_ctrl {
- __u32 zcrx_id;
- __u32 op; /* see enum zcrx_ctrl_op */
- __u64 __resv[2];
-
- union {
- struct zcrx_ctrl_export zc_export;
- struct zcrx_ctrl_flush_rq zc_flush;
- };
-};
-
#ifdef __cplusplus
}
#endif
diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h
new file mode 100644
index 000000000000..5ce02c7a6096
--- /dev/null
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring zerocopy receive (zcrx) interface.
+ *
+ * Copyright (C) 2026 Pavel Begunkov
+ * Copyright (C) 2026 David Wei
+ * Copyright (C) Meta Platforms, Inc.
+ */
+#ifndef LINUX_IO_ZCRX_H
+#define LINUX_IO_ZCRX_H
+
+#include <linux/types.h>
+
+/* Zero copy receive refill queue entry */
+struct io_uring_zcrx_rqe {
+ __u64 off;
+ __u32 len;
+ __u32 __pad;
+};
+
+struct io_uring_zcrx_cqe {
+ __u64 off;
+ __u64 __pad;
+};
+
+/* The bit from which area id is encoded into offsets */
+#define IORING_ZCRX_AREA_SHIFT 48
+#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
+
+struct io_uring_zcrx_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 rqes;
+ __u32 __resv2;
+ __u64 __resv[2];
+};
+
+enum io_uring_zcrx_area_flags {
+ IORING_ZCRX_AREA_DMABUF = 1,
+};
+
+struct io_uring_zcrx_area_reg {
+ __u64 addr;
+ __u64 len;
+ __u64 rq_area_token;
+ __u32 flags;
+ __u32 dmabuf_fd;
+ __u64 __resv2[2];
+};
+
+enum zcrx_reg_flags {
+ ZCRX_REG_IMPORT = 1,
+
+ /*
+ * Register a zcrx instance without a net device. All data will be
+ * copied. The refill queue entries might not be automatically
+ * consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ.
+ */
+ ZCRX_REG_NODEV = 2,
+};
+
+enum zcrx_features {
+ /*
+ * The user can ask for the desired rx page size by passing the
+ * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
+ */
+ ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
+};
+
+/*
+ * Argument for IORING_REGISTER_ZCRX_IFQ
+ */
+struct io_uring_zcrx_ifq_reg {
+ __u32 if_idx;
+ __u32 if_rxq;
+ __u32 rq_entries;
+ __u32 flags;
+
+ __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+ __u64 region_ptr; /* struct io_uring_region_desc * */
+
+ struct io_uring_zcrx_offsets offsets;
+ __u32 zcrx_id;
+ __u32 rx_buf_len; /* desired rx page size, see ZCRX_FEATURE_RX_PAGE_SIZE */
+ __u64 __resv[3];
+};
+
+enum zcrx_ctrl_op {
+ ZCRX_CTRL_FLUSH_RQ,
+ ZCRX_CTRL_EXPORT,
+
+ __ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl_flush_rq {
+ __u64 __resv[6];
+};
+
+struct zcrx_ctrl_export {
+ __u32 zcrx_fd;
+ __u32 __resv1[11];
+};
+
+struct zcrx_ctrl {
+ __u32 zcrx_id;
+ __u32 op; /* see enum zcrx_ctrl_op */
+ __u64 __resv[2];
+
+ union {
+ struct zcrx_ctrl_export zc_export;
+ struct zcrx_ctrl_flush_rq zc_flush;
+ };
+};
+
+#endif /* LINUX_IO_ZCRX_H */