diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 02:22:30 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 02:22:30 +0300 |
| commit | 23acda7c221a76ff711d65f4ca90029d43b249a0 (patch) | |
| tree | 3e7745c9210489864e153990c06833d7d47a3dcd /include | |
| parent | 7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (diff) | |
| parent | c5e9f6a96bf7379da87df1b852b90527e242b56f (diff) | |
| download | linux-23acda7c221a76ff711d65f4ca90029d43b249a0.tar.xz | |
Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Add a callback driven main loop for io_uring, and BPF struct_ops
on top to allow implementing custom event loop logic
- Decouple IOPOLL from being a ring-wide all-or-nothing setting,
allowing IOPOLL use cases to also issue certain whitelisted
non-polled opcodes
- Timeout improvements. Migrate internal timeout storage from
timespec64 to ktime_t for simpler arithmetic and to avoid copying
timespec data
- Zero-copy receive (zcrx) updates:
- Add a device-less mode (ZCRX_REG_NODEV) for testing and
experimentation where data flows through the copy fallback path
- Fix two-step unregistration regression, DMA length calculations,
xarray mark usage, and a potential 32-bit overflow in id
shifting
- Refactoring toward multi-area support: dedicated refill queue
struct, consolidated DMA syncing, netmem array refilling format,
and guard-based locking
- Zero-copy transmit (zctx) cleanup:
- Unify io_send_zc() and io_sendmsg_zc() into a single function
- Add vectorized registered buffer send for IORING_OP_SEND_ZC
- Add separate notification user_data via sqe->addr3 so
notification and completion CQEs can be distinguished without
extra reference counting
- Switch struct io_ring_ctx internal bitfields to explicit flag bits
with atomic-safe accessors, and annotate the known harmless races on
those flags
- Various optimizations caching ctx and other request fields in local
variables to avoid repeated loads, and cleanups for tctx setup, ring
fd registration, and read path early returns
* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
io_uring: unify getting ctx from passed in file descriptor
io_uring/register: don't get a reference to the registered ring fd
io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
io_uring/tctx: have io_uring_alloc_task_context() return tctx
io_uring/timeout: use 'ctx' consistently
io_uring/rw: clean up __io_read() obsolete comment and early returns
io_uring/zcrx: use correct mmap off constants
io_uring/zcrx: use dma_len for chunk size calculation
io_uring/zcrx: don't clear not allocated niovs
io_uring/zcrx: don't use mark0 for allocating xarray
io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
io_uring/zcrx: reject REG_NODEV with large rx_buf_size
io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
io_uring/rsrc: use io_cache_free() to free node
io_uring/zcrx: rename zcrx [un]register functions
io_uring/zcrx: check ctrl op payload struct sizes
io_uring/zcrx: cache fallback availability in zcrx ctx
io_uring/zcrx: warn on a repeated area append
io_uring/zcrx: consolidate dma syncing
io_uring/zcrx: netmem array as refilling format
...
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/io_uring_types.h | 47 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring.h | 101 | ||||
| -rw-r--r-- | include/uapi/linux/io_uring/zcrx.h | 115 |
3 files changed, 155 insertions, 108 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 214fdbd49052..244392026c6d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -8,6 +8,9 @@ #include <linux/llist.h> #include <uapi/linux/io_uring.h> +struct iou_loop_params; +struct io_uring_bpf_ops; + enum { /* * A hint to not wake right away but delay until there are enough of @@ -41,6 +44,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct iou_loop_params; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -268,24 +273,30 @@ struct io_alloc_cache { unsigned int init_clear; }; +enum { + IO_RING_F_DRAIN_NEXT = BIT(0), + IO_RING_F_OP_RESTRICTED = BIT(1), + IO_RING_F_REG_RESTRICTED = BIT(2), + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), + IO_RING_F_DRAIN_ACTIVE = BIT(4), + IO_RING_F_HAS_EVFD = BIT(5), + /* all CQEs should be posted only by the submitter task */ + IO_RING_F_TASK_COMPLETE = BIT(6), + IO_RING_F_LOCKLESS_CQ = BIT(7), + IO_RING_F_SYSCALL_IOPOLL = BIT(8), + IO_RING_F_POLL_ACTIVATED = BIT(9), + IO_RING_F_DRAIN_DISABLED = BIT(10), + IO_RING_F_COMPAT = BIT(11), + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { + /* ring setup flags */ unsigned int flags; - unsigned int drain_next: 1; - unsigned int op_restricted: 1; - unsigned int reg_restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int has_evfd: 1; - /* all CQEs should be posted only by the submitter task */ - unsigned int task_complete: 1; - unsigned int lockless_cq: 1; - unsigned int syscall_iopoll: 1; - unsigned int poll_activated: 1; - unsigned int drain_disabled: 1; - unsigned int compat: 1; - unsigned int iowq_limits_set : 1; + /* internal state flags IO_RING_F_* flags , mostly read-only */ + unsigned int int_flags; struct task_struct *submitter_task; struct io_rings *rings; @@ -355,6 +366,9 @@ struct io_ring_ctx { struct io_alloc_cache rw_cache; struct 
io_alloc_cache cmd_cache; + int (*loop_step)(struct io_ring_ctx *ctx, + struct iou_loop_params *); + /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() @@ -477,6 +491,8 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif + struct io_uring_bpf_ops *bpf_ops; + /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from @@ -545,6 +561,7 @@ enum { REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, + REQ_F_IOPOLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -638,6 +655,8 @@ enum { REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), /* ->sqe_copy() has been called, if necessary */ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), + /* request must be iopolled to completion (set in ->issue()) */ + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), }; struct io_tw_req { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ff16141c8a5..17ac1b785440 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,6 +10,8 @@ #include <linux/fs.h> #include <linux/types.h> +#include <linux/io_uring/zcrx.h> + /* * this file is shared with liburing and that has to autodetect * if linux/time_types.h is available or not, it can @@ -341,6 +343,10 @@ enum io_uring_op { /* * sqe->timeout_flags + * + * IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout + * value in nanoseconds instead of + * pointing to a timespec. 
*/ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) @@ -349,6 +355,7 @@ enum io_uring_op { #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_MULTISHOT (1U << 6) +#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* @@ -1050,100 +1057,6 @@ struct io_timespec { __u64 tv_nsec; }; -/* Zero copy receive refill queue entry */ -struct io_uring_zcrx_rqe { - __u64 off; - __u32 len; - __u32 __pad; -}; - -struct io_uring_zcrx_cqe { - __u64 off; - __u64 __pad; -}; - -/* The bit from which area id is encoded into offsets */ -#define IORING_ZCRX_AREA_SHIFT 48 -#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) - -struct io_uring_zcrx_offsets { - __u32 head; - __u32 tail; - __u32 rqes; - __u32 __resv2; - __u64 __resv[2]; -}; - -enum io_uring_zcrx_area_flags { - IORING_ZCRX_AREA_DMABUF = 1, -}; - -struct io_uring_zcrx_area_reg { - __u64 addr; - __u64 len; - __u64 rq_area_token; - __u32 flags; - __u32 dmabuf_fd; - __u64 __resv2[2]; -}; - -enum zcrx_reg_flags { - ZCRX_REG_IMPORT = 1, -}; - -enum zcrx_features { - /* - * The user can ask for the desired rx page size by passing the - * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. 
- */ - ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, -}; - -/* - * Argument for IORING_REGISTER_ZCRX_IFQ - */ -struct io_uring_zcrx_ifq_reg { - __u32 if_idx; - __u32 if_rxq; - __u32 rq_entries; - __u32 flags; - - __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ - __u64 region_ptr; /* struct io_uring_region_desc * */ - - struct io_uring_zcrx_offsets offsets; - __u32 zcrx_id; - __u32 rx_buf_len; - __u64 __resv[3]; -}; - -enum zcrx_ctrl_op { - ZCRX_CTRL_FLUSH_RQ, - ZCRX_CTRL_EXPORT, - - __ZCRX_CTRL_LAST, -}; - -struct zcrx_ctrl_flush_rq { - __u64 __resv[6]; -}; - -struct zcrx_ctrl_export { - __u32 zcrx_fd; - __u32 __resv1[11]; -}; - -struct zcrx_ctrl { - __u32 zcrx_id; - __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[2]; - - union { - struct zcrx_ctrl_export zc_export; - struct zcrx_ctrl_flush_rq zc_flush; - }; -}; - #ifdef __cplusplus } #endif diff --git a/include/uapi/linux/io_uring/zcrx.h b/include/uapi/linux/io_uring/zcrx.h new file mode 100644 index 000000000000..5ce02c7a6096 --- /dev/null +++ b/include/uapi/linux/io_uring/zcrx.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring zerocopy receive (zcrx) interface. + * + * Copyright (C) 2026 Pavel Begunkov + * Copyright (C) 2026 David Wei + * Copyright (C) Meta Platforms, Inc. 
+ */ +#ifndef LINUX_IO_ZCRX_H +#define LINUX_IO_ZCRX_H + +#include <linux/types.h> + +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +enum io_uring_zcrx_area_flags { + IORING_ZCRX_AREA_DMABUF = 1, +}; + +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 dmabuf_fd; + __u64 __resv2[2]; +}; + +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, + + /* + * Register a zcrx instance without a net device. All data will be + * copied. The refill queue entries might not be automatically + * consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ. + */ + ZCRX_REG_NODEV = 2, +}; + +enum zcrx_features { + /* + * The user can ask for the desired rx page size by passing the + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. 
+ */ + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + +enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, + + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[2]; + + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; +}; + +#endif /* LINUX_IO_ZCRX_H */ |
