From 4897f9b7f8bdcf93b8d3b466321fa00bb6d2e600 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:11 +0200 Subject: ethtool: Add support for 200Gbps per lane link modes Define 200G, 400G and 800G link modes using 200Gbps per lane. Signed-off-by: Jianbo Liu Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- include/uapi/linux/ethtool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1089b88efc7..e0bd726f84c1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2057,6 +2057,24 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, ETHTOOL_LINK_MODE_10baseT1BRR_Full_BIT = 102, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT = 103, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT = 104, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT = 105, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT = 106, + ETHTOOL_LINK_MODE_200000baseSR_Full_BIT = 107, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT = 108, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT = 109, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT = 110, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT = 111, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT = 112, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT = 113, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT = 114, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT = 115, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT = 116, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT = 117, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT = 118, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT = 119, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT = 120, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS -- cgit v1.2.3 From dcc0113acd3b77cca3c7e805fffd8ea4c5a675b3 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:16 -0800 Subject: netdev: add io_uring memory provider info Add a nested attribute for io_uring memory provider info. For now it is empty and its presence indicates that a particular page pool or queue has an io_uring memory provider attached. $ ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get [{'id': 80, 'ifindex': 2, 'inflight': 64, 'inflight-mem': 262144, 'napi-id': 525}, {'id': 79, 'ifindex': 2, 'inflight': 320, 'inflight-mem': 1310720, 'io_uring': {}, 'napi-id': 525}, ... $ ./cli.py --spec netlink/specs/netdev.yaml --dump queue-get [{'id': 0, 'ifindex': 1, 'type': 'rx'}, {'id': 0, 'ifindex': 1, 'type': 'tx'}, {'id': 0, 'ifindex': 2, 'napi-id': 513, 'type': 'rx'}, {'id': 1, 'ifindex': 2, 'napi-id': 514, 'type': 'rx'}, ... {'id': 12, 'ifindex': 2, 'io_uring': {}, 'napi-id': 525, 'type': 'rx'}, ... Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-6-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 15 +++++++++++++++ include/uapi/linux/netdev.h | 7 +++++++ tools/include/uapi/linux/netdev.h | 7 +++++++ 3 files changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index cbb544bd6c84..288923e965ae 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -114,6 +114,9 @@ attribute-sets: doc: Bitmask of enabled AF_XDP features. type: u64 enum: xsk-flags + - + name: io-uring-provider-info + attributes: [] - name: page-pool attributes: @@ -171,6 +174,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf this page-pool is attached to. type: u32 + - + name: io-uring + doc: io-uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: page-pool-info subset-of: page-pool @@ -296,6 +304,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf attached to this queue, if any. type: u32 + - + name: io-uring + doc: io_uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: qstats @@ -572,6 +585,7 @@ operations: - inflight-mem - detach-time - dmabuf + - io-uring dump: reply: *pp-reply config-cond: page-pool @@ -637,6 +651,7 @@ operations: - napi-id - ifindex - dmabuf + - io-uring dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) -- cgit v1.2.3 From 6f377873cb23905009759b7366b9fe85c2a6ff37 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 14 Feb 2025 16:09:36 -0800 Subject: io_uring/zcrx: add interface queue and refill queue Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ. The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again. The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data. For now, each io_uring instance only has a single ifq. Reviewed-by: Jens Axboe Signed-off-by: David Wei Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk Signed-off-by: Jens Axboe --- Kconfig | 2 + include/linux/io_uring_types.h | 6 ++ include/uapi/linux/io_uring.h | 43 +++++++++++- io_uring/KConfig | 10 +++ io_uring/Makefile | 1 + io_uring/io_uring.c | 7 ++ io_uring/memmap.h | 1 + io_uring/register.c | 7 ++ io_uring/zcrx.c | 149 +++++++++++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 35 ++++++++++ 10 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 io_uring/KConfig create mode 100644 io_uring/zcrx.c create mode 100644 io_uring/zcrx.h (limited to 'include/uapi/linux') diff --git a/Kconfig b/Kconfig index 745bc773f567..529ea7694ba9 100644 --- a/Kconfig +++ b/Kconfig @@ -30,3 +30,5 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +source "io_uring/KConfig" diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 123e69368730..c0fe8a00fe53 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,6 +40,8 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; +struct io_zcrx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -382,6 +384,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; + struct io_zcrx_ifq *ifq; + u32 pers_next; struct xarray personalities; @@ -434,6 +438,8 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; + /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ + struct io_mapped_region zcrx_region; }; /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e11c82638527..6a1632d0fba1 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -639,7 +639,8 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, - /* 32 reserved for zc rx */ + /* register a netdev hw rx queue for zerocopy */ + IORING_REGISTER_ZCRX_IFQ = 32, /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, @@ -956,6 +957,46 @@ enum io_uring_socket_op { SOCKET_URING_OP_SETSOCKOPT, }; +/* Zero copy receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u64 __resv[4]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/KConfig b/io_uring/KConfig new file mode 100644 index 000000000000..9e2a4beba1ef --- /dev/null +++ b/io_uring/KConfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL diff --git a/io_uring/Makefile b/io_uring/Makefile index d695b60dba4f..98e48339d84d 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o +obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b688953d1de8..58528bf61638 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); @@ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (ctx->ifq) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zcrx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/memmap.h b/io_uring/memmap.h index c898dcba2b4e..dad0aa5b1b45 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -2,6 +2,7 @@ #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL +#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/register.c b/io_uring/register.c index 9a4d2fbce4ae..cc23a4c205cd 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -30,6 +30,7 @@ #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx_ifq(ctx, arg); + break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 100644 index 000000000000..f3ace7e8264d --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" + +#define IO_RQ_MAX_ENTRIES 32768 + +static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd) +{ + size_t off, size; + void *ptr; + int ret; + + off = sizeof(struct io_uring); + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, + IORING_MAP_OFF_ZCRX_REGION); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + return 0; +} + +static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + ifq->rq_ring = NULL; + ifq->rqes = NULL; +} + +static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + ifq->ctx = ctx; + return ifq; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_free_rbuf_ring(ifq); + kfree(ifq); +} + +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + int ret; + + /* + * 1. Interface queue allocation. + * 2. It can observe data destined for sockets of other tasks. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (ctx->ifq) + return -EBUSY; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (!reg.area_ptr) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + ret = io_allocate_rbuf_ring(ifq, ®, &rd); + if (ret) + goto err; + + ifq->rq_entries = reg.rq_entries; + ifq->if_rxq = reg.if_rxq; + + reg.offsets.rqes = sizeof(struct io_uring); + reg.offsets.head = offsetof(struct io_uring, head); + reg.offsets.tail = offsetof(struct io_uring, tail); + + if (copy_to_user(arg, ®, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) { + ret = -EFAULT; + goto err; + } + + ctx->ifq = ifq; + return 0; +err: + io_zcrx_ifq_free(ifq); + return ret; +} + +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq = ctx->ifq; + + lockdep_assert_held(&ctx->uring_lock); + + if (!ifq) + return; + + ctx->ifq = NULL; + io_zcrx_ifq_free(ifq); +} + +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..58e4ab6c6083 --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include + +struct io_zcrx_ifq { + struct io_ring_ctx *ctx; + struct io_uring *rq_ring; + struct io_uring_zcrx_rqe *rqes; + u32 rq_entries; + + u32 if_rxq; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +#else +static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +#endif + +#endif -- cgit v1.2.3 From cf96310c5f9a0d542db99c887742811425ba2ec0 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 14 Feb 2025 16:09:37 -0800 Subject: io_uring/zcrx: add io_zcrx_area Add io_zcrx_area that represents a region of userspace memory that is used for zero copy. During ifq registration, userspace passes in the uaddr and len of userspace memory, which is then pinned by the kernel. Each net_iov is mapped to one of these pages. The freelist is a spinlock protected list that keeps track of all the net_iovs/pages that aren't used. For now, there is only one area per ifq and area registration happens implicitly as part of ifq registration. There is no API for adding/removing areas yet. The struct for area registration is there for future extensibility once we support multiple areas and TCP devmem. Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20250215000947.789731-3-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 9 +++++ io_uring/rsrc.c | 2 +- io_uring/rsrc.h | 1 + io_uring/zcrx.c | 89 ++++++++++++++++++++++++++++++++++++++++++- io_uring/zcrx.h | 16 ++++++++ 5 files changed, 114 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6a1632d0fba1..44844707d327 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -981,6 +981,15 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 __resv1; + __u64 __resv2[2]; +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index af39b69eb4fd..20b884c84e55 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index a6d883c62b22..abf86b5b8614 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -68,6 +68,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_buffer_validate(struct iovec *iov); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index f3ace7e8264d..04883a3ae80c 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -10,6 +10,7 @@ #include "kbuf.h" #include "memmap.h" #include "zcrx.h" +#include "rsrc.h" #define IO_RQ_MAX_ENTRIES 32768 @@ -44,6 +45,79 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) ifq->rqes = NULL; } +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + kvfree(area->freelist); + kvfree(area->nia.niovs); + if (area->pages) { + unpin_user_pages(area->pages, area->nia.num_niovs); + kvfree(area->pages); + } + kfree(area); +} + +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + int i, ret, nr_pages; + struct iovec iov; + + if (area_reg->flags || area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + iov.iov_base = u64_to_user_ptr(area_reg->addr); + iov.iov_len = area_reg->len; + ret = io_buffer_validate(&iov); + if (ret) + return ret; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(area->pages)) { + ret = PTR_ERR(area->pages); + area->pages = NULL; + goto err; + } + area->nia.num_niovs = nr_pages; + + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + for (i = 0; i < nr_pages; i++) + area->freelist[i] = i; + + area->free_count = nr_pages; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -59,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { + if (ifq->area) + io_zcrx_free_area(ifq->area); + io_free_rbuf_ring(ifq); kfree(ifq); } @@ -66,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { + struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; @@ -99,7 +177,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } reg.rq_entries = roundup_pow_of_two(reg.rq_entries); - if (!reg.area_ptr) + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) return -EFAULT; ifq = io_zcrx_ifq_alloc(ctx); @@ -110,6 +188,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + ifq->rq_entries = reg.rq_entries; ifq->if_rxq = reg.if_rxq; @@ -122,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - + if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } ctx->ifq = ifq; return 0; err: diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 58e4ab6c6083..53fd94b65b38 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,9 +3,25 @@ #define IOU_ZC_RX_H #include +#include + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + + u16 area_id; + struct page **pages; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; +}; struct io_zcrx_ifq { struct io_ring_ctx *ctx; + struct io_zcrx_area *area; + struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; u32 rq_entries; -- cgit v1.2.3 From 11ed914bbf948c4a37248f2876973ac18014056d Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 14 Feb 2025 16:09:41 -0800 Subject: io_uring/zcrx: add io_recvzc request Add io_uring opcode OP_RECV_ZC for doing zero copy reads out of a socket. Only the connection should be land on the specific rx queue set up for zero copy, and the socket must be handled by the io_uring instance that the rx queue was registered for zero copy with. That's because neither net_iovs / buffers from our queue can be read by outside applications, nor zero copy is possible if traffic for the zero copy connection goes to another queue. This coordination is outside of the scope of this patch series. Also, any traffic directed to the zero copy enabled queue is immediately visible to the application, which is why CAP_NET_ADMIN is required at the registration step. Of course, no data is actually read out of the socket, it has already been copied by the netdev into userspace memory via DMA. OP_RECV_ZC reads skbs out of the socket and checks that its frags are indeed net_iovs that belong to io_uring. A cqe is queued for each one of these frags. Recall that each cqe is a big cqe, with the top half being an io_uring_zcrx_cqe. The cqe res field contains the len or error. The lower IORING_ZCRX_AREA_SHIFT bits of the struct io_uring_zcrx_cqe::off field contain the offset relative to the start of the zero copy area. The upper part of the off field is trivially zero, and will be used to carry the area id. For now, there is no limit as to how much work each OP_RECV_ZC request does. It will attempt to drain a socket of all available data. This request always operates in multishot mode. Reviewed-by: Jens Axboe Signed-off-by: David Wei Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20250215000947.789731-7-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 + io_uring/io_uring.h | 10 +++ io_uring/net.c | 72 ++++++++++++++++ io_uring/opdef.c | 16 ++++ io_uring/zcrx.c | 190 +++++++++++++++++++++++++++++++++++++++++- io_uring/zcrx.h | 13 +++ 6 files changed, 302 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 44844707d327..05d6255b0f6a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -87,6 +87,7 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + __u32 zcrx_ifq_idx; __u32 optlen; struct { __u16 addr_len; @@ -278,6 +279,7 @@ enum io_uring_op { IORING_OP_FTRUNCATE, IORING_OP_BIND, IORING_OP_LISTEN, + IORING_OP_RECV_ZC, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6c46d9cdd7aa..650f81dac5d0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -185,6 +185,16 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret return io_get_cqe_overflow(ctx, ret, false); } +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->cq_extra++; + ctx->submit_state.cq_flush = true; + return io_get_cqe(ctx, cqe_ret); +} + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { diff --git a/io_uring/net.c b/io_uring/net.c index 10344b3a6d89..260eb73a5854 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -16,6 +16,7 @@ #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -89,6 +90,13 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + unsigned msg_flags; + u16 flags; + struct io_zcrx_ifq *ifq; +}; + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -1227,6 +1235,70 @@ out_free: return ret; } +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || + sqe->len || sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + if (ifq_idx != 0) + return -EINVAL; + zc->ifq = req->ctx->ifq; + if (!zc->ifq) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + zc->msg_flags = READ_ONCE(sqe->msg_flags); + if (zc->msg_flags) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data completions are posted as aux CQEs. */ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, + issue_flags); + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; +} + void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e8baef4e5146..89f50ecadeaf 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -37,6 +37,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -514,6 +515,18 @@ const struct io_issue_def io_issue_defs[] = { .async_size = sizeof(struct io_async_msghdr), #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, #endif }, }; @@ -745,6 +758,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 9d14fdf7a568..8833879d94ba 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -13,6 +13,8 @@ #include #include +#include +#include #include @@ -91,7 +93,12 @@ static void io_zcrx_sync_for_device(const struct page_pool *pool, #define IO_RQ_MAX_ENTRIES 32768 -__maybe_unused +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; +}; + static const struct memory_provider_ops io_uring_pp_zc_ops; static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) @@ -118,6 +125,11 @@ static bool io_zcrx_put_niov_uref(struct net_iov *niov) return true; } +static void io_zcrx_get_niov_uref(struct net_iov *niov) +{ + atomic_inc(io_get_user_counter(niov)); +} + static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) @@ -614,3 +626,179 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .nl_fill = io_pp_nl_fill, .uninstall = io_pp_uninstall, }; + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + + if (unlikely(!skb_frag_is_net_iov(frag))) + return -EOPNOTSUPP; + + niov = netmem_to_net_iov(frag->netmem); + if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + niov->pp->mp_priv != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) + return -ENOSPC; + + /* + * Prevent it from being recycled while user is accessing it. + * It has to be done before grabbing a user reference. + */ + page_pool_ref_netmem(net_iov_to_netmem(niov)); + io_zcrx_get_niov_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off; + int i, copy, end, off; + int ret = 0; + + start = skb_headlen(skb); + start_off = offset; + + if (offset < start) + return -EOPNOTSUPP; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned issue_flags) +{ + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = 1, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. */ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 1b6363591f72..a16bdd921f03 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,6 +3,7 @@ #define IOU_ZC_RX_H #include +#include #include #include @@ -43,6 +44,9 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -55,6 +59,15 @@ static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { } +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags) +{ + return -EOPNOTSUPP; +} #endif +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + #endif -- cgit v1.2.3