From fdae0ab67d57d480dc61e9fb45678bbdc3786711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 12:19:42 +0000 Subject: net: use NUMA drop counters for softnet_data.dropped Hosts under DOS attack can suffer from false sharing in enqueue_to_backlog() : atomic_inc(&sd->dropped). This is because sd->dropped can be touched from many cpus, possibly residing on different NUMA nodes. Generalize the sk_drop_counters infrastucture added in commit c51613fa276f ("net: add sk->sk_drop_counters") and use it to replace softnet_data.dropped with NUMA friendly softnet_data.drop_counters. This adds 64 bytes per cpu, maybe more in the future if we increase the number of counters (currently 2) per 'struct numa_drop_counters'. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250909121942.1202585-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3a3b761abfb..f5a840c07cf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3459,6 +3459,32 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } +struct numa_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + +static inline int numa_drop_read(const struct numa_drop_counters *ndc) +{ + return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); +} + +static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) +{ + int n = numa_node_id() % 2; + + if (n) + atomic_add(val, &ndc->drops1); + else + atomic_add(val, &ndc->drops0); +} + +static inline void numa_drop_reset(struct numa_drop_counters *ndc) +{ + atomic_set(&ndc->drops0, 0); + atomic_set(&ndc->drops1, 0); +} + /* * Incoming packets are placed on per-CPU queues */ @@ -3504,7 +3530,7 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; - atomic_t dropped ____cacheline_aligned_in_smp; + struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include/linux/netdevice.h') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. */ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. + */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + +void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. + */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From 3271f19bf7b9df665549666d789b9f126b4420c7 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:06 +0200 Subject: net: gso: restore ids of outer ip headers correctly Currently, NETIF_F_TSO_MANGLEID indicates that the inner-most ID can be mangled. Outer IDs can always be mangled. Make GSO preserve outer IDs by default, with NETIF_F_TSO_MANGLEID allowing both inner and outer IDs to be mangled. This commit also modifies a few drivers that use SKB_GSO_FIXEDID directly. Signed-off-by: Richard Gobert Reviewed-by: Edward Cree # for sfc Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-4-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/segmentation-offloads.rst | 22 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++++++-- drivers/net/ethernet/sfc/ef100_tx.c | 17 +++++++++++++---- include/linux/netdevice.h | 9 +++++++-- include/linux/skbuff.h | 8 +++++++- net/core/dev.c | 10 ++++++++-- net/ipv4/af_inet.c | 13 ++++++------- net/ipv4/tcp_offload.c | 6 ++---- 8 files changed, 63 insertions(+), 30 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst index 085e8fab03fd..72f69b22b28c 100644 --- a/Documentation/networking/segmentation-offloads.rst +++ b/Documentation/networking/segmentation-offloads.rst @@ -43,10 +43,19 @@ also point to the TCP header of the packet. For IPv4 segmentation we support one of two types in terms of the IP ID. The default behavior is to increment the IP ID with every segment. If the GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP -ID and all segments will use the same IP ID. If a device has -NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO -and we will either increment the IP ID for all frames, or leave it at a -static value based on driver preference. +ID and all segments will use the same IP ID. + +For encapsulated packets, SKB_GSO_TCP_FIXEDID refers only to the outer header. +SKB_GSO_TCP_FIXEDID_INNER can be used to specify the same for the inner header. +Any combination of these two GSO types is allowed. + +If a device has NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when +performing TSO and we will either increment the IP ID for all frames, or leave +it at a static value based on driver preference. For encapsulated packets, +NETIF_F_TSO_MANGLEID is relevant for both outer and inner headers, unless the +DF bit is not set on the outer header, in which case the device driver must +guarantee that the IP ID field is incremented in the outer header with every +segment. UDP Fragmentation Offload @@ -124,10 +133,7 @@ Generic Receive Offload Generic receive offload is the complement to GSO. Ideally any frame assembled by GRO should be segmented to create an identical sequence of frames using GSO, and any sequence of frames segmented by GSO should be -able to be reassembled back to the original by GRO. The only exception to -this is IPv4 ID in the case that the DF bit is set for a given IP header. -If the value of the IPv4 ID is not sequentially incrementing it will be -altered so that it is when a frame assembled via GRO is segmented via GSO. +able to be reassembled back to the original by GRO. Partial Generic Segmentation Offload diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4ed43ee9aa35..263d5628ee44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1290,8 +1290,12 @@ static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr * tcp->check = ~tcp_v4_check(skb->len - tcp_off, ipv4->saddr, ipv4->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) { + bool encap = rq->hw_gro_data->fk.control.flags & FLOW_DIS_ENCAPSULATION; + + skb_shinfo(skb)->gso_type |= encap ? SKB_GSO_TCP_FIXEDID_INNER : + SKB_GSO_TCP_FIXEDID; + } skb->csum_start = (unsigned char *)tcp - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index e6b6be549581..03005757c060 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -189,6 +189,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, { bool gso_partial = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; unsigned int len, ip_offset, tcp_offset, payload_segs; + u32 mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; u32 mangleid = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; unsigned int outer_ip_offset, outer_l4_offset; u16 vlan_tci = skb_vlan_tag_get(skb); @@ -200,8 +201,17 @@ static void ef100_make_tso_desc(struct efx_nic *efx, bool outer_csum; u32 paylen; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) - mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (encap) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID_INNER) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } else { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } + if (efx->net_dev->features & NETIF_F_HW_VLAN_CTAG_TX) vlan_enable = skb_vlan_tag_present(skb); @@ -245,8 +255,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, ESF_GZ_TX_TSO_OUTER_L4_OFF_W, outer_l4_offset >> 1, ESF_GZ_TX_TSO_ED_OUTER_UDP_LEN, udp_encap && !gso_partial, ESF_GZ_TX_TSO_ED_OUTER_IP_LEN, encap && !gso_partial, - ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, encap ? mangleid : - ESE_GZ_TX_DESC_IP4_ID_NO_OP, + ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, mangleid_outer, ESF_GZ_TX_TSO_VLAN_INSERT_EN, vlan_enable, ESF_GZ_TX_TSO_VLAN_INSERT_TCI, vlan_tci ); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c54d44805fa..1b85454116f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5320,13 +5320,18 @@ void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature; + + if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) + gso_type |= __SKB_GSO_TCP_FIXEDID; + + feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 78ecfa7d00d0..fb3fec9affaa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -674,7 +674,7 @@ enum { /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, - SKB_GSO_TCP_FIXEDID = 1 << 3, + __SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, @@ -707,6 +707,12 @@ enum { SKB_GSO_FRAGLIST = 1 << 18, SKB_GSO_TCP_ACCECN = 1 << 19, + + /* These indirectly map onto the same netdev feature. + * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs. + */ + SKB_GSO_TCP_FIXEDID = 1 << 30, + SKB_GSO_TCP_FIXEDID_INNER = 1 << 31, }; #if BITS_PER_LONG > 32 diff --git a/net/core/dev.c b/net/core/dev.c index fc4993526ead..8b54fdf0289a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e298dacb4a06..804c51296c55 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,14 +1395,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + /* fixed ID is invalid if DF bit is not set */ + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto out; - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 1949eede9ec9..2cb93da93abc 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,7 +471,6 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); - bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -485,10 +484,9 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; - + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (is_fixedid * SKB_GSO_TCP_FIXEDID); + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From 9c94ae6bb0b2895024b6e29fcc1cbec968b4776a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:32 +0000 Subject: net: make softnet_data.defer_count an atomic This is preparation work to remove the softnet_data.defer_lock, as it is contended on hosts with large number of cores. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b85454116f6..27e3fa69253f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3538,7 +3538,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; - int defer_count; + atomic_t defer_count; int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; diff --git a/net/core/dev.c b/net/core/dev.c index 8b54fdf0289a..8566678d8344 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6726,7 +6726,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; - sd->defer_count = 0; + atomic_set(&sd->defer_count, 0); spin_unlock(&sd->defer_lock); while (skb != NULL) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 618afd59afff..16cd357d62a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7202,14 +7202,12 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + if (atomic_read(&sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ -- cgit v1.2.3 From 844c9db7f7f5fe1b0b53ed9f1c2bc7313b3021c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:33 +0000 Subject: net: use llist for sd->defer_list Get rid of sd->defer_lock and adopt llist operations. We optimize skb_attempt_defer_free() for the common case, where the packet is queued. Otherwise sd->defer_count is increasing, until skb_defer_free_flush() clears it. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 8 ++++---- net/core/dev.c | 18 ++++++------------ net/core/skbuff.c | 15 +++++++-------- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27e3fa69253f..5c9aa16933d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3537,10 +3537,10 @@ struct softnet_data { struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ - spinlock_t defer_lock ____cacheline_aligned_in_smp; - atomic_t defer_count; - int defer_ipi_scheduled; - struct sk_buff *defer_list; + struct llist_head defer_list ____cacheline_aligned_in_smp; + atomic_long_t defer_count; + + int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/net/core/dev.c b/net/core/dev.c index 8566678d8344..fb67372774de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6717,22 +6717,16 @@ EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { + struct llist_node *free_list; struct sk_buff *skb, *next; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) + if (llist_empty(&sd->defer_list)) return; + atomic_long_set(&sd->defer_count, 0); + free_list = llist_del_all(&sd->defer_list); - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - atomic_set(&sd->defer_count, 0); - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { napi_consume_skb(skb, 1); - skb = next; } } @@ -12995,7 +12989,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); + init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16cd357d62a6..17455fc1e692 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,6 +7185,7 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + unsigned long defer_count; int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; @@ -7202,17 +7203,15 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (atomic_read(&sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sd->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every time queue reaches half capacity. */ - kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); + llist_add(&skb->ll_node, &sd->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). -- cgit v1.2.3 From 5628f3fe3b16114e8424bbfcf0594caef8958a06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:34 +0000 Subject: net: add NUMA awareness to skb_attempt_defer_free() Instead of sharing sd->defer_list & sd->defer_count with many cpus, add one pair for each NUMA node. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ---- include/net/hotdata.h | 7 +++++++ net/core/dev.c | 35 +++++++++++++++++++++++------------ net/core/dev.h | 2 +- net/core/skbuff.c | 11 ++++++----- 5 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c9aa16933d1..d1a687444b27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3536,10 +3536,6 @@ struct softnet_data { struct numa_drop_counters drop_counters; - /* Another possibly contended cache line */ - struct llist_head defer_list ____cacheline_aligned_in_smp; - atomic_long_t defer_count; - int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index fda94b2647ff..4acec191c54a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,10 +2,16 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include #include #include #include +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + /* Read mostly data used in network fast paths. */ struct net_hotdata { #if IS_ENABLED(CONFIG_INET) @@ -30,6 +36,7 @@ struct net_hotdata { struct rps_sock_flow_table __rcu *rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index fb67372774de..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -6715,18 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - if (llist_empty(&sd->defer_list)) - return; - atomic_long_set(&sd->defer_count, 0); - free_list = llist_del_all(&sd->defer_list); + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; + + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - llist_for_each_entry_safe(skb, next, free_list, ll_node) { - napi_consume_skb(skb, 1); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6854,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7713,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7755,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12989,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12999,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17455fc1e692..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,9 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7201,14 +7201,15 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - defer_count = atomic_long_inc_return(&sd->defer_count); + defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; - llist_add(&skb->ll_node, &sd->defer_list); + llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. */ kick = (defer_count - 1) == (defer_max >> 1); @@ -7217,7 +7218,7 @@ nodefer: kfree_skb_napi_cache(skb); * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, -- cgit v1.2.3