From 3882dccf48f9fbe787b5df3187d708ef348ac860 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 8 Aug 2024 16:05:57 +0100 Subject: bpf/bpf_get,set_sockopt: add option to set TCP-BPF sock ops flags Currently the only opportunity to set sock ops flags dictating which callbacks fire for a socket is from within a TCP-BPF sockops program. This is problematic if the connection is already set up as there is no further chance to specify callbacks for that socket. Add TCP_BPF_SOCK_OPS_CB_FLAGS to bpf_setsockopt() and bpf_getsockopt() to allow users to specify callbacks later, either via an iterator over sockets or via a socket-specific program triggered by a setsockopt() on the socket. Previous discussion on this here [1]. [1] https://lore.kernel.org/bpf/f42f157b-6e52-dd4d-3d97-9b86c84c0b00@oracle.com/ Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20240808150558.1035626-2-alan.maguire@oracle.com Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..e05b39e39c3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2851,7 +2851,7 @@ union bpf_attr { * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, - * **TCP_BPF_RTO_MIN**. + * **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports the following *optname*\ s: * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. @@ -7080,6 +7080,7 @@ enum { TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ + TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ }; enum { -- cgit v1.2.3 From 65ab5ac4df012388481d0414fcac1d5ac1721fb3 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Fri, 23 Aug 2024 12:51:00 -0700 Subject: bpf: Add bpf_copy_from_user_str kfunc This adds a kfunc wrapper around strncpy_from_user, which can be called from sleepable BPF programs. This matches the non-sleepable 'bpf_probe_read_user_str' helper except it includes an additional 'flags' param, which allows consumers to clear the entire destination buffer on success or failure. Signed-off-by: Jordan Rome Link: https://lore.kernel.org/r/20240823195101.3621028-1-linux@jordanrome.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/helpers.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 60 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2629457d72d..c3a5728db115 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7513,4 +7513,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) + */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f12f075b70c5..7e7761c537f8 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2962,6 +2962,47 @@ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) bpf_mem_free(&bpf_global_ma, kit->bits); } +/** + * bpf_copy_from_user_str() - Copy a string from an unsafe user address + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address, in user space. + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL-terminated string from userspace to BPF space. If user string is + * too long this will still ensure zero termination in the dst buffer unless + * buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and + * memset all of @dst on failure. + */ +__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(!dst__sz)) + return 0; + + ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst, 0, dst__sz); + + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset((char *)dst + ret, 0, dst__sz - ret); + else + ((char *)dst)[ret] = '\0'; + + return ret + 1; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3047,6 +3088,7 @@ BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..f329ee44627a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7512,4 +7512,13 @@ struct bpf_iter_num { __u64 __opaque[1]; } __attribute__((aligned(8))); +/* + * Flags to control BPF kfunc behaviour. + * - BPF_F_PAD_ZEROS: Pad destination buffer with zeros. (See the respective + * helper documentation for details.) + */ +enum bpf_kfunc_flags { + BPF_F_PAD_ZEROS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 3efd7ab46d0aebc3e567a9846e79a98dbad3291c Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:46 +0000 Subject: net: netdev netlink api to bind dma-buf to a net device API takes the dma-buf fd as input, and binds it to the netdevice. The user can specify the rx queues to bind the dma-buf to. Suggested-by: Stanislav Fomichev Signed-off-by: Mina Almasry Reviewed-by: Donald Hunter Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20240910171458.219195-3-almasrymina@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 47 +++++++++++++++++++++++++++++++++ include/uapi/linux/netdev.h | 11 ++++++++ net/core/netdev-genl-gen.c | 19 +++++++++++++ net/core/netdev-genl-gen.h | 2 ++ net/core/netdev-genl.c | 6 +++++ tools/include/uapi/linux/netdev.h | 11 ++++++++ 6 files changed, 96 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 959755be4d7f..4930e8142aa6 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -457,6 +457,39 @@ attribute-sets: Number of times driver re-started accepting send requests to this queue from the stack. type: uint + - + name: queue-id + subset-of: queue + attributes: + - + name: id + - + name: type + - + name: dmabuf + attributes: + - + name: ifindex + doc: netdev ifindex to bind the dmabuf to. + type: u32 + checks: + min: 1 + - + name: queues + doc: receive queues to bind the dmabuf to. + type: nest + nested-attributes: queue-id + multi-attr: true + - + name: fd + doc: dmabuf file descriptor to bind. + type: u32 + - + name: id + doc: id of the dmabuf binding + type: u32 + checks: + min: 1 operations: list: @@ -619,6 +652,20 @@ operations: - rx-bytes - tx-packets - tx-bytes + - + name: bind-rx + doc: Bind dmabuf to netdev + attribute-set: dmabuf + flags: [ admin-perm ] + do: + request: + attributes: + - ifindex + - fd + - queues + reply: + attributes: + - id mcast-groups: list: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 43742ac5b00d..91bf3ecc5f1d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -173,6 +173,16 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_DMABUF_IFINDEX = 1, + NETDEV_A_DMABUF_QUEUES, + NETDEV_A_DMABUF_FD, + NETDEV_A_DMABUF_ID, + + __NETDEV_A_DMABUF_MAX, + NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -186,6 +196,7 @@ enum { NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, + NETDEV_CMD_BIND_RX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8350a0afa9ec..6b7fe6035067 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -27,6 +27,11 @@ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFIND [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), }; +const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), @@ -74,6 +79,13 @@ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), }; +/* NETDEV_CMD_BIND_RX - do */ +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -151,6 +163,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_QSTATS_SCOPE, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_BIND_RX, + .doit = netdev_nl_bind_rx_doit, + .policy = netdev_bind_rx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 4db40fd5b4a9..67c34005750c 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; +extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); @@ -30,6 +31,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a17d7eaeb001..699c34b9b03c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -723,6 +723,12 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, return err; } +/* Stub */ +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 43742ac5b00d..91bf3ecc5f1d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -173,6 +173,16 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_DMABUF_IFINDEX = 1, + NETDEV_A_DMABUF_QUEUES, + NETDEV_A_DMABUF_FD, + NETDEV_A_DMABUF_ID, + + __NETDEV_A_DMABUF_MAX, + NETDEV_A_DMABUF_MAX = (__NETDEV_A_DMABUF_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -186,6 +196,7 @@ enum { NETDEV_CMD_QUEUE_GET, NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, + NETDEV_CMD_BIND_RX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From d0caf9876a1c9f844307effb598ad1312d9e0025 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:57 +0000 Subject: netdev: add dmabuf introspection Add dmabuf information to page_pool stats: $ ./cli.py --spec ../netlink/specs/netdev.yaml --dump page-pool-get ... {'dmabuf': 10, 'id': 456, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 455, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 454, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 453, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 452, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 451, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 450, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, {'dmabuf': 10, 'id': 449, 'ifindex': 3, 'inflight': 1023, 'inflight-mem': 4190208}, And queue stats: $ ./cli.py --spec ../netlink/specs/netdev.yaml --dump queue-get ... {'dmabuf': 10, 'id': 8, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 9, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 10, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 11, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 12, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 13, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 14, 'ifindex': 3, 'type': 'rx'}, {'dmabuf': 10, 'id': 15, 'ifindex': 3, 'type': 'rx'}, Suggested-by: Jakub Kicinski Signed-off-by: Mina Almasry Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20240910171458.219195-14-almasrymina@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 10 ++++++++++ include/uapi/linux/netdev.h | 2 ++ net/core/netdev-genl.c | 7 +++++++ net/core/page_pool_user.c | 5 +++++ tools/include/uapi/linux/netdev.h | 2 ++ 5 files changed, 26 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 0c747530c275..08412c279297 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -167,6 +167,10 @@ attribute-sets: "re-attached", they are just waiting to disappear. Attribute is absent if Page Pool has not been detached, and can still be used to allocate new memory. + - + name: dmabuf + doc: ID of the dmabuf this page-pool is attached to. + type: u32 - name: page-pool-info subset-of: page-pool @@ -268,6 +272,10 @@ attribute-sets: name: napi-id doc: ID of the NAPI instance which services this queue. type: u32 + - + name: dmabuf + doc: ID of the dmabuf attached to this queue, if any. + type: u32 - name: qstats @@ -543,6 +551,7 @@ operations: - inflight - inflight-mem - detach-time + - dmabuf dump: reply: *pp-reply config-cond: page-pool @@ -607,6 +616,7 @@ operations: - type - napi-id - ifindex + - dmabuf dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 91bf3ecc5f1d..7c308f04e7a0 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -93,6 +93,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, + NETDEV_A_PAGE_POOL_DMABUF, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -131,6 +132,7 @@ enum { NETDEV_A_QUEUE_IFINDEX, NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, + NETDEV_A_QUEUE_DMABUF, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9153a8ab0cf8..1cb954f2d39e 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -295,6 +295,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { + struct net_devmem_dmabuf_binding *binding; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -314,6 +315,12 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; + + binding = rxq->mp_params.mp_priv; + if (binding && + nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + goto nla_put_failure; + break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index cd6267ba6fa3..48335766c1bf 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -9,6 +9,7 @@ #include #include +#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -213,6 +214,7 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; void *hdr; @@ -242,6 +244,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; + if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + goto err_cancel; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 91bf3ecc5f1d..7c308f04e7a0 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -93,6 +93,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, + NETDEV_A_PAGE_POOL_DMABUF, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -131,6 +132,7 @@ enum { NETDEV_A_QUEUE_IFINDEX, NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, + NETDEV_A_QUEUE_DMABUF, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) -- cgit v1.2.3 From dc1e764b398e0548b1bb12fb234ed0b673cd60fb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2024 13:08:39 -0300 Subject: tools headers UAPI: Sync the linux/in.h with the kernel sources Picking the changes from: 70d0bb45fae87a3b ("net: Correct spelling in headers") Just a comment fix, addressing this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h Please see tools/include/uapi/README for details. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Simon Horman Link: https://lore.kernel.org/lkml/ZvrNlLdtXAZ1sIIj@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/in.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index d358add1611c..5d32d53508d9 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -141,7 +141,7 @@ struct in_addr { */ #define IP_PMTUDISC_INTERFACE 4 /* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get - * fragmented if they exeed the interface mtu + * fragmented if they exceed the interface mtu */ #define IP_PMTUDISC_OMIT 5 -- cgit v1.2.3 From 3ed6be68913b2d56a35d30c67f83ba3d2f1998fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Oct 2024 21:41:05 +0200 Subject: bpf: Sync uapi bpf.h header to tools directory There is a delta between kernel UAPI bpf.h and tools UAPI bpf.h, thus sync them again. Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1fb3cb2636e6..e8241b320c6d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5519,11 +5519,12 @@ union bpf_attr { * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. * - * void *bpf_kptr_xchg(void *map_value, void *ptr) + * void *bpf_kptr_xchg(void *dst, void *ptr) * Description - * Exchange kptr at pointer *map_value* with *ptr*, and return the - * old value. *ptr* can be NULL, otherwise it must be a referenced - * pointer which will be released when this helper is called. + * Exchange kptr at pointer *dst* with *ptr*, and return the old value. + * *dst* can be map value or local kptr. *ptr* can be NULL, otherwise + * it must be a referenced pointer which will be released when this helper + * is called. * Return * The old value of kptr (which can be NULL). The returned pointer * if not NULL, is a reference which must be released using its @@ -6046,11 +6047,6 @@ enum { BPF_F_MARK_ENFORCE = (1ULL << 6), }; -/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -enum { - BPF_F_INGRESS = (1ULL << 0), -}; - /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ enum { BPF_F_TUNINFO_IPV6 = (1ULL << 0), @@ -6197,10 +6193,12 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; -/* Flags for bpf_redirect_map helper */ +/* Flags for bpf_redirect and bpf_redirect_map helpers */ enum { - BPF_F_BROADCAST = (1ULL << 3), - BPF_F_EXCLUDE_INGRESS = (1ULL << 4), + BPF_F_INGRESS = (1ULL << 0), /* used for skb path */ + BPF_F_BROADCAST = (1ULL << 3), /* used for XDP path */ + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), /* used for XDP path */ +#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) }; #define __bpf_md_ptr(type, name) \ -- cgit v1.2.3 From 4f647a780f3606acbd2116248d51eadb4d865615 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 16 Sep 2024 02:17:10 -0700 Subject: bpf: __bpf_fastcall for bpf_get_smp_processor_id in uapi Since [1] kernel supports __bpf_fastcall attribute for helper function bpf_get_smp_processor_id(). Update uapi definition for this helper in order to have this attribute in the generated bpf_helper_defs.h [1] commit 91b7fbf3936f ("bpf, x86, riscv, arm: no_caller_saved_registers for bpf_get_smp_processor_id()") Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240916091712.2929279-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ tools/include/uapi/linux/bpf.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c6cd7c7aeeee..8ab4d8184b9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1970,6 +1970,8 @@ union bpf_attr { * program. * Return * The SMP id of the processor running the program. + * Attributes + * __bpf_fastcall * * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1fb3cb2636e6..7610883c8191 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1970,6 +1970,8 @@ union bpf_attr { * program. * Return * The SMP id of the processor running the program. + * Attributes + * __bpf_fastcall * * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description -- cgit v1.2.3 From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 4 ++++ Documentation/networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 4 ++++ tools/include/uapi/linux/if_link.h | 1 + 6 files changed, 15 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 0c4d5d40cae9..d7131a1afadf 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1137,6 +1137,10 @@ attribute-sets: name: dpll-pin type: nest nested-attributes: link-dpll-pin-attrs + - + name: max-pacing-offload-horizon + type: uint + doc: EDT offload horizon supported by the device (in nsec). - name: af-spec-attrs attributes: diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a..49f03cb78c6e 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -183,3 +183,4 @@ struct_devlink_port* devlink_port struct_dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder +u64 max_pacing_offload_horizon diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..506ba9c80e83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0a520987085..682d8d3127db 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } @@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || @@ -1975,6 +1978,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index f0d71b2a3f1e..96ec2b01e725 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; -- cgit v1.2.3 From 107525833bcedb9d7c2c6a21abb8b9747410f364 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Oct 2024 12:13:34 +0200 Subject: tools: Sync if_link.h uapi tooling header Sync if_link uapi header to the latest version as we need the refresher in tooling for netkit device. Given it's been a while since the last sync and the diff is fairly big, it has been done as its own commit. Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20241004101335.117711-4-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/if_link.h | 553 ++++++++++++++++++++++++++++++++++++- 1 file changed, 552 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index f0d71b2a3f1e..2acc7687e017 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -461,6 +461,286 @@ enum in6_addr_gen_mode { /* Bridge section */ +/** + * DOC: Bridge enum definition + * + * Please *note* that the timer values in the following section are expected + * in clock_t format, which is seconds multiplied by USER_HZ (generally + * defined as 100). + * + * @IFLA_BR_FORWARD_DELAY + * The bridge forwarding delay is the time spent in LISTENING state + * (before moving to LEARNING) and in LEARNING state (before moving + * to FORWARDING). Only relevant if STP is enabled. + * + * The valid values are between (2 * USER_HZ) and (30 * USER_HZ). + * The default value is (15 * USER_HZ). + * + * @IFLA_BR_HELLO_TIME + * The time between hello packets sent by the bridge, when it is a root + * bridge or a designated bridge. Only relevant if STP is enabled. + * + * The valid values are between (1 * USER_HZ) and (10 * USER_HZ). + * The default value is (2 * USER_HZ). + * + * @IFLA_BR_MAX_AGE + * The hello packet timeout is the time until another bridge in the + * spanning tree is assumed to be dead, after reception of its last hello + * message. Only relevant if STP is enabled. + * + * The valid values are between (6 * USER_HZ) and (40 * USER_HZ). + * The default value is (20 * USER_HZ). + * + * @IFLA_BR_AGEING_TIME + * Configure the bridge's FDB entries aging time. It is the time a MAC + * address will be kept in the FDB after a packet has been received from + * that address. After this time has passed, entries are cleaned up. + * Allow values outside the 802.1 standard specification for special cases: + * + * * 0 - entry never ages (all permanent) + * * 1 - entry disappears (no persistence) + * + * The default value is (300 * USER_HZ). + * + * @IFLA_BR_STP_STATE + * Turn spanning tree protocol on (*IFLA_BR_STP_STATE* > 0) or off + * (*IFLA_BR_STP_STATE* == 0) for this bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_PRIORITY + * Set this bridge's spanning tree priority, used during STP root bridge + * election. + * + * The valid values are between 0 and 65535. + * + * @IFLA_BR_VLAN_FILTERING + * Turn VLAN filtering on (*IFLA_BR_VLAN_FILTERING* > 0) or off + * (*IFLA_BR_VLAN_FILTERING* == 0). When disabled, the bridge will not + * consider the VLAN tag when handling packets. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_PROTOCOL + * Set the protocol used for VLAN filtering. + * + * The valid values are 0x8100(802.1Q) or 0x88A8(802.1AD). The default value + * is 0x8100(802.1Q). + * + * @IFLA_BR_GROUP_FWD_MASK + * The group forwarding mask. This is the bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses (of the form 01:80:C2:00:00:0X). + * + * The default value is 0, which means the bridge does not forward any + * link-local frames coming on this port. + * + * @IFLA_BR_ROOT_ID + * The bridge root id, read only. + * + * @IFLA_BR_BRIDGE_ID + * The bridge id, read only. + * + * @IFLA_BR_ROOT_PORT + * The bridge root port, read only. + * + * @IFLA_BR_ROOT_PATH_COST + * The bridge root path cost, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE + * The bridge topology change, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_DETECTED + * The bridge topology change detected, read only. + * + * @IFLA_BR_HELLO_TIMER + * The bridge hello timer, read only. + * + * @IFLA_BR_TCN_TIMER + * The bridge tcn timer, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_TIMER + * The bridge topology change timer, read only. + * + * @IFLA_BR_GC_TIMER + * The bridge gc timer, read only. + * + * @IFLA_BR_GROUP_ADDR + * Set the MAC address of the multicast group this bridge uses for STP. + * The address must be a link-local address in standard Ethernet MAC address + * format. It is an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f]. + * + * The default value is 0. + * + * @IFLA_BR_FDB_FLUSH + * Flush bridge's fdb dynamic entries. + * + * @IFLA_BR_MCAST_ROUTER + * Set bridge's multicast router if IGMP snooping is enabled. + * The valid values are: + * + * * 0 - disabled. + * * 1 - automatic (queried). + * * 2 - permanently enabled. + * + * The default value is 1. + * + * @IFLA_BR_MCAST_SNOOPING + * Turn multicast snooping on (*IFLA_BR_MCAST_SNOOPING* > 0) or off + * (*IFLA_BR_MCAST_SNOOPING* == 0). + * + * The default value is 1. + * + * @IFLA_BR_MCAST_QUERY_USE_IFADDR + * If enabled use the bridge's own IP address as source address for IGMP + * queries (*IFLA_BR_MCAST_QUERY_USE_IFADDR* > 0) or the default of 0.0.0.0 + * (*IFLA_BR_MCAST_QUERY_USE_IFADDR* == 0). + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_QUERIER + * Enable (*IFLA_BR_MULTICAST_QUERIER* > 0) or disable + * (*IFLA_BR_MULTICAST_QUERIER* == 0) IGMP querier, ie sending of multicast + * queries by the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_HASH_ELASTICITY + * Set multicast database hash elasticity, It is the maximum chain length in + * the multicast hash table. This attribute is *deprecated* and the value + * is always 16. + * + * @IFLA_BR_MCAST_HASH_MAX + * Set maximum size of the multicast hash table + * + * The default value is 4096, the value must be a power of 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_CNT + * The Last Member Query Count is the number of Group-Specific Queries + * sent before the router assumes there are no local members. The Last + * Member Query Count is also the number of Group-and-Source-Specific + * Queries sent before the router assumes there are no listeners for a + * particular source. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_STARTUP_QUERY_CNT + * The Startup Query Count is the number of Queries sent out on startup, + * separated by the Startup Query Interval. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_INTVL + * The Last Member Query Interval is the Max Response Time inserted into + * Group-Specific Queries sent in response to Leave Group messages, and + * is also the amount of time between Group-Specific Query messages. + * + * The default value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_MEMBERSHIP_INTVL + * The interval after which the bridge will leave a group, if no membership + * reports for this group are received. + * + * The default value is (260 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERIER_INTVL + * The interval between queries sent by other routers. if no queries are + * seen after this delay has passed, the bridge will start to send its own + * queries (as if *IFLA_BR_MCAST_QUERIER_INTVL* was enabled). + * + * The default value is (255 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_INTVL + * The Query Interval is the interval between General Queries sent by + * the Querier. + * + * The default value is (125 * USER_HZ). The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_RESPONSE_INTVL + * The Max Response Time used to calculate the Max Resp Code inserted + * into the periodic General Queries. + * + * The default value is (10 * USER_HZ). + * + * @IFLA_BR_MCAST_STARTUP_QUERY_INTVL + * The interval between queries in the startup phase. + * + * The default value is (125 * USER_HZ) / 4. The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_NF_CALL_IPTABLES + * Enable (*NF_CALL_IPTABLES* > 0) or disable (*NF_CALL_IPTABLES* == 0) + * iptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_IP6TABLES + * Enable (*NF_CALL_IP6TABLES* > 0) or disable (*NF_CALL_IP6TABLES* == 0) + * ip6tables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_ARPTABLES + * Enable (*NF_CALL_ARPTABLES* > 0) or disable (*NF_CALL_ARPTABLES* == 0) + * arptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_DEFAULT_PVID + * VLAN ID applied to untagged and priority-tagged incoming packets. + * + * The default value is 1. Setting to the special value 0 makes all ports of + * this bridge not have a PVID by default, which means that they will + * not accept VLAN-untagged traffic. + * + * @IFLA_BR_PAD + * Bridge attribute padding type for netlink message. + * + * @IFLA_BR_VLAN_STATS_ENABLED + * Enable (*IFLA_BR_VLAN_STATS_ENABLED* == 1) or disable + * (*IFLA_BR_VLAN_STATS_ENABLED* == 0) per-VLAN stats accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_STATS_ENABLED + * Enable (*IFLA_BR_MCAST_STATS_ENABLED* > 0) or disable + * (*IFLA_BR_MCAST_STATS_ENABLED* == 0) multicast (IGMP/MLD) stats + * accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_IGMP_VERSION + * Set the IGMP version. + * + * The valid values are 2 and 3. The default value is 2. + * + * @IFLA_BR_MCAST_MLD_VERSION + * Set the MLD version. + * + * The valid values are 1 and 2. The default value is 1. + * + * @IFLA_BR_VLAN_STATS_PER_PORT + * Enable (*IFLA_BR_VLAN_STATS_PER_PORT* == 1) or disable + * (*IFLA_BR_VLAN_STATS_PER_PORT* == 0) per-VLAN per-port stats accounting. + * Can be changed only when there are no port VLANs configured. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MULTI_BOOLOPT + * The multi_boolopt is used to control new boolean options to avoid adding + * new netlink attributes. You can look at ``enum br_boolopt_id`` for those + * options. + * + * @IFLA_BR_MCAST_QUERIER_STATE + * Bridge mcast querier states, read only. + * + * @IFLA_BR_FDB_N_LEARNED + * The number of dynamically learned FDB entries for the current bridge, + * read only. + * + * @IFLA_BR_FDB_MAX_LEARNED + * Set the number of max dynamically learned FDB entries for the current + * bridge. + */ enum { IFLA_BR_UNSPEC, IFLA_BR_FORWARD_DELAY, @@ -510,6 +790,8 @@ enum { IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, IFLA_BR_MCAST_QUERIER_STATE, + IFLA_BR_FDB_N_LEARNED, + IFLA_BR_FDB_MAX_LEARNED, __IFLA_BR_MAX, }; @@ -520,11 +802,252 @@ struct ifla_bridge_id { __u8 addr[6]; /* ETH_ALEN */ }; +/** + * DOC: Bridge mode enum definition + * + * @BRIDGE_MODE_HAIRPIN + * Controls whether traffic may be sent back out of the port on which it + * was received. This option is also called reflective relay mode, and is + * used to support basic VEPA (Virtual Ethernet Port Aggregator) + * capabilities. By default, this flag is turned off and the bridge will + * not forward traffic back out of the receiving port. + */ enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, }; +/** + * DOC: Bridge port enum definition + * + * @IFLA_BRPORT_STATE + * The operation state of the port. Here are the valid values. + * + * * 0 - port is in STP *DISABLED* state. Make this port completely + * inactive for STP. This is also called BPDU filter and could be used + * to disable STP on an untrusted port, like a leaf virtual device. + * The traffic forwarding is also stopped on this port. + * * 1 - port is in STP *LISTENING* state. Only valid if STP is enabled + * on the bridge. In this state the port listens for STP BPDUs and + * drops all other traffic frames. + * * 2 - port is in STP *LEARNING* state. Only valid if STP is enabled on + * the bridge. In this state the port will accept traffic only for the + * purpose of updating MAC address tables. + * * 3 - port is in STP *FORWARDING* state. Port is fully active. + * * 4 - port is in STP *BLOCKING* state. Only valid if STP is enabled on + * the bridge. This state is used during the STP election process. + * In this state, port will only process STP BPDUs. + * + * @IFLA_BRPORT_PRIORITY + * The STP port priority. The valid values are between 0 and 255. + * + * @IFLA_BRPORT_COST + * The STP path cost of the port. The valid values are between 1 and 65535. + * + * @IFLA_BRPORT_MODE + * Set the bridge port mode. See *BRIDGE_MODE_HAIRPIN* for more details. + * + * @IFLA_BRPORT_GUARD + * Controls whether STP BPDUs will be processed by the bridge port. By + * default, the flag is turned off to allow BPDU processing. Turning this + * flag on will disable the bridge port if a STP BPDU packet is received. + * + * If the bridge has Spanning Tree enabled, hostile devices on the network + * may send BPDU on a port and cause network failure. Setting *guard on* + * will detect and stop this by disabling the port. The port will be + * restarted if the link is brought down, or removed and reattached. + * + * @IFLA_BRPORT_PROTECT + * Controls whether a given port is allowed to become a root port or not. + * Only used when STP is enabled on the bridge. By default the flag is off. + * + * This feature is also called root port guard. If BPDU is received from a + * leaf (edge) port, it should not be elected as root port. This could + * be used if using STP on a bridge and the downstream bridges are not fully + * trusted; this prevents a hostile guest from rerouting traffic. + * + * @IFLA_BRPORT_FAST_LEAVE + * This flag allows the bridge to immediately stop multicast traffic + * forwarding on a port that receives an IGMP Leave message. It is only used + * when IGMP snooping is enabled on the bridge. By default the flag is off. + * + * @IFLA_BRPORT_LEARNING + * Controls whether a given port will learn *source* MAC addresses from + * received traffic or not. Also controls whether dynamic FDB entries + * (which can also be added by software) will be refreshed by incoming + * traffic. By default this flag is on. + * + * @IFLA_BRPORT_UNICAST_FLOOD + * Controls whether unicast traffic for which there is no FDB entry will + * be flooded towards this port. By default this flag is on. + * + * @IFLA_BRPORT_PROXYARP + * Enable proxy ARP on this port. + * + * @IFLA_BRPORT_LEARNING_SYNC + * Controls whether a given port will sync MAC addresses learned on device + * port to bridge FDB. + * + * @IFLA_BRPORT_PROXYARP_WIFI + * Enable proxy ARP on this port which meets extended requirements by + * IEEE 802.11 and Hotspot 2.0 specifications. + * + * @IFLA_BRPORT_ROOT_ID + * + * @IFLA_BRPORT_BRIDGE_ID + * + * @IFLA_BRPORT_DESIGNATED_PORT + * + * @IFLA_BRPORT_DESIGNATED_COST + * + * @IFLA_BRPORT_ID + * + * @IFLA_BRPORT_NO + * + * @IFLA_BRPORT_TOPOLOGY_CHANGE_ACK + * + * @IFLA_BRPORT_CONFIG_PENDING + * + * @IFLA_BRPORT_MESSAGE_AGE_TIMER + * + * @IFLA_BRPORT_FORWARD_DELAY_TIMER + * + * @IFLA_BRPORT_HOLD_TIMER + * + * @IFLA_BRPORT_FLUSH + * Flush bridge ports' fdb dynamic entries. + * + * @IFLA_BRPORT_MULTICAST_ROUTER + * Configure the port's multicast router presence. A port with + * a multicast router will receive all multicast traffic. + * The valid values are: + * + * * 0 disable multicast routers on this port + * * 1 let the system detect the presence of routers (default) + * * 2 permanently enable multicast traffic forwarding on this port + * * 3 enable multicast routers temporarily on this port, not depending + * on incoming queries. + * + * @IFLA_BRPORT_PAD + * + * @IFLA_BRPORT_MCAST_FLOOD + * Controls whether a given port will flood multicast traffic for which + * there is no MDB entry. By default this flag is on. + * + * @IFLA_BRPORT_MCAST_TO_UCAST + * Controls whether a given port will replicate packets using unicast + * instead of multicast. By default this flag is off. + * + * This is done by copying the packet per host and changing the multicast + * destination MAC to a unicast one accordingly. + * + * *mcast_to_unicast* works on top of the multicast snooping feature of the + * bridge. Which means unicast copies are only delivered to hosts which + * are interested in unicast and signaled this via IGMP/MLD reports previously. + * + * This feature is intended for interface types which have a more reliable + * and/or efficient way to deliver unicast packets than broadcast ones + * (e.g. WiFi). + * + * However, it should only be enabled on interfaces where no IGMPv2/MLDv1 + * report suppression takes place. IGMP/MLD report suppression issue is + * usually overcome by the network daemon (supplicant) enabling AP isolation + * and by that separating all STAs. + * + * Delivery of STA-to-STA IP multicast is made possible again by enabling + * and utilizing the bridge hairpin mode, which considers the incoming port + * as a potential outgoing port, too (see *BRIDGE_MODE_HAIRPIN* option). + * Hairpin mode is performed after multicast snooping, therefore leading + * to only deliver reports to STAs running a multicast router. + * + * @IFLA_BRPORT_VLAN_TUNNEL + * Controls whether vlan to tunnel mapping is enabled on the port. + * By default this flag is off. + * + * @IFLA_BRPORT_BCAST_FLOOD + * Controls flooding of broadcast traffic on the given port. By default + * this flag is on. + * + * @IFLA_BRPORT_GROUP_FWD_MASK + * Set the group forward mask. This is a bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses. The addresses of the form are 01:80:C2:00:00:0X (defaults + * to 0, which means the bridge does not forward any link-local frames + * coming on this port). + * + * @IFLA_BRPORT_NEIGH_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression + * is enabled on the port. By default this flag is off. + * + * @IFLA_BRPORT_ISOLATED + * Controls whether a given port will be isolated, which means it will be + * able to communicate with non-isolated ports only. By default this + * flag is off. + * + * @IFLA_BRPORT_BACKUP_PORT + * Set a backup port. If the port loses carrier all traffic will be + * redirected to the configured backup port. Set the value to 0 to disable + * it. + * + * @IFLA_BRPORT_MRP_RING_OPEN + * + * @IFLA_BRPORT_MRP_IN_OPEN + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + * The number of per-port EHT hosts limit. The default value is 512. + * Setting to 0 is not allowed. + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_CNT + * The current number of tracked hosts, read only. + * + * @IFLA_BRPORT_LOCKED + * Controls whether a port will be locked, meaning that hosts behind the + * port will not be able to communicate through the port unless an FDB + * entry with the unit's MAC address is in the FDB. The common use case is + * that hosts are allowed access through authentication with the IEEE 802.1X + * protocol or based on whitelists. By default this flag is off. + * + * Please note that secure 802.1X deployments should always use the + * *BR_BOOLOPT_NO_LL_LEARN* flag, to not permit the bridge to populate its + * FDB based on link-local (EAPOL) traffic received on the port. + * + * @IFLA_BRPORT_MAB + * Controls whether a port will use MAC Authentication Bypass (MAB), a + * technique through which select MAC addresses may be allowed on a locked + * port, without using 802.1X authentication. Packets with an unknown source + * MAC address generates a "locked" FDB entry on the incoming bridge port. + * The common use case is for user space to react to these bridge FDB + * notifications and optionally replace the locked FDB entry with a normal + * one, allowing traffic to pass for whitelisted MAC addresses. + * + * Setting this flag also requires *IFLA_BRPORT_LOCKED* and + * *IFLA_BRPORT_LEARNING*. *IFLA_BRPORT_LOCKED* ensures that unauthorized + * data packets are dropped, and *IFLA_BRPORT_LEARNING* allows the dynamic + * FDB entries installed by user space (as replacements for the locked FDB + * entries) to be refreshed and/or aged out. + * + * @IFLA_BRPORT_MCAST_N_GROUPS + * + * @IFLA_BRPORT_MCAST_MAX_GROUPS + * Sets the maximum number of MDB entries that can be registered for a + * given port. Attempts to register more MDB entries at the port than this + * limit allows will be rejected, whether they are done through netlink + * (e.g. the bridge tool), or IGMP or MLD membership reports. Setting a + * limit of 0 disables the limit. The default value is 0. + * + * @IFLA_BRPORT_NEIGH_VLAN_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression is + * enabled for a given port. By default this flag is off. + * + * Note that this option only takes effect when *IFLA_BRPORT_NEIGH_SUPPRESS* + * is enabled for a given port. + * + * @IFLA_BRPORT_BACKUP_NHID + * The FDB nexthop object ID to attach to packets being redirected to a + * backup port that has VLAN tunnel mapping enabled (via the + * *IFLA_BRPORT_VLAN_TUNNEL* option). Setting a value of 0 (default) has + * the effect of not attaching any ID. + */ enum { IFLA_BRPORT_UNSPEC, IFLA_BRPORT_STATE, /* Spanning tree state */ @@ -769,6 +1292,19 @@ enum netkit_mode { NETKIT_L3, }; +/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to + * the BPF program if attached. This also means the latter can + * consume the two fields if they were populated earlier. + * + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before + * invoking the attached BPF program when the peer device resides + * in a different network namespace. This is the default behavior. + */ +enum netkit_scrub { + NETKIT_SCRUB_NONE, + NETKIT_SCRUB_DEFAULT, +}; + enum { IFLA_NETKIT_UNSPEC, IFLA_NETKIT_PEER_INFO, @@ -776,6 +1312,8 @@ enum { IFLA_NETKIT_POLICY, IFLA_NETKIT_PEER_POLICY, IFLA_NETKIT_MODE, + IFLA_NETKIT_SCRUB, + IFLA_NETKIT_PEER_SCRUB, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) @@ -854,6 +1392,7 @@ enum { IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -871,6 +1410,13 @@ enum ifla_vxlan_df { VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; +enum ifla_vxlan_label_policy { + VXLAN_LABEL_FIXED = 0, + VXLAN_LABEL_INHERIT = 1, + __VXLAN_LABEL_END, + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, @@ -935,6 +1481,8 @@ enum { IFLA_GTP_ROLE, IFLA_GTP_CREATE_SOCKETS, IFLA_GTP_RESTART_COUNT, + IFLA_GTP_LOCAL, + IFLA_GTP_LOCAL6, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) @@ -1240,6 +1788,7 @@ enum { IFLA_HSR_PROTOCOL, /* Indicate different protocol than * HSR. For example PRP. */ + IFLA_HSR_INTERLINK, /* HSR interlink network device */ __IFLA_HSR_MAX, }; @@ -1417,7 +1966,9 @@ enum { enum { IFLA_DSA_UNSPEC, - IFLA_DSA_MASTER, + IFLA_DSA_CONDUIT, + /* Deprecated, use IFLA_DSA_CONDUIT instead */ + IFLA_DSA_MASTER = IFLA_DSA_CONDUIT, __IFLA_DSA_MAX, }; -- cgit v1.2.3 From 5bd48a3a14df4b3ee1be0757efcc0f40d4f57b35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 10 Oct 2024 04:56:52 +0100 Subject: bpf: fix argument type in bpf_loop documentation The `index` argument to bpf_loop() is threaded as an u64. This lead in a subtle verifier denial where clang cloned the argument in another register[1]. [1] https://github.com/systemd/systemd/pull/34650#issuecomment-2401092895 Signed-off-by: Matteo Croce Link: https://lore.kernel.org/r/20241010035652.17830-1-technoboy85@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8ab4d8184b9d..874af0186fe8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5371,7 +5371,7 @@ union bpf_attr { * Currently, the **flags** must be 0. Currently, nr_loops is * limited to 1 << 23 (~8 million) loops. * - * long (\*callback_fn)(u32 index, void \*ctx); + * long (\*callback_fn)(u64 index, void \*ctx); * * where **index** is the current index in the loop. The index * is zero-indexed. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d9b38ffd220..cfc62e0776bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9917,7 +9917,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, { /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, * u64 flags); - * callback_fn(u32 index, void *callback_ctx); + * callback_fn(u64 index, void *callback_ctx); */ callee->regs[BPF_REG_1].type = SCALAR_VALUE; callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7610883c8191..5937c39069ba 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5371,7 +5371,7 @@ union bpf_attr { * Currently, the **flags** must be 0. Currently, nr_loops is * limited to 1 << 23 (~8 million) loops. * - * long (\*callback_fn)(u32 index, void \*ctx); + * long (\*callback_fn)(u64 index, void \*ctx); * * where **index** is the current index in the loop. The index * is zero-indexed. -- cgit v1.2.3 From c6ca31981b545ad3081007b6aa88b6aab1b0cece Mon Sep 17 00:00:00 2001 From: Martin Kelly Date: Thu, 10 Oct 2024 12:33:01 -0700 Subject: bpf: Update bpf_override_return() comment The documentation says CONFIG_FUNCTION_ERROR_INJECTION is supported only on x86. This was presumably true at the time of writing, but it's now supported on many other architectures too. Drop this statement, since it's not correct anymore and it fits better in other documentation anyway. Signed-off-by: Martin Kelly Link: https://lore.kernel.org/r/20241010193301.995909-1-martin.kelly@crowdstrike.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ---- tools/include/uapi/linux/bpf.h | 4 ---- 2 files changed, 8 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 874af0186fe8..627c4195f04f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3103,10 +3103,6 @@ union bpf_attr { * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration * option, and in this case it only works on functions tagged with * **ALLOW_ERROR_INJECTION** in the kernel code. - * - * Also, the helper is only available for the architectures having - * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, - * x86 architecture is the only one to support this feature. * Return * 0 * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5937c39069ba..0e49ce2981a0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3103,10 +3103,6 @@ union bpf_attr { * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration * option, and in this case it only works on functions tagged with * **ALLOW_ERROR_INJECTION** in the kernel code. - * - * Also, the helper is only available for the architectures having - * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, - * x86 architecture is the only one to support this feature. * Return * 0 * -- cgit v1.2.3 From 516010460011ae74ac3b7383cf90ed27e2711cd6 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:57 +0000 Subject: netdev-genl: Dump napi_defer_hard_irqs Support dumping defer_hard_irqs for a NAPI ID. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 8 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 16 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 08412c279297..585e87ec3c16 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -248,6 +248,13 @@ attribute-sets: threaded mode. If NAPI is not in threaded mode (i.e. uses normal softirq context), the attribute will be absent. type: u32 + - + name: defer-hard-irqs + doc: The number of consecutive empty polls before IRQ deferral ends + and hardware IRQs are re-enabled. + type: u32 + checks: + max: s32-max - name: queue attributes: @@ -636,6 +643,7 @@ operations: - ifindex - irq - pid + - defer-hard-irqs dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 358cba248796..f98e5d1d0d21 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -189,6 +190,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, goto nla_put_failure; } + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 0137891e74576f77a7901718dc0ce08ca074ae74 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:59 +0000 Subject: netdev-genl: Dump gro_flush_timeout Support dumping gro_flush_timeout for a NAPI ID. Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-5-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 9 +++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 17 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 585e87ec3c16..7b47454c51dd 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -255,6 +255,14 @@ attribute-sets: type: u32 checks: max: s32-max + - + name: gro-flush-timeout + doc: The timeout, in nanoseconds, of when to trigger the NAPI watchdog + timer which schedules NAPI processing. Additionally, a non-zero + value will also prevent GRO from flushing recent super-frames at + the end of a NAPI cycle. This may add receive latency in exchange + for reducing the number of frames processed by the network stack. + type: uint - name: queue attributes: @@ -644,6 +652,7 @@ operations: - irq - pid - defer-hard-irqs + - gro-flush-timeout dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index f98e5d1d0d21..ac19f2e6cfbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -195,6 +196,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 1287c1ae0fc227e5acef11a539eb4e75646e31c7 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:01 +0000 Subject: netdev-genl: Support setting per-NAPI config values Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout. Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-7-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 11 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 18 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 45 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 77 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 7b47454c51dd..f9cb97d6106c 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -693,6 +693,17 @@ operations: reply: attributes: - id + - + name: napi-set + doc: Set configurable NAPI instance settings. + attribute-set: napi + flags: [ admin-perm ] + do: + request: + attributes: + - id + - defer-hard-irqs + - gro-flush-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index b28424ae06d5..e197bd84997c 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = .max = 2147483647ULL, }; +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = 2147483647ULL, +}; + /* Common nested types */ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), @@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd042..e09dd7539ff2 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index ac19f2e6cfbe..b49c3b4e5fbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 gro_flush_timeout = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + rtnl_unlock(); + + return err; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From ab8aaab874c4aa378e76d0a55ce6e0fad6e042a2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Oct 2024 15:20:17 -0300 Subject: tools headers UAPI: Sync linux/const.h with the kernel headers To pick up the changes in: 947697c6f0f75f98 ("uapi: Define GENMASK_U128") That causes no changes in tooling, just addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/const.h include/uapi/linux/const.h Cc: Adrian Hunter Cc: Anshuman Khandual Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Yury Norov Link: https://lore.kernel.org/lkml/ZwltGNJwujKu1Fgn@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/const.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/const.h b/tools/include/uapi/linux/const.h index a429381e7ca5..e16be0d37746 100644 --- a/tools/include/uapi/linux/const.h +++ b/tools/include/uapi/linux/const.h @@ -28,6 +28,23 @@ #define _BITUL(x) (_UL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x)) +#if !defined(__ASSEMBLY__) +/* + * Missing asm support + * + * __BIT128() would not work in the asm code, as it shifts an + * 'unsigned __init128' data type as direct representation of + * 128 bit constants is not supported in the gcc compiler, as + * they get silently truncated. + * + * TODO: Please revisit this implementation when gcc compiler + * starts representing 128 bit constants directly like long + * and unsigned long etc. Subsequently drop the comment for + * GENMASK_U128() which would then start supporting asm code. + */ +#define _BIT128(x) ((unsigned __int128)(1) << (x)) +#endif + #define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -- cgit v1.2.3 From c2f803052bc7a7feb2e03befccc8e49b6ff1f5f5 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Oct 2024 09:35:57 +0800 Subject: bpf: Add the missing BPF_LINK_TYPE invocation for sockmap There is an out-of-bounds read in bpf_link_show_fdinfo() for the sockmap link fd. Fix it by adding the missing BPF_LINK_TYPE invocation for sockmap link Also add comments for bpf_link_type to prevent missing updates in the future. Fixes: 699c23f02c65 ("bpf: Add bpf_link support for sk_msg and sk_skb progs") Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241024013558.1135167-2-houtao@huaweicloud.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 3 +++ tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 7 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9f2a6b83b49e..fa78f49d4a9a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -146,6 +146,7 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter) BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx) BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit) +BPF_LINK_TYPE(BPF_LINK_TYPE_SOCKMAP, sockmap) #endif #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e8241b320c6d..4a939c90dc2e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1121,6 +1121,9 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* Add BPF_LINK_TYPE(type, name) in bpf_types.h to keep bpf_link_type_strs[] + * in sync with the definitions below. + */ enum bpf_link_type { BPF_LINK_TYPE_UNSPEC = 0, BPF_LINK_TYPE_RAW_TRACEPOINT = 1, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e8241b320c6d..4a939c90dc2e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1121,6 +1121,9 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* Add BPF_LINK_TYPE(type, name) in bpf_types.h to keep bpf_link_type_strs[] + * in sync with the definitions below. + */ enum bpf_link_type { BPF_LINK_TYPE_UNSPEC = 0, BPF_LINK_TYPE_RAW_TRACEPOINT = 1, -- cgit v1.2.3 From 21a3a3d015aeee2402d14b425197d70aa3bd0d91 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 28 Oct 2024 10:55:09 -0300 Subject: tools headers: Synchronize {uapi/}linux/bits.h with the kernel sources To pick up the changes in this cset: 947697c6f0f75f98 ("uapi: Define GENMASK_U128") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/bits.h include/uapi/linux/bits.h diff -u tools/include/linux/bits.h include/linux/bits.h Please see tools/include/uapi/README for further details. Acked-by: Yury Norov Cc: Adrian Hunter Cc: Anshuman Khandual Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/Zx-ZVH7bHqtFn8Dv@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/bits.h | 15 +++++++++++++++ tools/include/uapi/linux/bits.h | 3 +++ 2 files changed, 18 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 0eb24d21aac2..60044b608817 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -36,4 +36,19 @@ #define GENMASK_ULL(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) +#if !defined(__ASSEMBLY__) +/* + * Missing asm support + * + * __GENMASK_U128() depends on _BIT128() which would not work + * in the asm code, as it shifts an 'unsigned __init128' data + * type instead of direct representation of 128 bit constants + * such as long and unsigned long. The fundamental problem is + * that a 128 bit constant will get silently truncated by the + * gcc compiler. + */ +#define GENMASK_U128(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l)) +#endif + #endif /* __LINUX_BITS_H */ diff --git a/tools/include/uapi/linux/bits.h b/tools/include/uapi/linux/bits.h index 3c2a101986a3..5ee30f882736 100644 --- a/tools/include/uapi/linux/bits.h +++ b/tools/include/uapi/linux/bits.h @@ -12,4 +12,7 @@ (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_U128(h, l) \ + ((_BIT128((h)) << 1) - (_BIT128(l))) + #endif /* _UAPI_LINUX_BITS_H */ -- cgit v1.2.3 From d920179b3d4842a0e27cae54fdddbe5ef3977e73 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:34 +0100 Subject: bpf: Add support for uprobe multi session attach Adding support to attach BPF program for entry and return probe of the same function. This is common use case which at the moment requires to create two uprobe multi links. Adding new BPF_TRACE_UPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the BPF program on return probe simply by returning zero or non zero from the entry BPF program execution to execute or not the BPF program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 36 +++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 6 files changed, 38 insertions(+), 11 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f28b6527e815..4162afc6b5d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1116,6 +1116,7 @@ enum bpf_attach_type { BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, + BPF_TRACE_UPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8254b2973157..58190ca724a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4103,10 +4103,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -5359,7 +5363,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d8ed377b35d..132fc172961f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16027,6 +16027,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: range = retval_range(0, 1); break; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index db9e2792b42b..9c04b1364de2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1581,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } +static inline bool is_uprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_uprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1598,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: @@ -3096,6 +3107,7 @@ struct bpf_uprobe { u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; + bool session; }; struct bpf_uprobe_multi_link { @@ -3267,9 +3279,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; + int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + if (uprobe->session) + return ret ? UPROBE_HANDLER_IGNORE : 0; + return 0; } static int @@ -3279,7 +3295,8 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs); + return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) @@ -3318,7 +3335,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; @@ -3394,11 +3411,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].link = link; - if (flags & BPF_F_UPROBE_MULTI_RETURN) - uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; - else + if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; - + if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + if (is_uprobe_session(prog)) + uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f28b6527e815..4162afc6b5d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1116,6 +1116,7 @@ enum bpf_attach_type { BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, + BPF_TRACE_UPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 711173acbcef..faac1c79840d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -133,6 +133,7 @@ static const char * const attach_type_name[] = { [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", + [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 7 +++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdev.h | 1 + net/core/dev.c | 2 ++ net/core/dev.h | 25 +++++++++++++++++++++++++ net/core/netdev-genl-gen.c | 5 +++-- net/core/netdev-genl.c | 12 ++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 53 insertions(+), 2 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f9cb97d6106c..cbb544bd6c84 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -263,6 +263,11 @@ attribute-sets: the end of a NAPI cycle. This may add receive latency in exchange for reducing the number of frames processed by the network stack. type: uint + - + name: irq-suspend-timeout + doc: The timeout, in nanoseconds, of how long to suspend irq + processing, if event polling finds events + type: uint - name: queue attributes: @@ -653,6 +658,7 @@ operations: - pid - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout dump: request: attributes: @@ -704,6 +710,7 @@ operations: - id - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 6a31152e4606..4d910872963f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. It will be saved to the config * in napi_disable. @@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; n->config->napi_id = n->napi_id; napi_hash_del(n); } diff --git a/net/core/dev.h b/net/core/dev.h index 7881bced70a9..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, netdev->napi_config[i].gro_flush_timeout = timeout; } +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 21de7e10be16..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; /* Ops table for netdev */ @@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b49c3b4e5fbe..765ce7c9d73b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; @@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) @@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { + u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; @@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) napi_set_defer_hard_irqs(napi, defer); } + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 5229df8fb6796ff4aba39b16cf617028fa5d94b7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:40 -0800 Subject: tools headers: Sync uapi/linux/perf_event.h with the kernel sources To pick up the changes in this cset: 18d92bb57c39504d ("perf/core: Add aux_pause, aux_resume, aux_start_paused") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Link: https://lore.kernel.org/r/20241203035349.1901262-3-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4842c36fdf80..0524d541d4e3 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -511,7 +511,16 @@ struct perf_event_attr { __u16 sample_max_stack; __u16 __reserved_2; __u32 aux_sample_size; - __u32 __reserved_3; + + union { + __u32 aux_action; + struct { + __u32 aux_start_paused : 1, /* start AUX area tracing paused */ + aux_pause : 1, /* on overflow, pause AUX area tracing */ + aux_resume : 1, /* on overflow, resume AUX area tracing */ + __reserved_3 : 29; + }; + }; /* * User provided data if sigtrap=1, passed back to user via -- cgit v1.2.3 From e2064b7c5d82651a418b86078597e51dea3f3123 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Dec 2024 19:53:41 -0800 Subject: tools headers: Sync uapi/linux/kvm.h with the kernel sources To pick up the changes in this cset: e785dfacf7e7fe94 ("LoongArch: KVM: Add PCHPIC device support") 2e8b9df82631e714 ("LoongArch: KVM: Add EIOINTC device support") c532de5a67a70f85 ("LoongArch: KVM: Add IPI device support") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Please see tools/include/uapi/README for further details. Reviewed-by: James Clark Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Tianrui Zhao Cc: Bibo Mao Cc: Huacai Chen Cc: kvm@vger.kernel.org Cc: loongarch@lists.linux.dev Link: https://lore.kernel.org/r/20241203035349.1901262-4-namhyung@kernel.org Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/kvm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 637efc055145..502ea63b5d2e 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1158,7 +1158,15 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LOONGARCH_IPI, +#define KVM_DEV_TYPE_LOONGARCH_IPI KVM_DEV_TYPE_LOONGARCH_IPI + KVM_DEV_TYPE_LOONGARCH_EIOINTC, +#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC + KVM_DEV_TYPE_LOONGARCH_PCHPIC, +#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_MAX, + }; struct kvm_vfio_spapr_tce { -- cgit v1.2.3