From 62413a9c3cb183afb9bb6e94dd68caf4e4145f4c Mon Sep 17 00:00:00 2001
From: Paul Moses
Date: Mon, 23 Feb 2026 15:05:44 +0000
Subject: net/sched: act_gate: snapshot parameters with RCU on replace
The gate action can be replaced while the hrtimer callback or dump path is
walking the schedule list.
Convert the parameters to an RCU-protected snapshot and swap updates under
tcf_lock, freeing the previous snapshot via call_rcu(). When REPLACE omits
the entry list, preserve the existing schedule so the effective state is
unchanged.
Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
Cc: stable@vger.kernel.org
Signed-off-by: Paul Moses
Tested-by: Vladimir Oltean
Acked-by: Jamal Hadi Salim
Reviewed-by: Victor Nogueira
Link: https://patch.msgid.link/20260223150512.2251594-2-p@1g4.org
Signed-off-by: Jakub Kicinski
---
include/net/tc_act/tc_gate.h | 33 ++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)
(limited to 'include')
diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h
index b147a3bb1a46..e0fded18e18c 100644
--- a/include/net/tc_act/tc_gate.h
+++ b/include/net/tc_act/tc_gate.h
@@ -32,6 +32,7 @@ struct tcf_gate_params {
s32 tcfg_clockid;
size_t num_entries;
struct list_head entries;
+ struct rcu_head rcu;
};
#define GATE_ACT_GATE_OPEN BIT(0)
@@ -39,7 +40,7 @@ struct tcf_gate_params {
struct tcf_gate {
struct tc_action common;
- struct tcf_gate_params param;
+ struct tcf_gate_params __rcu *param;
u8 current_gate_status;
ktime_t current_close_time;
u32 current_entry_octets;
@@ -51,47 +52,65 @@ struct tcf_gate {
#define to_gate(a) ((struct tcf_gate *)a)
+static inline struct tcf_gate_params *tcf_gate_params_locked(const struct tc_action *a)
+{
+ struct tcf_gate *gact = to_gate(a);
+
+ return rcu_dereference_protected(gact->param,
+ lockdep_is_held(&gact->tcf_lock));
+}
+
static inline s32 tcf_gate_prio(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
s32 tcfg_prio;
- tcfg_prio = to_gate(a)->param.tcfg_priority;
+ p = tcf_gate_params_locked(a);
+ tcfg_prio = p->tcfg_priority;
return tcfg_prio;
}
static inline u64 tcf_gate_basetime(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_basetime;
- tcfg_basetime = to_gate(a)->param.tcfg_basetime;
+ p = tcf_gate_params_locked(a);
+ tcfg_basetime = p->tcfg_basetime;
return tcfg_basetime;
}
static inline u64 tcf_gate_cycletime(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_cycletime;
- tcfg_cycletime = to_gate(a)->param.tcfg_cycletime;
+ p = tcf_gate_params_locked(a);
+ tcfg_cycletime = p->tcfg_cycletime;
return tcfg_cycletime;
}
static inline u64 tcf_gate_cycletimeext(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_cycletimeext;
- tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext;
+ p = tcf_gate_params_locked(a);
+ tcfg_cycletimeext = p->tcfg_cycletime_ext;
return tcfg_cycletimeext;
}
static inline u32 tcf_gate_num_entries(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u32 num_entries;
- num_entries = to_gate(a)->param.num_entries;
+ p = tcf_gate_params_locked(a);
+ num_entries = p->num_entries;
return num_entries;
}
@@ -105,7 +124,7 @@ static inline struct action_gate_entry
u32 num_entries;
int i = 0;
- p = &to_gate(a)->param;
+ p = tcf_gate_params_locked(a);
num_entries = p->num_entries;
list_for_each_entry(entry, &p->entries, list)
--
cgit v1.2.3
From 29252397bcc1e0a1f85e5c3bee59c325f5c26341 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 25 Feb 2026 20:35:45 +0000
Subject: inet: annotate data-races around isk->inet_num
UDP/TCP lookups are using RCU, thus isk->inet_num accesses
should use READ_ONCE() and WRITE_ONCE() where needed.
Fixes: 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls")
Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20260225203545.1512417-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
include/net/inet6_hashtables.h | 2 +-
include/net/inet_hashtables.h | 2 +-
include/net/ip.h | 2 +-
net/ipv4/inet_hashtables.c | 8 ++++----
net/ipv4/tcp_diag.c | 2 +-
net/ipv6/inet6_hashtables.c | 3 ++-
6 files changed, 10 insertions(+), 9 deletions(-)
(limited to 'include')
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 282e29237d93..c16de5b7963f 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -175,7 +175,7 @@ static inline bool inet6_match(const struct net *net, const struct sock *sk,
{
if (!net_eq(sock_net(sk), net) ||
sk->sk_family != AF_INET6 ||
- sk->sk_portpair != ports ||
+ READ_ONCE(sk->sk_portpair) != ports ||
!ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
return false;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ac05a52d9e13..5a979dcab538 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -345,7 +345,7 @@ static inline bool inet_match(const struct net *net, const struct sock *sk,
int dif, int sdif)
{
if (!net_eq(sock_net(sk), net) ||
- sk->sk_portpair != ports ||
+ READ_ONCE(sk->sk_portpair) != ports ||
sk->sk_addrpair != cookie)
return false;
diff --git a/include/net/ip.h b/include/net/ip.h
index 69d5cef46004..7f9abd457e01 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -101,7 +101,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
ipcm->addr = inet->inet_saddr;
- ipcm->protocol = inet->inet_num;
+ ipcm->protocol = READ_ONCE(inet->inet_num);
}
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fca980772c81..9bfccc283fa6 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -200,7 +200,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, unsigned short port)
{
- inet_sk(sk)->inet_num = port;
+ WRITE_ONCE(inet_sk(sk)->inet_num, port);
inet_csk(sk)->icsk_bind_hash = tb;
inet_csk(sk)->icsk_bind2_hash = tb2;
sk_add_bind_node(sk, &tb2->owners);
@@ -224,7 +224,7 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
inet_csk(sk)->icsk_bind_hash = NULL;
- inet_sk(sk)->inet_num = 0;
+ WRITE_ONCE(inet_sk(sk)->inet_num, 0);
sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
spin_lock(&head2->lock);
@@ -352,7 +352,7 @@ static inline int compute_score(struct sock *sk, const struct net *net,
{
int score = -1;
- if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
+ if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum &&
!ipv6_only_sock(sk)) {
if (sk->sk_rcv_saddr != daddr)
return -1;
@@ -1206,7 +1206,7 @@ error:
sk->sk_hash = 0;
inet_sk(sk)->inet_sport = 0;
- inet_sk(sk)->inet_num = 0;
+ WRITE_ONCE(inet_sk(sk)->inet_num, 0);
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index d83efd91f461..7935702e394b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -509,7 +509,7 @@ next_chunk:
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
+ if (r->id.idiag_sport != htons(READ_ONCE(sk->sk_num)) &&
r->id.idiag_sport)
goto next_normal;
if (r->id.idiag_dport != sk->sk_dport &&
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 5e1da088d8e1..182d38e6d6d8 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -95,7 +95,8 @@ static inline int compute_score(struct sock *sk, const struct net *net,
{
int score = -1;
- if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
+ if (net_eq(sock_net(sk), net) &&
+ READ_ONCE(inet_sk(sk)->inet_num) == hnum &&
sk->sk_family == PF_INET6) {
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
return -1;
--
cgit v1.2.3
From 11cb63b0d1a0685e0831ae3c77223e002ef18189 Mon Sep 17 00:00:00 2001
From: Victor Nogueira
Date: Wed, 25 Feb 2026 10:43:48 -0300
Subject: net/sched: Only allow act_ct to bind to clsact/ingress qdiscs and
shared blocks
As Paolo said earlier [1]:
"Since the blamed commit below, classify can return TC_ACT_CONSUMED while
the current skb being held by the defragmentation engine. As reported by
GangMin Kim, if such packet is that may cause a UaF when the defrag engine
later on tries to tuch again such packet."
act_ct was never meant to be used in the egress path, however some users
are attaching it to egress today [2]. Attempting to reach a middle
ground, we noticed that, while most qdiscs are not handling
TC_ACT_CONSUMED, clsact/ingress qdiscs are. With that in mind, we
address the issue by only allowing act_ct to bind to clsact/ingress
qdiscs and shared blocks. That way it's still possible to attach act_ct to
egress (albeit only with clsact).
[1] https://lore.kernel.org/netdev/674b8cbfc385c6f37fb29a1de08d8fe5c2b0fbee.1771321118.git.pabeni@redhat.com/
[2] https://lore.kernel.org/netdev/cc6bfb4a-4a2b-42d8-b9ce-7ef6644fb22b@ovn.org/
Reported-by: GangMin Kim
Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags")
CC: stable@vger.kernel.org
Signed-off-by: Victor Nogueira
Acked-by: Jamal Hadi Salim
Link: https://patch.msgid.link/20260225134349.1287037-1-victor@mojatatu.com
Signed-off-by: Jakub Kicinski
---
include/net/act_api.h | 1 +
net/sched/act_ct.c | 6 ++++++
net/sched/cls_api.c | 7 +++++++
3 files changed, 14 insertions(+)
(limited to 'include')
diff --git a/include/net/act_api.h b/include/net/act_api.h
index e1e8f0f7dacb..d11b79107930 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -70,6 +70,7 @@ struct tc_action {
#define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2))
#define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3))
#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4))
+#define TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT (1U << (TCA_ACT_FLAGS_USER_BITS + 5))
/* Update lastuse only if needed, to avoid dirtying a cache line.
* We use a temp variable to avoid fetching jiffies twice.
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index bd51522c7953..7d5e50c921a0 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1360,6 +1360,12 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
+ if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Attaching ct to a non ingress/clsact qdisc is unsupported");
+ return -EOPNOTSUPP;
+ }
+
err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
if (err < 0)
return err;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 343309e9009b..4829c27446e3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2228,6 +2228,11 @@ static bool is_qdisc_ingress(__u32 classid)
return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS));
}
+static bool is_ingress_or_clsact(struct tcf_block *block, struct Qdisc *q)
+{
+ return tcf_block_shared(block) || (q && !!(q->flags & TCQ_F_INGRESS));
+}
+
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
@@ -2420,6 +2425,8 @@ replay:
flags |= TCA_ACT_FLAGS_NO_RTNL;
if (is_qdisc_ingress(parent))
flags |= TCA_ACT_FLAGS_AT_INGRESS;
+ if (is_ingress_or_clsact(block, q))
+ flags |= TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT;
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
flags, extack);
if (err == 0) {
--
cgit v1.2.3
From 15fba71533bcdfaa8eeba69a5a5a2927afdf664a Mon Sep 17 00:00:00 2001
From: Valentin Spreckels
Date: Thu, 26 Feb 2026 20:54:09 +0100
Subject: net: usb: r8152: add TRENDnet TUC-ET2G
The TRENDnet TUC-ET2G is a RTL8156 based usb ethernet adapter. Add its
vendor and product IDs.
Signed-off-by: Valentin Spreckels
Link: https://patch.msgid.link/20260226195409.7891-2-valentin@spreckels.dev
Signed-off-by: Jakub Kicinski
---
drivers/net/usb/r8152.c | 1 +
include/linux/usb/r8152.h | 1 +
2 files changed, 2 insertions(+)
(limited to 'include')
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 4af85728ac4f..0c83bbbea2e7 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -10054,6 +10054,7 @@ static const struct usb_device_id rtl8152_table[] = {
{ USB_DEVICE(VENDOR_ID_DLINK, 0xb301) },
{ USB_DEVICE(VENDOR_ID_DELL, 0xb097) },
{ USB_DEVICE(VENDOR_ID_ASUS, 0x1976) },
+ { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) },
{}
};
diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h
index 2ca60828f28b..1502b2a355f9 100644
--- a/include/linux/usb/r8152.h
+++ b/include/linux/usb/r8152.h
@@ -32,6 +32,7 @@
#define VENDOR_ID_DLINK 0x2001
#define VENDOR_ID_DELL 0x413c
#define VENDOR_ID_ASUS 0x0b05
+#define VENDOR_ID_TRENDNET 0x20f4
#if IS_REACHABLE(CONFIG_USB_RTL8152)
extern u8 rtl8152_get_version(struct usb_interface *intf);
--
cgit v1.2.3
From 60abb0ac11dccd6b98fd9182bc5f85b621688861 Mon Sep 17 00:00:00 2001
From: "Nikhil P. Rao"
Date: Wed, 25 Feb 2026 00:00:26 +0000
Subject: xsk: Fix fragment node deletion to prevent buffer leak
After commit b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node"),
the list_node field is reused for both the xskb pool list and the buffer
free list, this causes a buffer leak as described below.
xp_free() checks if a buffer is already on the free list using
list_empty(&xskb->list_node). When list_del() is used to remove a node
from the xskb pool list, it doesn't reinitialize the node pointers.
This means list_empty() will return false even after the node has been
removed, causing xp_free() to incorrectly skip adding the buffer to the
free list.
Fix this by using list_del_init() instead of list_del() in all fragment
handling paths, this ensures the list node is reinitialized after removal,
allowing the list_empty() to work correctly.
Fixes: b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node")
Acked-by: Maciej Fijalkowski
Signed-off-by: Nikhil P. Rao
Link: https://patch.msgid.link/20260225000456.107806-2-nikhil.rao@amd.com
Signed-off-by: Jakub Kicinski
---
include/net/xdp_sock_drv.h | 6 +++---
net/xdp/xsk.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
(limited to 'include')
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 242e34f771cc..aefc368449d5 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -122,7 +122,7 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
goto out;
list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
- list_del(&pos->list_node);
+ list_del_init(&pos->list_node);
xp_free(pos);
}
@@ -157,7 +157,7 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first)
frag = list_first_entry_or_null(&xskb->pool->xskb_list,
struct xdp_buff_xsk, list_node);
if (frag) {
- list_del(&frag->list_node);
+ list_del_init(&frag->list_node);
ret = &frag->xdp;
}
@@ -168,7 +168,7 @@ static inline void xsk_buff_del_frag(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
- list_del(&xskb->list_node);
+ list_del_init(&xskb->list_node);
}
static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3b46bc635c43..be882a8d473c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -186,7 +186,7 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
err = __xsk_rcv_zc(xs, pos, len, contd);
if (err)
goto err;
- list_del(&pos->list_node);
+ list_del_init(&pos->list_node);
}
return 0;
--
cgit v1.2.3
From 479d589b40b836442bbdadc3fdb37f001bb67f26 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen
Date: Thu, 26 Feb 2026 16:03:01 +0800
Subject: bpf/bonding: reject vlan+srcmac xmit_hash_policy change when XDP is
loaded
bond_option_mode_set() already rejects mode changes that would make a
loaded XDP program incompatible via bond_xdp_check(). However,
bond_option_xmit_hash_policy_set() has no such guard.
For 802.3ad and balance-xor modes, bond_xdp_check() returns false when
xmit_hash_policy is vlan+srcmac, because the 802.1q payload is usually
absent due to hardware offload. This means a user can:
1. Attach a native XDP program to a bond in 802.3ad/balance-xor mode
with a compatible xmit_hash_policy (e.g. layer2+3).
2. Change xmit_hash_policy to vlan+srcmac while XDP remains loaded.
This leaves bond->xdp_prog set but bond_xdp_check() now returning false
for the same device. When the bond is later destroyed, dev_xdp_uninstall()
calls bond_xdp_set(dev, NULL, NULL) to remove the program, which hits
the bond_xdp_check() guard and returns -EOPNOTSUPP, triggering:
WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL))
Fix this by rejecting xmit_hash_policy changes to vlan+srcmac when an
XDP program is loaded on a bond in 802.3ad or balance-xor mode.
commit 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP")
introduced bond_xdp_check() which returns false for 802.3ad/balance-xor
modes when xmit_hash_policy is vlan+srcmac. The check was wired into
bond_xdp_set() to reject XDP attachment with an incompatible policy, but
the symmetric path -- preventing xmit_hash_policy from being changed to an
incompatible value after XDP is already loaded -- was left unguarded in
bond_option_xmit_hash_policy_set().
Note:
commit 094ee6017ea0 ("bonding: check xdp prog when set bond mode")
later added a similar guard to bond_option_mode_set(), but
bond_option_xmit_hash_policy_set() remained unprotected.
Reported-by: syzbot+5a287bcdc08104bc3132@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6995aff6.050a0220.2eeac1.014e.GAE@google.com/T/
Fixes: 39a0876d595b ("net, bonding: Disallow vlan+srcmac with XDP")
Signed-off-by: Jiayuan Chen
Link: https://patch.msgid.link/20260226080306.98766-2-jiayuan.chen@linux.dev
Signed-off-by: Paolo Abeni
---
drivers/net/bonding/bond_main.c | 9 +++++++--
drivers/net/bonding/bond_options.c | 2 ++
include/net/bonding.h | 1 +
3 files changed, 10 insertions(+), 2 deletions(-)
(limited to 'include')
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a1de08ee3815..14ed91391fcc 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -324,7 +324,7 @@ static bool bond_sk_check(struct bonding *bond)
}
}
-bool bond_xdp_check(struct bonding *bond, int mode)
+bool __bond_xdp_check(int mode, int xmit_policy)
{
switch (mode) {
case BOND_MODE_ROUNDROBIN:
@@ -335,7 +335,7 @@ bool bond_xdp_check(struct bonding *bond, int mode)
/* vlan+srcmac is not supported with XDP as in most cases the 802.1q
* payload is not in the packet due to hardware offload.
*/
- if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC)
+ if (xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC)
return true;
fallthrough;
default:
@@ -343,6 +343,11 @@ bool bond_xdp_check(struct bonding *bond, int mode)
}
}
+bool bond_xdp_check(struct bonding *bond, int mode)
+{
+ return __bond_xdp_check(mode, bond->params.xmit_policy);
+}
+
/*---------------------------------- VLAN -----------------------------------*/
/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index dcee384c2f06..7380cc4ee75a 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1575,6 +1575,8 @@ static int bond_option_fail_over_mac_set(struct bonding *bond,
static int bond_option_xmit_hash_policy_set(struct bonding *bond,
const struct bond_opt_value *newval)
{
+ if (bond->xdp_prog && !__bond_xdp_check(BOND_MODE(bond), newval->value))
+ return -EOPNOTSUPP;
netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n",
newval->string, newval->value);
bond->params.xmit_policy = newval->value;
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 4ad5521e7731..395c6e281c5f 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -699,6 +699,7 @@ void bond_debug_register(struct bonding *bond);
void bond_debug_unregister(struct bonding *bond);
void bond_debug_reregister(struct bonding *bond);
const char *bond_mode_name(int mode);
+bool __bond_xdp_check(int mode, int xmit_policy);
bool bond_xdp_check(struct bonding *bond, int mode);
void bond_setup(struct net_device *bond_dev);
unsigned int bond_get_num_tx_queues(void);
--
cgit v1.2.3
From 710f5c76580306cdb9ec51fac8fcf6a8faff7821 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 27 Feb 2026 17:26:03 +0000
Subject: indirect_call_wrapper: do not reevaluate function pointer
We have an increasing number of READ_ONCE(xxx->function)
combined with INDIRECT_CALL_[1234]() helpers.
Unfortunately this forces INDIRECT_CALL_[1234]() to read
xxx->function many times, which is not what we wanted.
Fix these macros so that xxx->function value is not reloaded.
$ scripts/bloat-o-meter -t vmlinux.0 vmlinux
add/remove: 0/0 grow/shrink: 1/65 up/down: 122/-1084 (-962)
Function old new delta
ip_push_pending_frames 59 181 +122
ip6_finish_output 687 681 -6
__udp_enqueue_schedule_skb 1078 1072 -6
ioam6_output 2319 2312 -7
xfrm4_rcv_encap_finish2 64 56 -8
xfrm4_output 297 289 -8
vrf_ip_local_out 278 270 -8
vrf_ip6_local_out 278 270 -8
seg6_input_finish 64 56 -8
rpl_output 700 692 -8
ipmr_forward_finish 124 116 -8
ip_forward_finish 143 135 -8
ip6mr_forward2_finish 100 92 -8
ip6_forward_finish 73 65 -8
input_action_end_bpf 1091 1083 -8
dst_input 52 44 -8
__xfrm6_output 801 793 -8
__xfrm4_output 83 75 -8
bpf_input 500 491 -9
__tcp_check_space 530 521 -9
input_action_end_dt6 291 280 -11
vti6_tnl_xmit 1634 1622 -12
bpf_xmit 1203 1191 -12
rpl_input 497 483 -14
rawv6_send_hdrinc 1355 1341 -14
ndisc_send_skb 1030 1016 -14
ipv6_srh_rcv 1377 1363 -14
ip_send_unicast_reply 1253 1239 -14
ip_rcv_finish 226 212 -14
ip6_rcv_finish 300 286 -14
input_action_end_x_core 205 191 -14
input_action_end_x 355 341 -14
input_action_end_t 205 191 -14
input_action_end_dx6_finish 127 113 -14
input_action_end_dx4_finish 373 359 -14
input_action_end_dt4 426 412 -14
input_action_end_core 186 172 -14
input_action_end_b6_encap 292 278 -14
input_action_end_b6 198 184 -14
igmp6_send 1332 1318 -14
ip_sublist_rcv 864 848 -16
ip6_sublist_rcv 1091 1075 -16
ipv6_rpl_srh_rcv 1937 1920 -17
xfrm_policy_queue_process 1246 1228 -18
seg6_output_core 903 885 -18
mld_sendpack 856 836 -20
NF_HOOK 756 736 -20
vti_tunnel_xmit 1447 1426 -21
input_action_end_dx6 664 642 -22
input_action_end 1502 1480 -22
sock_sendmsg_nosec 134 111 -23
ip6mr_forward2 388 364 -24
sock_recvmsg_nosec 134 109 -25
seg6_input_core 836 810 -26
ip_send_skb 172 146 -26
ip_local_out 140 114 -26
ip6_local_out 140 114 -26
__sock_sendmsg 162 136 -26
__ip_queue_xmit 1196 1170 -26
__ip_finish_output 405 379 -26
ipmr_queue_fwd_xmit 373 346 -27
sock_recvmsg 173 145 -28
ip6_xmit 1635 1607 -28
xfrm_output_resume 1418 1389 -29
ip_build_and_send_pkt 625 591 -34
dst_output 504 432 -72
Total: Before=25217686, After=25216724, chg -0.00%
Fixes: 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin")
Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20260227172603.1700433-1-edumazet@google.com
Signed-off-by: Paolo Abeni
---
include/linux/indirect_call_wrapper.h | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
(limited to 'include')
diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index 35227d47cfc9..dc272b514a01 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -16,22 +16,26 @@
*/
#define INDIRECT_CALL_1(f, f1, ...) \
({ \
- likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \
+ typeof(f) __f1 = (f); \
+ likely(__f1 == f1) ? f1(__VA_ARGS__) : __f1(__VA_ARGS__); \
})
#define INDIRECT_CALL_2(f, f2, f1, ...) \
({ \
- likely(f == f2) ? f2(__VA_ARGS__) : \
- INDIRECT_CALL_1(f, f1, __VA_ARGS__); \
+ typeof(f) __f2 = (f); \
+ likely(__f2 == f2) ? f2(__VA_ARGS__) : \
+ INDIRECT_CALL_1(__f2, f1, __VA_ARGS__); \
})
#define INDIRECT_CALL_3(f, f3, f2, f1, ...) \
({ \
- likely(f == f3) ? f3(__VA_ARGS__) : \
- INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__); \
+ typeof(f) __f3 = (f); \
+ likely(__f3 == f3) ? f3(__VA_ARGS__) : \
+ INDIRECT_CALL_2(__f3, f2, f1, __VA_ARGS__); \
})
#define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) \
({ \
- likely(f == f4) ? f4(__VA_ARGS__) : \
- INDIRECT_CALL_3(f, f3, f2, f1, __VA_ARGS__); \
+ typeof(f) __f4 = (f); \
+ likely(__f4 == f4) ? f4(__VA_ARGS__) : \
+ INDIRECT_CALL_3(__f4, f3, f2, f1, __VA_ARGS__); \
})
#define INDIRECT_CALLABLE_DECLARE(f) f
--
cgit v1.2.3
From 4ee7fa6cf78ff26d783d39e2949d14c4c1cd5e7f Mon Sep 17 00:00:00 2001
From: Yung Chih Su
Date: Mon, 2 Mar 2026 14:02:47 +0800
Subject: net: ipv4: fix ARM64 alignment fault in multipath hash seed
`struct sysctl_fib_multipath_hash_seed` contains two u32 fields
(user_seed and mp_seed), making it an 8-byte structure with a 4-byte
alignment requirement.
In `fib_multipath_hash_from_keys()`, the code evaluates the entire
struct atomically via `READ_ONCE()`:
mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;
While this silently works on GCC by falling back to unaligned regular
loads which the ARM64 kernel tolerates, it causes a fatal kernel panic
when compiled with Clang and LTO enabled.
Commit e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire
when CONFIG_LTO=y") strengthens `READ_ONCE()` to use Load-Acquire
instructions (`ldar` / `ldapr`) to prevent compiler reordering bugs
under Clang LTO. Since the macro evaluates the full 8-byte struct,
Clang emits a 64-bit `ldar` instruction. ARM64 architecture strictly
requires `ldar` to be naturally aligned, thus executing it on a 4-byte
aligned address triggers a strict Alignment Fault (FSC = 0x21).
Fix the read side by moving the `READ_ONCE()` directly to the `u32`
member, which emits a safe 32-bit `ldar Wn`.
Furthermore, Eric Dumazet pointed out that `WRITE_ONCE()` on the entire
struct in `proc_fib_multipath_hash_set_seed()` is also flawed. Analysis
shows that Clang splits this 8-byte write into two separate 32-bit
`str` instructions. While this avoids an alignment fault, it destroys
atomicity and exposes a tear-write vulnerability. Fix this by
explicitly splitting the write into two 32-bit `WRITE_ONCE()`
operations.
Finally, add the missing `READ_ONCE()` when reading `user_seed` in
`proc_fib_multipath_hash_seed()` to ensure proper pairing and
concurrency safety.
Fixes: 4ee2a8cace3f ("net: ipv4: Add a sysctl to set multipath hash seed")
Signed-off-by: Yung Chih Su
Reviewed-by: Eric Dumazet
Link: https://patch.msgid.link/20260302060247.7066-1-yuuchihsu@gmail.com
Signed-off-by: Jakub Kicinski
---
include/net/ip_fib.h | 2 +-
net/ipv4/sysctl_net_ipv4.c | 5 +++--
2 files changed, 4 insertions(+), 3 deletions(-)
(limited to 'include')
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index b4495c38e0a0..318593743b6e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -559,7 +559,7 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
siphash_aligned_key_t hash_key;
u32 mp_seed;
- mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;
+ mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed);
fib_multipath_hash_construct_key(&hash_key, mp_seed);
return flow_hash_from_keys_seed(keys, &hash_key);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 643763bc2142..5654cc9c8a0b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -486,7 +486,8 @@ static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
proc_fib_multipath_hash_rand_seed),
};
- WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new);
+ WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed);
+ WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed);
}
static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write,
@@ -500,7 +501,7 @@ static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write
int ret;
mphs = &net->ipv4.sysctl_fib_multipath_hash_seed;
- user_seed = mphs->user_seed;
+ user_seed = READ_ONCE(mphs->user_seed);
tmp = *table;
tmp.data = &user_seed;
--
cgit v1.2.3
From 7f083faf59d14c04e01ec05a7507f036c965acf8 Mon Sep 17 00:00:00 2001
From: Koichiro Den
Date: Sat, 28 Feb 2026 23:53:07 +0900
Subject: net: sched: avoid qdisc_reset_all_tx_gt() vs dequeue race for
lockless qdiscs
When shrinking the number of real tx queues,
netif_set_real_num_tx_queues() calls qdisc_reset_all_tx_gt() to flush
qdiscs for queues which will no longer be used.
qdisc_reset_all_tx_gt() currently serializes qdisc_reset() with
qdisc_lock(). However, for lockless qdiscs, the dequeue path is
serialized by qdisc_run_begin/end() using qdisc->seqlock instead, so
qdisc_reset() can run concurrently with __qdisc_run() and free skbs
while they are still being dequeued, leading to UAF.
This can easily be reproduced on e.g. virtio-net by imposing heavy
traffic while frequently changing the number of queue pairs:
iperf3 -ub0 -c $peer -t 0 &
while :; do
ethtool -L eth0 combined 1
ethtool -L eth0 combined 2
done
With KASAN enabled, this leads to reports like:
BUG: KASAN: slab-use-after-free in __qdisc_run+0x133f/0x1760
...
Call Trace:
...
__qdisc_run+0x133f/0x1760
__dev_queue_xmit+0x248f/0x3550
ip_finish_output2+0xa42/0x2110
ip_output+0x1a7/0x410
ip_send_skb+0x2e6/0x480
udp_send_skb+0xb0a/0x1590
udp_sendmsg+0x13c9/0x1fc0
...
Allocated by task 1270 on cpu 5 at 44.558414s:
...
alloc_skb_with_frags+0x84/0x7c0
sock_alloc_send_pskb+0x69a/0x830
__ip_append_data+0x1b86/0x48c0
ip_make_skb+0x1e8/0x2b0
udp_sendmsg+0x13a6/0x1fc0
...
Freed by task 1306 on cpu 3 at 44.558445s:
...
kmem_cache_free+0x117/0x5e0
pfifo_fast_reset+0x14d/0x580
qdisc_reset+0x9e/0x5f0
netif_set_real_num_tx_queues+0x303/0x840
virtnet_set_channels+0x1bf/0x260 [virtio_net]
ethnl_set_channels+0x684/0xae0
ethnl_default_set_doit+0x31a/0x890
...
Serialize qdisc_reset_all_tx_gt() against the lockless dequeue path by
taking qdisc->seqlock for TCQ_F_NOLOCK qdiscs, matching the
serialization model already used by dev_reset_queue().
Additionally clear QDISC_STATE_NON_EMPTY after reset so the qdisc state
reflects an empty queue, avoiding needless re-scheduling.
Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Koichiro Den
Link: https://patch.msgid.link/20260228145307.3955532-1-den@valinux.co.jp
Signed-off-by: Jakub Kicinski
---
include/net/sch_generic.h | 10 ++++++++++
1 file changed, 10 insertions(+)
(limited to 'include')
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index c3a7268b567e..d5d55cb21686 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -778,13 +778,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb)
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
struct Qdisc *qdisc;
+ bool nolock;
for (; i < dev->num_tx_queues; i++) {
qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
if (qdisc) {
+ nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
spin_lock_bh(qdisc_lock(qdisc));
qdisc_reset(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock) {
+ clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+ clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+ spin_unlock_bh(&qdisc->seqlock);
+ }
}
}
}
--
cgit v1.2.3
From 165573e41f2f66ef98940cf65f838b2cb575d9d1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 2 Mar 2026 20:55:27 +0000
Subject: tcp: secure_seq: add back ports to TS offset
This reverts 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets")
tcp_tw_recycle went away in 2017.
Zhouyan Deng reported off-path TCP source port leakage via
SYN cookie side-channel that can be fixed in multiple ways.
One of them is to bring back TCP ports in TS offset randomization.
As a bonus, we perform a single siphash() computation
to provide both an ISN and a TS offset.
Fixes: 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets")
Reported-by: Zhouyan Deng
Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Acked-by: Florian Westphal
Link: https://patch.msgid.link/20260302205527.1982836-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
include/net/secure_seq.h | 45 ++++++++++++++++++++++-----
include/net/tcp.h | 6 ++--
net/core/secure_seq.c | 80 ++++++++++++++++++------------------------------
net/ipv4/syncookies.c | 11 +++++--
net/ipv4/tcp_input.c | 8 +++--
net/ipv4/tcp_ipv4.c | 37 ++++++++++------------
net/ipv6/syncookies.c | 11 +++++--
net/ipv6/tcp_ipv6.c | 37 ++++++++++------------
8 files changed, 127 insertions(+), 108 deletions(-)
(limited to 'include')
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index cddebafb9f77..6f996229167b 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -5,16 +5,47 @@
#include
struct net;
+extern struct net init_net;
+
+union tcp_seq_and_ts_off {
+ struct {
+ u32 seq;
+ u32 ts_off;
+ };
+ u64 hash64;
+};
u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport);
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport);
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr);
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport);
-u32 secure_tcpv6_ts_off(const struct net *net,
- const __be32 *saddr, const __be32 *daddr);
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport);
+
+static inline u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ union tcp_seq_and_ts_off ts;
+
+ ts = secure_tcp_seq_and_ts_off(&init_net, saddr, daddr,
+ sport, dport);
+
+ return ts.seq;
+}
+
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+ const __be32 *daddr,
+ __be16 sport, __be16 dport);
+
+static inline u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ union tcp_seq_and_ts_off ts;
+
+ ts = secure_tcpv6_seq_and_ts_off(&init_net, saddr, daddr,
+ sport, dport);
+ return ts.seq;
+}
#endif /* _NET_SECURE_SEQ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eb8bf63fdafc..978eea2d5df0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
#include
#include
#include
+#include
#include
#include
@@ -2464,8 +2465,9 @@ struct tcp_request_sock_ops {
struct flowi *fl,
struct request_sock *req,
u32 tw_isn);
- u32 (*init_seq)(const struct sk_buff *skb);
- u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
+ union tcp_seq_and_ts_off (*init_seq_and_ts_off)(
+ const struct net *net,
+ const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
struct tcp_fastopen_cookie *foc,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 9a3965680451..6a6f2cda5aae 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -20,7 +20,6 @@
#include
static siphash_aligned_key_t net_secret;
-static siphash_aligned_key_t ts_secret;
#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
@@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void)
{
net_get_random_once(&net_secret, sizeof(net_secret));
}
-
-static __always_inline void ts_secret_init(void)
-{
- net_get_random_once(&ts_secret, sizeof(ts_secret));
-}
#endif
#ifdef CONFIG_INET
@@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-u32 secure_tcpv6_ts_off(const struct net *net,
- const __be32 *saddr, const __be32 *daddr)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- };
-
- if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
- return 0;
-
- ts_secret_init();
- return siphash(&combined, offsetofend(typeof(combined), daddr),
- &ts_secret);
-}
-EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
-
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+ const __be32 *daddr, __be16 sport, __be16 dport)
{
const struct {
struct in6_addr saddr;
@@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
.sport = sport,
.dport = dport
};
- u32 hash;
+ union tcp_seq_and_ts_off st;
net_secret_init();
- hash = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- return seq_scale(hash);
+
+ st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ st.ts_off = 0;
+
+ st.seq = seq_scale(st.seq);
+ return st;
}
-EXPORT_SYMBOL(secure_tcpv6_seq);
+EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off);
u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
@@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
-{
- if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
- return 0;
-
- ts_secret_init();
- return siphash_2u32((__force u32)saddr, (__force u32)daddr,
- &ts_secret);
-}
-
/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
* but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
* it would be easy enough to have the former function use siphash_4u32, passing
* the arguments as separate u32.
*/
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
{
- u32 hash;
+ u32 ports = (__force u32)sport << 16 | (__force u32)dport;
+ union tcp_seq_and_ts_off st;
net_secret_init();
- hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- return seq_scale(hash);
+
+ st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ ports, &net_secret);
+
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ st.ts_off = 0;
+
+ st.seq = seq_scale(st.seq);
+ return st;
}
-EXPORT_SYMBOL_GPL(secure_tcp_seq);
+EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off);
u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 061751aabc8e..fc3affd9c801 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -378,9 +378,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
- tsoff = secure_tcp_ts_off(net,
- ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr);
+ union tcp_seq_and_ts_off st;
+
+ st = secure_tcp_seq_and_ts_off(net,
+ ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
+ tsoff = st.ts_off;
tcp_opt.rcv_tsecr -= tsoff;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7b03f2460751..cba89733d121 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7646,6 +7646,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
+ union tcp_seq_and_ts_off st;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
@@ -7715,9 +7716,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (!dst)
goto drop_and_free;
+ if (tmp_opt.tstamp_ok || (!want_cookie && !isn))
+ st = af_ops->init_seq_and_ts_off(net, skb);
+
if (tmp_opt.tstamp_ok) {
tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
- tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+ tcp_rsk(req)->ts_off = st.ts_off;
}
if (!want_cookie && !isn) {
int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
@@ -7739,7 +7743,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb);
+ isn = st.seq;
}
tcp_ecn_create_request(req, skb, sk, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 910c25cb24e1..c7b2463c2e25 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -105,17 +105,14 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
static DEFINE_MUTEX(tcp_exit_batch_mutex);
-static u32 tcp_v4_init_seq(const struct sk_buff *skb)
+static union tcp_seq_and_ts_off
+tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcp_seq(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr,
- tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
-}
-
-static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
-{
- return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+ return secure_tcp_seq_and_ts_off(net,
+ ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -327,15 +324,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len
rt = NULL;
if (likely(!tp->repair)) {
+ union tcp_seq_and_ts_off st;
+
+ st = secure_tcp_seq_and_ts_off(net,
+ inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port);
if (!tp->write_seq)
- WRITE_ONCE(tp->write_seq,
- secure_tcp_seq(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port));
- WRITE_ONCE(tp->tsoffset,
- secure_tcp_ts_off(net, inet->inet_saddr,
- inet->inet_daddr));
+ WRITE_ONCE(tp->write_seq, st.seq);
+ WRITE_ONCE(tp->tsoffset, st.ts_off);
}
atomic_set(&inet->inet_id, get_random_u16());
@@ -1677,8 +1675,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.cookie_init_seq = cookie_v4_init_sequence,
#endif
.route_req = tcp_v4_route_req,
- .init_seq = tcp_v4_init_seq,
- .init_ts_off = tcp_v4_init_ts_off,
+ .init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off,
.send_synack = tcp_v4_send_synack,
};
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7e007f013ec8..4f6f0d751d6c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -151,9 +151,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
- tsoff = secure_tcpv6_ts_off(net,
- ipv6_hdr(skb)->daddr.s6_addr32,
- ipv6_hdr(skb)->saddr.s6_addr32);
+ union tcp_seq_and_ts_off st;
+
+ st = secure_tcpv6_seq_and_ts_off(net,
+ ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
+ tsoff = st.ts_off;
tcp_opt.rcv_tsecr -= tsoff;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5195a46b951e..bb09d5ccf599 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -105,18 +105,14 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
}
}
-static u32 tcp_v6_init_seq(const struct sk_buff *skb)
+static union tcp_seq_and_ts_off
+tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
- ipv6_hdr(skb)->saddr.s6_addr32,
- tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
-}
-
-static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
-{
- return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32,
- ipv6_hdr(skb)->saddr.s6_addr32);
+ return secure_tcpv6_seq_and_ts_off(net,
+ ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
}
static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
@@ -320,14 +316,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
sk_set_txhash(sk);
if (likely(!tp->repair)) {
+ union tcp_seq_and_ts_off st;
+
+ st = secure_tcpv6_seq_and_ts_off(net,
+ np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport);
if (!tp->write_seq)
- WRITE_ONCE(tp->write_seq,
- secure_tcpv6_seq(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport));
- tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32);
+ WRITE_ONCE(tp->write_seq, st.seq);
+ tp->tsoffset = st.ts_off;
}
if (tcp_fastopen_defer_connect(sk, &err))
@@ -817,8 +815,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.cookie_init_seq = cookie_v6_init_sequence,
#endif
.route_req = tcp_v6_route_req,
- .init_seq = tcp_v6_init_seq,
- .init_ts_off = tcp_v6_init_ts_off,
+ .init_seq_and_ts_off = tcp_v6_init_seq_and_ts_off,
.send_synack = tcp_v6_send_synack,
};
--
cgit v1.2.3
From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Mon, 2 Mar 2026 17:26:31 +0100
Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock
After acquiring netdev_queue::_xmit_lock the number of the CPU owning
the lock is recorded in netdev_queue::xmit_lock_owner. This works as
long as the BH context is not preemptible.
On PREEMPT_RT the softirq context is preemptible and without the
softirq-lock it is possible to have multiple user in __dev_queue_xmit()
submitting a skb on the same CPU. This is fine in general but this means
also that the current CPU is recorded as netdev_queue::xmit_lock_owner.
This in turn leads to the recursion alert and the skb is dropped.
Instead checking the for CPU number, that owns the lock, PREEMPT_RT can
check if the lockowner matches the current task.
Add netif_tx_owned() which returns true if the current context owns the
lock by comparing the provided CPU number with the recorded number. This
resembles the current check by negating the condition (the current check
returns true if the lock is not owned).
On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare
the current task against it.
Use the new helper in __dev_queue_xmit() and netif_local_xmit_active()
which provides a similar check.
Update comments regarding pairing READ_ONCE().
Reported-by: Bert Karwatzki
Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de
Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT")
Signed-off-by: Sebastian Andrzej Siewior
Reported-by: Bert Karwatzki
Signed-off-by: Sebastian Andrzej Siewior
Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de
Signed-off-by: Paolo Abeni
---
include/linux/netdevice.h | 27 ++++++++++++++++++++++-----
net/core/dev.c | 5 +----
net/core/netpoll.c | 2 +-
3 files changed, 24 insertions(+), 10 deletions(-)
(limited to 'include')
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d4e6e00bb90a..67e25f6d15a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4711,7 +4711,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
spin_lock(&txq->_xmit_lock);
- /* Pairs with READ_ONCE() in __dev_queue_xmit() */
+ /* Pairs with READ_ONCE() in netif_tx_owned() */
WRITE_ONCE(txq->xmit_lock_owner, cpu);
}
@@ -4729,7 +4729,7 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
spin_lock_bh(&txq->_xmit_lock);
- /* Pairs with READ_ONCE() in __dev_queue_xmit() */
+ /* Pairs with READ_ONCE() in netif_tx_owned() */
WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}
@@ -4738,7 +4738,7 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq)
bool ok = spin_trylock(&txq->_xmit_lock);
if (likely(ok)) {
- /* Pairs with READ_ONCE() in __dev_queue_xmit() */
+ /* Pairs with READ_ONCE() in netif_tx_owned() */
WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}
return ok;
@@ -4746,14 +4746,14 @@ static inline bool __netif_tx_trylock(struct netdev_queue *txq)
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
- /* Pairs with READ_ONCE() in __dev_queue_xmit() */
+ /* Pairs with READ_ONCE() in netif_tx_owned() */
WRITE_ONCE(txq->xmit_lock_owner, -1);
spin_unlock(&txq->_xmit_lock);
}
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
- /* Pairs with READ_ONCE() in __dev_queue_xmit() */
+ /* Pairs with READ_ONCE() in netif_tx_owned() */
WRITE_ONCE(txq->xmit_lock_owner, -1);
spin_unlock_bh(&txq->_xmit_lock);
}
@@ -4846,6 +4846,23 @@ static inline void netif_tx_disable(struct net_device *dev)
local_bh_enable();
}
+#ifndef CONFIG_PREEMPT_RT
+static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu)
+{
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ return READ_ONCE(txq->xmit_lock_owner) == cpu;
+}
+
+#else
+static inline bool netif_tx_owned(struct netdev_queue *txq, unsigned int cpu)
+{
+ return rt_mutex_owner(&txq->_xmit_lock.lock) == current;
+}
+
+#endif
+
static inline void netif_addr_lock(struct net_device *dev)
{
unsigned char nest_level = 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index 20610a192ec7..14a83f2035b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4818,10 +4818,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- /* Other cpus might concurrently change txq->xmit_lock_owner
- * to -1 or to their cpu id, but not to our id.
- */
- if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+ if (!netif_tx_owned(txq, cpu)) {
bool is_list = false;
if (dev_xmit_recursion())
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a8558a52884f..cd74beffd209 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev)
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+ if (netif_tx_owned(txq, smp_processor_id()))
return 1;
}
--
cgit v1.2.3
From fb7fb4016300ac622c964069e286dc83166a5d52 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Mon, 2 Mar 2026 23:28:15 +0100
Subject: netfilter: nf_tables: clone set on flush only
Syzbot with fault injection triggered a failing memory allocation with
GFP_KERNEL which results in a WARN splat:
iter.err
WARNING: net/netfilter/nf_tables_api.c:845 at nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845, CPU#0: syz.0.17/5992
Modules linked in:
CPU: 0 UID: 0 PID: 5992 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026
RIP: 0010:nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845
Code: 8b 05 86 5a 4e 09 48 3b 84 24 a0 00 00 00 75 62 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc e8 63 6d fa f7 90 <0f> 0b 90 43
+80 7c 35 00 00 0f 85 23 fe ff ff e9 26 fe ff ff 89 d9
RSP: 0018:ffffc900045af780 EFLAGS: 00010293
RAX: ffffffff89ca45bd RBX: 00000000fffffff4 RCX: ffff888028111e40
RDX: 0000000000000000 RSI: 00000000fffffff4 RDI: 0000000000000000
RBP: ffffc900045af870 R08: 0000000000400dc0 R09: 00000000ffffffff
R10: dffffc0000000000 R11: fffffbfff1d141db R12: ffffc900045af7e0
R13: 1ffff920008b5f24 R14: dffffc0000000000 R15: ffffc900045af920
FS: 000055557a6a5500(0000) GS:ffff888125496000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fb5ea271fc0 CR3: 000000003269e000 CR4: 00000000003526f0
Call Trace:
__nft_release_table+0xceb/0x11f0 net/netfilter/nf_tables_api.c:12115
nft_rcv_nl_event+0xc25/0xdb0 net/netfilter/nf_tables_api.c:12187
notifier_call_chain+0x19d/0x3a0 kernel/notifier.c:85
blocking_notifier_call_chain+0x6a/0x90 kernel/notifier.c:380
netlink_release+0x123b/0x1ad0 net/netlink/af_netlink.c:761
__sock_release net/socket.c:662 [inline]
sock_close+0xc3/0x240 net/socket.c:1455
Restrict set clone to the flush set command in the preparation phase.
Add NFT_ITER_UPDATE_CLONE and use it for this purpose, update the rbtree
and pipapo backends to only clone the set when this iteration type is
used.
As for the existing NFT_ITER_UPDATE type, update the pipapo backend to
use the existing set clone if available, otherwise use the existing set
representation. After this update, there is no need to clone a set that
is being deleted, this includes bound anonymous set.
An alternative approach to NFT_ITER_UPDATE_CLONE is to add a .clone
interface and call it from the flush set path.
Reported-by: syzbot+4924a0edc148e8b4b342@syzkaller.appspotmail.com
Fixes: 3f1d886cc7c3 ("netfilter: nft_set_pipapo: move cloning of match info to insert/removal path")
Signed-off-by: Pablo Neira Ayuso
Signed-off-by: Florian Westphal
---
include/net/netfilter/nf_tables.h | 2 ++
net/netfilter/nf_tables_api.c | 10 +++++++++-
net/netfilter/nft_set_hash.c | 1 +
net/netfilter/nft_set_pipapo.c | 11 +++++++++--
net/netfilter/nft_set_rbtree.c | 8 +++++---
5 files changed, 26 insertions(+), 6 deletions(-)
(limited to 'include')
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 426534a711b0..ea6f29ad7888 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -320,11 +320,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
* @NFT_ITER_UNSPEC: unspecified, to catch errors
* @NFT_ITER_READ: read-only iteration over set elements
* @NFT_ITER_UPDATE: iteration under mutex to update set element state
+ * @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element
*/
enum nft_iter_type {
NFT_ITER_UNSPEC,
NFT_ITER_READ,
NFT_ITER_UPDATE,
+ NFT_ITER_UPDATE_CLONE,
};
struct nft_set;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index df67932d3e09..058f7004cb2b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -833,6 +833,11 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
}
}
+/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation
+ * phase, the set clone might already exist from a previous command, or it might
+ * be a set that is going away and does not require a clone. The netns and
+ * netlink release paths also need to work on the live set.
+ */
static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
{
struct nft_set_iter iter = {
@@ -7903,9 +7908,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
{
+ /* The set backend might need to clone the set, do it now from the
+ * preparation phase, use NFT_ITER_UPDATE_CLONE iterator type.
+ */
struct nft_set_iter iter = {
.genmask = genmask,
- .type = NFT_ITER_UPDATE,
+ .type = NFT_ITER_UPDATE_CLONE,
.fn = nft_setelem_flush,
};
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 739b992bde59..b0e571c8e3f3 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
{
switch (iter->type) {
case NFT_ITER_UPDATE:
+ case NFT_ITER_UPDATE_CLONE:
/* only relevant for netlink dumps which use READ type */
WARN_ON_ONCE(iter->skip != 0);
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 7ef4b44471d3..c091898df710 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2144,13 +2144,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
const struct nft_pipapo_match *m;
switch (iter->type) {
- case NFT_ITER_UPDATE:
+ case NFT_ITER_UPDATE_CLONE:
m = pipapo_maybe_clone(set);
if (!m) {
iter->err = -ENOMEM;
return;
}
-
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ break;
+ case NFT_ITER_UPDATE:
+ if (priv->clone)
+ m = priv->clone;
+ else
+ m = rcu_dereference_protected(priv->match,
+ nft_pipapo_transaction_mutex_held(set));
nft_pipapo_do_walk(ctx, set, m, iter);
break;
case NFT_ITER_READ:
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 3f02e4478216..ee3d4f5b9ff7 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -861,13 +861,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
struct nft_rbtree *priv = nft_set_priv(set);
switch (iter->type) {
- case NFT_ITER_UPDATE:
- lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
-
+ case NFT_ITER_UPDATE_CLONE:
if (nft_array_may_resize(set) < 0) {
iter->err = -ENOMEM;
break;
}
+ fallthrough;
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
nft_rbtree_do_walk(ctx, set, iter);
break;
case NFT_ITER_READ:
--
cgit v1.2.3
From 9df95785d3d8302f7c066050117b04cd3c2048c2 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 3 Mar 2026 16:31:32 +0100
Subject: netfilter: nft_set_pipapo: split gc into unlink and reclaim phase
Yiming Qian reports Use-after-free in the pipapo set type:
Under a large number of expired elements, commit-time GC can run for a very
long time in a non-preemptible context, triggering soft lockup warnings and
RCU stall reports (local denial of service).
We must split GC in an unlink and a reclaim phase.
We cannot queue elements for freeing until pointers have been swapped.
Expired elements are still exposed to both the packet path and userspace
dumpers via the live copy of the data structure.
call_rcu() does not protect us: dump operations or element lookups starting
after call_rcu has fired can still observe the free'd element, unless the
commit phase has made enough progress to swap the clone and live pointers
before any new reader has picked up the old version.
This a similar approach as done recently for the rbtree backend in commit
35f83a75529a ("netfilter: nft_set_rbtree: don't gc elements on insert").
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reported-by: Yiming Qian
Signed-off-by: Florian Westphal
---
include/net/netfilter/nf_tables.h | 5 ++++
net/netfilter/nf_tables_api.c | 5 ----
net/netfilter/nft_set_pipapo.c | 51 +++++++++++++++++++++++++++++++++------
net/netfilter/nft_set_pipapo.h | 2 ++
4 files changed, 50 insertions(+), 13 deletions(-)
(limited to 'include')
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ea6f29ad7888..e2d2bfc1f989 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1863,6 +1863,11 @@ struct nft_trans_gc {
struct rcu_head rcu;
};
+static inline int nft_trans_gc_space(const struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
static inline void nft_ctx_update(struct nft_ctx *ctx,
const struct nft_trans *trans)
{
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 058f7004cb2b..1862bd7fe804 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -10493,11 +10493,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
schedule_work(&trans_gc_work);
}
-static int nft_trans_gc_space(struct nft_trans_gc *trans)
-{
- return NFT_TRANS_GC_BATCHCOUNT - trans->count;
-}
-
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
unsigned int gc_seq, gfp_t gfp)
{
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index c091898df710..a34632ae6048 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1680,11 +1680,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
}
/**
- * pipapo_gc() - Drop expired entries from set, destroy start and end elements
+ * pipapo_gc_scan() - Drop expired entries from set and link them to gc list
* @set: nftables API set representation
* @m: Matching data
*/
-static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct net *net = read_pnet(&set->net);
@@ -1697,6 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
if (!gc)
return;
+ list_add(&gc->list, &priv->gc_head);
+
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const struct nft_pipapo_field *f;
@@ -1724,9 +1726,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
* NFT_SET_ELEM_DEAD_BIT.
*/
if (__nft_set_elem_expired(&e->ext, tstamp)) {
- gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
- if (!gc)
- return;
+ if (!nft_trans_gc_space(gc)) {
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ list_add(&gc->list, &priv->gc_head);
+ }
nft_pipapo_gc_deactivate(net, set, e);
pipapo_drop(m, rulemap);
@@ -1740,10 +1746,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
}
}
- gc = nft_trans_gc_catchall_sync(gc);
+ priv->last_gc = jiffies;
+}
+
+/**
+ * pipapo_gc_queue() - Free expired elements
+ * @set: nftables API set representation
+ */
+static void pipapo_gc_queue(struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_trans_gc *gc, *next;
+
+ /* always do a catchall cycle: */
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
if (gc) {
+ gc = nft_trans_gc_catchall_sync(gc);
+ if (gc)
+ nft_trans_gc_queue_sync_done(gc);
+ }
+
+ /* always purge queued gc elements. */
+ list_for_each_entry_safe(gc, next, &priv->gc_head, list) {
+ list_del(&gc->list);
nft_trans_gc_queue_sync_done(gc);
- priv->last_gc = jiffies;
}
}
@@ -1797,6 +1823,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
*
* We also need to create a new working copy for subsequent insertions and
* deletions.
+ *
+ * After the live copy has been replaced by the clone, we can safely queue
+ * expired elements that have been collected by pipapo_gc_scan() for
+ * memory reclaim.
*/
static void nft_pipapo_commit(struct nft_set *set)
{
@@ -1807,7 +1837,7 @@ static void nft_pipapo_commit(struct nft_set *set)
return;
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- pipapo_gc(set, priv->clone);
+ pipapo_gc_scan(set, priv->clone);
old = rcu_replace_pointer(priv->match, priv->clone,
nft_pipapo_transaction_mutex_held(set));
@@ -1815,6 +1845,8 @@ static void nft_pipapo_commit(struct nft_set *set)
if (old)
call_rcu(&old->rcu, pipapo_reclaim_match);
+
+ pipapo_gc_queue(set);
}
static void nft_pipapo_abort(const struct nft_set *set)
@@ -2279,6 +2311,7 @@ static int nft_pipapo_init(const struct nft_set *set,
f->mt = NULL;
}
+ INIT_LIST_HEAD(&priv->gc_head);
rcu_assign_pointer(priv->match, m);
return 0;
@@ -2328,6 +2361,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx,
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
+ WARN_ON_ONCE(!list_empty(&priv->gc_head));
+
m = rcu_dereference_protected(priv->match, true);
if (priv->clone) {
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index eaab422aa56a..9aee9a9eaeb7 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -156,12 +156,14 @@ struct nft_pipapo_match {
* @clone: Copy where pending insertions and deletions are kept
* @width: Total bytes to be matched for one packet, including padding
* @last_gc: Timestamp of last garbage collection run, jiffies
+ * @gc_head: list of nft_trans_gc to queue up for mem reclaim
*/
struct nft_pipapo {
struct nft_pipapo_match __rcu *match;
struct nft_pipapo_match *clone;
int width;
unsigned long last_gc;
+ struct list_head gc_head;
};
struct nft_pipapo_elem;
--
cgit v1.2.3
From e2cedd400c3ec0302ffca2490e8751772906ac23 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim
Date: Wed, 4 Mar 2026 09:06:02 -0500
Subject: net/sched: act_ife: Fix metalist update behavior
Whenever an ife action replace changes the metalist, instead of
replacing the old data on the metalist, the current ife code is appending
the new metadata. Aside from being innapropriate behavior, this may lead
to an unbounded addition of metadata to the metalist which might cause an
out of bounds error when running the encode op:
[ 138.423369][ C1] ==================================================================
[ 138.424317][ C1] BUG: KASAN: slab-out-of-bounds in ife_tlv_meta_encode (net/ife/ife.c:168)
[ 138.424906][ C1] Write of size 4 at addr ffff8880077f4ffe by task ife_out_out_bou/255
[ 138.425778][ C1] CPU: 1 UID: 0 PID: 255 Comm: ife_out_out_bou Not tainted 7.0.0-rc1-00169-gfbdfa8da05b6 #624 PREEMPT(full)
[ 138.425795][ C1] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 138.425800][ C1] Call Trace:
[ 138.425804][ C1]
[ 138.425808][ C1] dump_stack_lvl (lib/dump_stack.c:122)
[ 138.425828][ C1] print_report (mm/kasan/report.c:379 mm/kasan/report.c:482)
[ 138.425839][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[ 138.425844][ C1] ? __virt_addr_valid (./arch/x86/include/asm/preempt.h:95 (discriminator 1) ./include/linux/rcupdate.h:975 (discriminator 1) ./include/linux/mmzone.h:2207 (discriminator 1) arch/x86/mm/physaddr.c:54 (discriminator 1))
[ 138.425853][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168)
[ 138.425859][ C1] kasan_report (mm/kasan/report.c:221 mm/kasan/report.c:597)
[ 138.425868][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168)
[ 138.425878][ C1] kasan_check_range (mm/kasan/generic.c:186 (discriminator 1) mm/kasan/generic.c:200 (discriminator 1))
[ 138.425884][ C1] __asan_memset (mm/kasan/shadow.c:84 (discriminator 2))
[ 138.425889][ C1] ife_tlv_meta_encode (net/ife/ife.c:168)
[ 138.425893][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:171)
[ 138.425898][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[ 138.425903][ C1] ife_encode_meta_u16 (net/sched/act_ife.c:57)
[ 138.425910][ C1] ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114)
[ 138.425916][ C1] ? __asan_memcpy (mm/kasan/shadow.c:105 (discriminator 3))
[ 138.425921][ C1] ? __pfx_ife_encode_meta_u16 (net/sched/act_ife.c:45)
[ 138.425927][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221)
[ 138.425931][ C1] tcf_ife_act (net/sched/act_ife.c:847 net/sched/act_ife.c:879)
To solve this issue, fix the replace behavior by adding the metalist to
the ife rcu data structure.
Fixes: aa9fd9a325d51 ("sched: act: ife: update parameters via rcu handling")
Reported-by: Ruitong Liu
Tested-by: Ruitong Liu
Co-developed-by: Victor Nogueira
Signed-off-by: Victor Nogueira
Signed-off-by: Jamal Hadi Salim
Link: https://patch.msgid.link/20260304140603.76500-1-jhs@mojatatu.com
Signed-off-by: Jakub Kicinski
---
include/net/tc_act/tc_ife.h | 4 +-
net/sched/act_ife.c | 93 +++++++++++++++++++++------------------------
2 files changed, 45 insertions(+), 52 deletions(-)
(limited to 'include')
diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index c7f24a2da1ca..24d4d5a62b3c 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -13,15 +13,13 @@ struct tcf_ife_params {
u8 eth_src[ETH_ALEN];
u16 eth_type;
u16 flags;
-
+ struct list_head metalist;
struct rcu_head rcu;
};
struct tcf_ife_info {
struct tc_action common;
struct tcf_ife_params __rcu *params;
- /* list of metaids allowed */
- struct list_head metalist;
};
#define to_ife(a) ((struct tcf_ife_info *)a)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 79df81d12894..d5e8a91bb4eb 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -293,8 +293,8 @@ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
/* called when adding new meta information
*/
static int __add_metainfo(const struct tcf_meta_ops *ops,
- struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len, bool atomic, bool exists)
+ struct tcf_ife_params *p, u32 metaid, void *metaval,
+ int len, bool atomic)
{
struct tcf_meta_info *mi = NULL;
int ret = 0;
@@ -313,45 +313,40 @@ static int __add_metainfo(const struct tcf_meta_ops *ops,
}
}
- if (exists)
- spin_lock_bh(&ife->tcf_lock);
- list_add_tail(&mi->metalist, &ife->metalist);
- if (exists)
- spin_unlock_bh(&ife->tcf_lock);
+ list_add_tail(&mi->metalist, &p->metalist);
return ret;
}
static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops,
- struct tcf_ife_info *ife, u32 metaid,
- bool exists)
+ struct tcf_ife_params *p, u32 metaid)
{
int ret;
if (!try_module_get(ops->owner))
return -ENOENT;
- ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists);
+ ret = __add_metainfo(ops, p, metaid, NULL, 0, true);
if (ret)
module_put(ops->owner);
return ret;
}
-static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len, bool exists)
+static int add_metainfo(struct tcf_ife_params *p, u32 metaid, void *metaval,
+ int len)
{
const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret;
if (!ops)
return -ENOENT;
- ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
+ ret = __add_metainfo(ops, p, metaid, metaval, len, false);
if (ret)
/*put back what find_ife_oplist took */
module_put(ops->owner);
return ret;
}
-static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
+static int use_all_metadata(struct tcf_ife_params *p)
{
struct tcf_meta_ops *o;
int rc = 0;
@@ -359,7 +354,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
read_lock(&ife_mod_lock);
list_for_each_entry(o, &ifeoplist, list) {
- rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists);
+ rc = add_metainfo_and_get_ops(o, p, o->metaid);
if (rc == 0)
installed += 1;
}
@@ -371,7 +366,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
return -EINVAL;
}
-static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_params *p)
{
struct tcf_meta_info *e;
struct nlattr *nest;
@@ -379,14 +374,14 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
int total_encoded = 0;
/*can only happen on decode */
- if (list_empty(&ife->metalist))
+ if (list_empty(&p->metalist))
return 0;
nest = nla_nest_start_noflag(skb, TCA_IFE_METALST);
if (!nest)
goto out_nlmsg_trim;
- list_for_each_entry(e, &ife->metalist, metalist) {
+ list_for_each_entry(e, &p->metalist, metalist) {
if (!e->ops->get(skb, e))
total_encoded += 1;
}
@@ -403,13 +398,11 @@ out_nlmsg_trim:
return -1;
}
-/* under ife->tcf_lock */
-static void _tcf_ife_cleanup(struct tc_action *a)
+static void __tcf_ife_cleanup(struct tcf_ife_params *p)
{
- struct tcf_ife_info *ife = to_ife(a);
struct tcf_meta_info *e, *n;
- list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ list_for_each_entry_safe(e, n, &p->metalist, metalist) {
list_del(&e->metalist);
if (e->metaval) {
if (e->ops->release)
@@ -422,18 +415,23 @@ static void _tcf_ife_cleanup(struct tc_action *a)
}
}
+static void tcf_ife_cleanup_params(struct rcu_head *head)
+{
+ struct tcf_ife_params *p = container_of(head, struct tcf_ife_params,
+ rcu);
+
+ __tcf_ife_cleanup(p);
+ kfree(p);
+}
+
static void tcf_ife_cleanup(struct tc_action *a)
{
struct tcf_ife_info *ife = to_ife(a);
struct tcf_ife_params *p;
- spin_lock_bh(&ife->tcf_lock);
- _tcf_ife_cleanup(a);
- spin_unlock_bh(&ife->tcf_lock);
-
p = rcu_dereference_protected(ife->params, 1);
if (p)
- kfree_rcu(p, rcu);
+ call_rcu(&p->rcu, tcf_ife_cleanup_params);
}
static int load_metalist(struct nlattr **tb, bool rtnl_held)
@@ -455,8 +453,7 @@ static int load_metalist(struct nlattr **tb, bool rtnl_held)
return 0;
}
-static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
- bool exists, bool rtnl_held)
+static int populate_metalist(struct tcf_ife_params *p, struct nlattr **tb)
{
int len = 0;
int rc = 0;
@@ -468,7 +465,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = add_metainfo(ife, i, val, len, exists);
+ rc = add_metainfo(p, i, val, len);
if (rc)
return rc;
}
@@ -523,6 +520,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
p = kzalloc_obj(*p);
if (!p)
return -ENOMEM;
+ INIT_LIST_HEAD(&p->metalist);
if (tb[TCA_IFE_METALST]) {
err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
@@ -567,8 +565,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
}
ife = to_ife(*a);
- if (ret == ACT_P_CREATED)
- INIT_LIST_HEAD(&ife->metalist);
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
if (err < 0)
@@ -600,8 +596,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
}
if (tb[TCA_IFE_METALST]) {
- err = populate_metalist(ife, tb2, exists,
- !(flags & TCA_ACT_FLAGS_NO_RTNL));
+ err = populate_metalist(p, tb2);
if (err)
goto metadata_parse_err;
} else {
@@ -610,7 +605,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
* as we can. You better have at least one else we are
* going to bail out
*/
- err = use_all_metadata(ife, exists);
+ err = use_all_metadata(p);
if (err)
goto metadata_parse_err;
}
@@ -626,13 +621,14 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
if (p)
- kfree_rcu(p, rcu);
+ call_rcu(&p->rcu, tcf_ife_cleanup_params);
return ret;
metadata_parse_err:
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
release_idr:
+ __tcf_ife_cleanup(p);
kfree(p);
tcf_idr_release(*a, bind);
return err;
@@ -679,7 +675,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type))
goto nla_put_failure;
- if (dump_metalist(skb, ife)) {
+ if (dump_metalist(skb, p)) {
/*ignore failure to dump metalist */
pr_info("Failed to dump metalist\n");
}
@@ -693,13 +689,13 @@ nla_put_failure:
return -1;
}
-static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_params *p,
u16 metaid, u16 mlen, void *mdata)
{
struct tcf_meta_info *e;
/* XXX: use hash to speed up */
- list_for_each_entry(e, &ife->metalist, metalist) {
+ list_for_each_entry_rcu(e, &p->metalist, metalist) {
if (metaid == e->metaid) {
if (e->ops) {
/* We check for decode presence already */
@@ -716,10 +712,13 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
{
struct tcf_ife_info *ife = to_ife(a);
int action = ife->tcf_action;
+ struct tcf_ife_params *p;
u8 *ifehdr_end;
u8 *tlv_data;
u16 metalen;
+ p = rcu_dereference_bh(ife->params);
+
bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
tcf_lastuse_update(&ife->tcf_tm);
@@ -745,7 +744,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
return TC_ACT_SHOT;
}
- if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
+ if (find_decode_metaid(skb, p, mtype, dlen, curr_data)) {
/* abuse overlimits to count when we receive metadata
* but dont have an ops for it
*/
@@ -769,12 +768,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
/*XXX: check if we can do this at install time instead of current
* send data path
**/
-static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_params *p)
{
- struct tcf_meta_info *e, *n;
+ struct tcf_meta_info *e;
int tot_run_sz = 0, run_sz = 0;
- list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ list_for_each_entry_rcu(e, &p->metalist, metalist) {
if (e->ops->check_presence) {
run_sz = e->ops->check_presence(skb, e);
tot_run_sz += run_sz;
@@ -795,7 +794,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
where ORIGDATA = original ethernet header ...
*/
- u16 metalen = ife_get_sz(skb, ife);
+ u16 metalen = ife_get_sz(skb, p);
int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
unsigned int skboff = 0;
int new_len = skb->len + hdrm;
@@ -833,25 +832,21 @@ drop:
if (!ife_meta)
goto drop;
- spin_lock(&ife->tcf_lock);
-
/* XXX: we dont have a clever way of telling encode to
* not repeat some of the computations that are done by
* ops->presence_check...
*/
- list_for_each_entry(e, &ife->metalist, metalist) {
+ list_for_each_entry_rcu(e, &p->metalist, metalist) {
if (e->ops->encode) {
err = e->ops->encode(skb, (void *)(ife_meta + skboff),
e);
}
if (err < 0) {
/* too corrupt to keep around if overwritten */
- spin_unlock(&ife->tcf_lock);
goto drop;
}
skboff += err;
}
- spin_unlock(&ife->tcf_lock);
oethh = (struct ethhdr *)skb->data;
if (!is_zero_ether_addr(p->eth_src))
--
cgit v1.2.3
From 16394d80539937d348dd3b9ea32415c54e67a81b Mon Sep 17 00:00:00 2001
From: Larysa Zaremba
Date: Thu, 5 Mar 2026 12:12:43 +0100
Subject: xsk: introduce helper to determine rxq->frag_size
rxq->frag_size is basically a step between consecutive strictly aligned
frames. In ZC mode, chunk size fits exactly, but if chunks are unaligned,
there is no safe way to determine accessible space to grow tailroom.
Report frag_size to be zero, if chunks are unaligned, chunk_size otherwise.
Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Reviewed-by: Aleksandr Loktionov
Signed-off-by: Larysa Zaremba
Link: https://patch.msgid.link/20260305111253.2317394-3-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski
---
include/net/xdp_sock_drv.h | 10 ++++++++++
1 file changed, 10 insertions(+)
(limited to 'include')
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index aefc368449d5..6b9ebae2dc95 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -51,6 +51,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool);
}
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+ return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool);
+}
+
static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
@@ -337,6 +342,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
return 0;
}
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+ return 0;
+}
+
static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
--
cgit v1.2.3
From 75d9228982f23d68066ca0b7d87014c3eb8ddc85 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba
Date: Thu, 5 Mar 2026 12:12:48 +0100
Subject: libeth, idpf: use truesize as XDP RxQ info frag_size
The only user of frag_size field in XDP RxQ info is
bpf_xdp_frags_increase_tail(). It clearly expects whole buffer size instead
of DMA write size. Different assumptions in idpf driver configuration lead
to negative tailroom.
To make it worse, buffer sizes are not actually uniform in idpf when
splitq is enabled, as there are several buffer queues, so rxq->rx_buf_size
is meaningless in this case.
Use truesize of the first bufq in AF_XDP ZC, as there is only one. Disable
growing tail for regular splitq.
Fixes: ac8a861f632e ("idpf: prepare structures to support XDP")
Reviewed-by: Aleksandr Loktionov
Signed-off-by: Larysa Zaremba
Link: https://patch.msgid.link/20260305111253.2317394-8-larysa.zaremba@intel.com
Signed-off-by: Jakub Kicinski
---
drivers/net/ethernet/intel/idpf/xdp.c | 6 +++++-
drivers/net/ethernet/intel/idpf/xsk.c | 1 +
drivers/net/ethernet/intel/libeth/xsk.c | 1 +
include/net/libeth/xsk.h | 3 +++
4 files changed, 10 insertions(+), 1 deletion(-)
(limited to 'include')
diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
index 6ac9c6624c2a..cbccd4546768 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.c
+++ b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -47,12 +47,16 @@ static int __idpf_xdp_rxq_info_init(struct idpf_rx_queue *rxq, void *arg)
{
const struct idpf_vport *vport = rxq->q_vector->vport;
const struct idpf_q_vec_rsrc *rsrc;
+ u32 frag_size = 0;
bool split;
int err;
+ if (idpf_queue_has(XSK, rxq))
+ frag_size = rxq->bufq_sets[0].bufq.truesize;
+
err = __xdp_rxq_info_reg(&rxq->xdp_rxq, vport->netdev, rxq->idx,
rxq->q_vector->napi.napi_id,
- rxq->rx_buf_size);
+ frag_size);
if (err)
return err;
diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c
index 676cbd80774d..d95d3efdfd36 100644
--- a/drivers/net/ethernet/intel/idpf/xsk.c
+++ b/drivers/net/ethernet/intel/idpf/xsk.c
@@ -403,6 +403,7 @@ int idpf_xskfq_init(struct idpf_buf_queue *bufq)
bufq->pending = fq.pending;
bufq->thresh = fq.thresh;
bufq->rx_buf_size = fq.buf_len;
+ bufq->truesize = fq.truesize;
if (!idpf_xskfq_refill(bufq))
netdev_err(bufq->pool->netdev,
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
index 846e902e31b6..4882951d5c9c 100644
--- a/drivers/net/ethernet/intel/libeth/xsk.c
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -167,6 +167,7 @@ int libeth_xskfq_create(struct libeth_xskfq *fq)
fq->pending = fq->count;
fq->thresh = libeth_xdp_queue_threshold(fq->count);
fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool);
+ fq->truesize = xsk_pool_get_rx_frag_step(fq->pool);
return 0;
}
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index 481a7b28e6f2..82b5d21aae87 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -597,6 +597,7 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
* @pending: current number of XSkFQEs to refill
* @thresh: threshold below which the queue is refilled
* @buf_len: HW-writeable length per each buffer
+ * @truesize: step between consecutive buffers, 0 if none exists
* @nid: ID of the closest NUMA node with memory
*/
struct libeth_xskfq {
@@ -614,6 +615,8 @@ struct libeth_xskfq {
u32 thresh;
u32 buf_len;
+ u32 truesize;
+
int nid;
};
--
cgit v1.2.3