From aa8a3f3c67235422a0c3608a8772f69ca3b7b63f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 24 Feb 2026 00:05:11 +0100 Subject: xfrm: add missing extack for XFRMA_SA_PCPU in add_acquire and allocspi We're returning an error caused by invalid user input without setting an extack. Add one. Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 403b5ecac2c5..3e6477c6082e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1850,6 +1850,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); if (pcpu_num >= num_possible_cpus()) { err = -EINVAL; + NL_SET_ERR_MSG(extack, "pCPU number too big"); goto out_noput; } } @@ -3001,8 +3002,10 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_SA_PCPU]) { x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); err = -EINVAL; - if (x->pcpu_num >= num_possible_cpus()) + if (x->pcpu_num >= num_possible_cpus()) { + NL_SET_ERR_MSG(extack, "pCPU number too big"); goto free_state; + } } err = verify_newpolicy_info(&ua->policy, extack); -- cgit v1.2.3 From b57defcf8f109da5ba9cf59b2a736606faf3d846 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 24 Feb 2026 00:05:12 +0100 Subject: xfrm: fix the condition on x->pcpu_num in xfrm_sa_len pcpu_num = 0 is a valid value. The marker for "unset pcpu_num" which makes copy_to_user_state_extra not add the XFRMA_SA_PCPU attribute is UINT_MAX. 
Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 3e6477c6082e..4dd8341225bc 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3676,7 +3676,7 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) } if (x->if_id) l += nla_total_size(sizeof(x->if_id)); - if (x->pcpu_num) + if (x->pcpu_num != UINT_MAX) l += nla_total_size(sizeof(x->pcpu_num)); /* Must count x->lastused as it may become non-zero behind our back. */ -- cgit v1.2.3 From 7d2fc41f91bc69acb6e01b0fa23cd7d0109a6a23 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 24 Feb 2026 00:05:13 +0100 Subject: xfrm: call xdo_dev_state_delete during state update When we update an SA, we construct a new state and call xdo_dev_state_add, but never insert it. The existing state is updated, then we immediately destroy the new state. Since we haven't added it, we don't go through the standard state delete code, and we're skipping removing it from the device (but xdo_dev_state_free will get called when we destroy the temporary state). This is similar to commit c5d4d7d83165 ("xfrm: Fix deletion of offloaded SAs on failure."). 
Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 98b362d51836..a00c4fe1ab0c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2264,6 +2264,7 @@ out: err = 0; x->km.state = XFRM_STATE_DEAD; + xfrm_dev_state_delete(x); __xfrm_state_put(x); } -- cgit v1.2.3 From 0c0eef8ccd2413b0a10eb6bbd3442333b1e64dd2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 24 Feb 2026 00:05:14 +0100 Subject: esp: fix skb leak with espintcp and async crypto When the TX queue for espintcp is full, esp_output_tail_tcp will return an error and not free the skb, because with synchronous crypto, the common xfrm output code will drop the packet for us. With async crypto (esp_output_done), we need to drop the skb when esp_output_tail_tcp returns an error. 
Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 9 ++++++--- net/ipv6/esp6.c | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 2c922afadb8f..6dfc0bcdef65 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -235,10 +235,13 @@ static void esp_output_done(void *data, int err) xfrm_dev_resume(skb); } else { if (!err && - x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) - esp_output_tail_tcp(x, skb); - else + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) { + err = esp_output_tail_tcp(x, skb); + if (err != -EINPROGRESS) + kfree_skb(skb); + } else { xfrm_output_resume(skb_to_full_sk(skb), skb, err); + } } } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e75da98f5283..9f75313734f8 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -271,10 +271,13 @@ static void esp_output_done(void *data, int err) xfrm_dev_resume(skb); } else { if (!err && - x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) - esp_output_tail_tcp(x, skb); - else + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) { + err = esp_output_tail_tcp(x, skb); + if (err != -EINPROGRESS) + kfree_skb(skb); + } else { xfrm_output_resume(skb_to_full_sk(skb), skb, err); + } } } -- cgit v1.2.3 From 0d10393d5eac33cbd92f7a41fddca12c41d3cb7e Mon Sep 17 00:00:00 2001 From: Roshan Kumar Date: Sun, 1 Mar 2026 10:56:38 +0000 Subject: xfrm: iptfs: validate inner IPv4 header length in IPTFS payload Add validation of the inner IPv4 packet tot_len and ihl fields parsed from decrypted IPTFS payloads in __input_process_payload(). A crafted ESP packet containing an inner IPv4 header with tot_len=0 causes an infinite loop: iplen=0 leads to capturelen=min(0, remaining)=0, so the data offset never advances and the while(data < tail) loop never terminates, spinning forever in softirq context. 
Reject inner IPv4 packets where tot_len < ihl*4 or ihl*4 < sizeof(struct iphdr), which catches both the tot_len=0 case and malformed ihl values. The normal IP stack performs this validation in ip_rcv_core(), but IPTFS extracts and processes inner packets before they reach that layer. Reported-by: Roshan Kumar Fixes: 6c82d2433671 ("xfrm: iptfs: add basic receive packet (tunnel egress) handling") Cc: stable@vger.kernel.org Signed-off-by: Roshan Kumar Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 3b6d7284fc70..0747d1cfa333 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -991,6 +991,11 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, iplen = be16_to_cpu(iph->tot_len); iphlen = iph->ihl << 2; + if (iplen < iphlen || iphlen < sizeof(*iph)) { + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINHDRERROR); + goto done; + } protocol = cpu_to_be16(ETH_P_IP); XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos; } else if (iph->version == 0x6) { -- cgit v1.2.3 From 0b352f83cabfefdaafa806d6471f0eca117dc7d5 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 4 Mar 2026 15:09:35 +0100 Subject: xfrm: iptfs: fix skb_put() panic on non-linear skb during reassembly In iptfs_reassem_cont(), IP-TFS attempts to append data to the new inner packet 'newskb' that is being reassembled. First, a zero-copy approach is tried; if it succeeds, newskb becomes non-linear. When a subsequent fragment in the same datagram does not meet the fast-path conditions, a memory copy is performed. It calls skb_put() to append the data and, as newskb is non-linear, it triggers the SKB_LINEAR_ASSERT check. Oops: invalid opcode: 0000 [#1] SMP NOPTI [...] RIP: 0010:skb_put+0x3c/0x40 [...] 
Call Trace: iptfs_reassem_cont+0x1ab/0x5e0 [xfrm_iptfs] iptfs_input_ordered+0x2af/0x380 [xfrm_iptfs] iptfs_input+0x122/0x3e0 [xfrm_iptfs] xfrm_input+0x91e/0x1a50 xfrm4_esp_rcv+0x3a/0x110 ip_protocol_deliver_rcu+0x1d7/0x1f0 ip_local_deliver_finish+0xbe/0x1e0 __netif_receive_skb_core.constprop.0+0xb56/0x1120 __netif_receive_skb_list_core+0x133/0x2b0 netif_receive_skb_list_internal+0x1ff/0x3f0 napi_complete_done+0x81/0x220 virtnet_poll+0x9d6/0x116e [virtio_net] __napi_poll.constprop.0+0x2b/0x270 net_rx_action+0x162/0x360 handle_softirqs+0xdc/0x510 __irq_exit_rcu+0xe7/0x110 irq_exit_rcu+0xe/0x20 common_interrupt+0x85/0xa0 Fix this by checking if the skb is non-linear. If it is, linearize it by calling skb_linearize(). As the initial allocation of newskb originally reserved enough tailroom for the entire reassembled packet we do not need to check if we have enough tailroom or extend it. Fixes: 5f2b6a909574 ("xfrm: iptfs: add skb-fragment sharing code") Reported-by: Hao Long Closes: https://lore.kernel.org/netdev/DGRCO9SL0T5U.JTINSHJQ9KPK@imlonghao.com/ Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 0747d1cfa333..2c87290fe06c 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -901,6 +901,12 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) { iptfs_skb_add_frags(newskb, fragwalk, data, copylen); } else { + if (skb_linearize(newskb)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + /* copy fragment data into newskb */ if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), copylen)) { -- cgit v1.2.3 From 9f455aac17db0aa1486c94dd2c231353ebc9d8bc Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:34 +0100 Subject: xfrm: state: fix sparse 
warnings on xfrm_state_hold_rcu In all callers, x is not an __rcu pointer. We can drop the annotation to avoid sparse warnings: net/xfrm/xfrm_state.c:58:39: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_state.c:58:39: expected struct refcount_struct [usertype] *r net/xfrm/xfrm_state.c:58:39: got struct refcount_struct [noderef] __rcu * net/xfrm/xfrm_state.c:1166:42: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_state.c:1166:42: expected struct xfrm_state [noderef] __rcu *x net/xfrm/xfrm_state.c:1166:42: got struct xfrm_state *[assigned] x (repeated for each caller) Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a00c4fe1ab0c..ad32085267a5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -53,7 +53,7 @@ static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); static HLIST_HEAD(xfrm_state_dev_gc_list); -static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) +static inline bool xfrm_state_hold_rcu(struct xfrm_state *x) { return refcount_inc_not_zero(&x->refcnt); } -- cgit v1.2.3 From e2f845f672782b2522062cf1c9aad774276250d7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:35 +0100 Subject: xfrm: state: fix sparse warnings in xfrm_state_init Use rcu_assign_pointer, and tmp variables for freeing on the error path without accessing net->xfrm.state_by*. 
Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ad32085267a5..b81303cccc5e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3259,6 +3259,7 @@ EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) { + struct hlist_head *ndst, *nsrc, *nspi, *nseq; unsigned int sz; if (net_eq(net, &init_net)) @@ -3269,18 +3270,25 @@ int __net_init xfrm_state_init(struct net *net) sz = sizeof(struct hlist_head) * 8; - net->xfrm.state_bydst = xfrm_hash_alloc(sz); - if (!net->xfrm.state_bydst) + ndst = xfrm_hash_alloc(sz); + if (!ndst) goto out_bydst; - net->xfrm.state_bysrc = xfrm_hash_alloc(sz); - if (!net->xfrm.state_bysrc) + rcu_assign_pointer(net->xfrm.state_bydst, ndst); + + nsrc = xfrm_hash_alloc(sz); + if (!nsrc) goto out_bysrc; - net->xfrm.state_byspi = xfrm_hash_alloc(sz); - if (!net->xfrm.state_byspi) + rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); + + nspi = xfrm_hash_alloc(sz); + if (!nspi) goto out_byspi; - net->xfrm.state_byseq = xfrm_hash_alloc(sz); - if (!net->xfrm.state_byseq) + rcu_assign_pointer(net->xfrm.state_byspi, nspi); + + nseq = xfrm_hash_alloc(sz); + if (!nseq) goto out_byseq; + rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); if (!net->xfrm.state_cache_input) @@ -3296,13 +3304,13 @@ int __net_init xfrm_state_init(struct net *net) return 0; out_state_cache_input: - xfrm_hash_free(net->xfrm.state_byseq, sz); + xfrm_hash_free(nseq, sz); out_byseq: - xfrm_hash_free(net->xfrm.state_byspi, sz); + xfrm_hash_free(nspi, sz); out_byspi: - xfrm_hash_free(net->xfrm.state_bysrc, sz); + xfrm_hash_free(nsrc, sz); out_bysrc: - xfrm_hash_free(net->xfrm.state_bydst, sz); + xfrm_hash_free(ndst, sz); out_bydst: return -ENOMEM; } -- cgit 
v1.2.3 From 55b5bc03148b26ce8156bc47b637a7337aa7d257 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:36 +0100 Subject: xfrm: state: fix sparse warnings around XFRM_STATE_INSERT We're under xfrm_state_lock in all those cases, use xfrm_state_deref_prot(state_by*) to avoid sparse warnings: net/xfrm/xfrm_state.c:2597:25: warning: cast removes address space '__rcu' of expression net/xfrm/xfrm_state.c:2597:25: warning: incorrect type in argument 2 (different address spaces) net/xfrm/xfrm_state.c:2597:25: expected struct hlist_head *h net/xfrm/xfrm_state.c:2597:25: got struct hlist_head [noderef] __rcu * Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b81303cccc5e..34cf9f361683 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1563,23 +1563,23 @@ found: list_add(&x->km.all, &net->xfrm.state_all); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); XFRM_STATE_INSERT(bydst, &x->bydst, - net->xfrm.state_bydst + h, + xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); XFRM_STATE_INSERT(bysrc, &x->bysrc, - net->xfrm.state_bysrc + h, + xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, - net->xfrm.state_byspi + h, + xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, - net->xfrm.state_byseq + h, + xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; @@ 
-1730,25 +1730,29 @@ static void __xfrm_state_insert(struct xfrm_state *x) h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + XFRM_STATE_INSERT(bydst, &x->bydst, + xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + XFRM_STATE_INSERT(bysrc, &x->bysrc, + xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + XFRM_STATE_INSERT(byspi, &x->byspi, + xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, + XFRM_STATE_INSERT(byseq, &x->byseq, + xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, x->xso.type); } @@ -1868,10 +1872,12 @@ static struct xfrm_state *__find_acq_core(struct net *net, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); - XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + XFRM_STATE_INSERT(bydst, &x->bydst, + xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); - XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + XFRM_STATE_INSERT(bysrc, &x->bysrc, + xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, x->xso.type); net->xfrm.state_num++; @@ -2603,7 +2609,9 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, if (!x0) { x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family); - XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); + XFRM_STATE_INSERT(byspi, 
&x->byspi, + xfrm_state_deref_prot(net->xfrm.state_byspi, net) + h, + x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; goto unlock; -- cgit v1.2.3 From 33cefb76a8edee8af257abfe6f42fb987c77132f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:37 +0100 Subject: xfrm: state: add xfrm_state_deref_prot to state_by* walk under lock We're under xfrm_state_lock for all those walks, we can use xfrm_state_deref_prot to silence sparse warnings such as: net/xfrm/xfrm_state.c:933:17: warning: dereference of noderef expression Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 34cf9f361683..27192b11be43 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -870,7 +870,7 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); @@ -891,7 +891,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool struct xfrm_state *x; struct xfrm_dev_offload *xso; - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { xso = &x->xso; if (xso->dev == dev && @@ -931,7 +931,7 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; restart: - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, 
xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); @@ -973,7 +973,7 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { restart: - hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) { xso = &x->xso; if (!xfrm_state_kern(x) && xso->dev == dev) { @@ -1652,7 +1652,7 @@ xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && @@ -1779,7 +1779,7 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && @@ -1815,7 +1815,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_state *x; u32 mark = m->v & m->m; - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.reqid != reqid || x->props.mode != mode || x->props.family != family || @@ -2097,7 +2097,7 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (m->reqid) { h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); - hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { + 
hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; @@ -2116,7 +2116,7 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n } else { h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr, m->old_family); - hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bysrc, net) + h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; @@ -2319,7 +2319,7 @@ void xfrm_state_update_stats(struct net *net) spin_lock_bh(&net->xfrm.xfrm_state_lock); for (i = 0; i <= net->xfrm.state_hmask; i++) { - hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst) + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_bydst, net) + i, bydst) xfrm_dev_state_update_stats(x); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); @@ -2510,7 +2510,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_byseq, net) + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && x->pcpu_num == pcpu_num && -- cgit v1.2.3 From f468fdd52b97a63c4fb916fb882b936d8b43b8ae Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:38 +0100 Subject: xfrm: remove rcu/state_hold from xfrm_state_lookup_spi_proto xfrm_state_lookup_spi_proto is called under xfrm_state_lock by xfrm_alloc_spi, no need to take a reference on the state and pretend to be under RCU. 
Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 27192b11be43..f28cbe249c05 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1703,18 +1703,12 @@ static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 sp struct xfrm_state *x; unsigned int i; - rcu_read_lock(); for (i = 0; i <= net->xfrm.state_hmask; i++) { - hlist_for_each_entry_rcu(x, &net->xfrm.state_byspi[i], byspi) { - if (x->id.spi == spi && x->id.proto == proto) { - if (!xfrm_state_hold_rcu(x)) - continue; - rcu_read_unlock(); + hlist_for_each_entry(x, xfrm_state_deref_prot(net->xfrm.state_byspi, net) + i, byspi) { + if (x->id.spi == spi && x->id.proto == proto) return x; - } } } - rcu_read_unlock(); return NULL; } @@ -2616,7 +2610,6 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, err = 0; goto unlock; } - xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); next: -- cgit v1.2.3 From 05b8673963c492fe36533e99a4a3c6661ca09ed0 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:39 +0100 Subject: xfrm: state: silence sparse warnings during netns exit Silence sparse warnings in xfrm_state_fini: net/xfrm/xfrm_state.c:3327:9: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_state.c:3327:9: expected struct hlist_head const *h net/xfrm/xfrm_state.c:3327:9: got struct hlist_head [noderef] __rcu *state_byseq Add xfrm_state_deref_netexit() to wrap those calls. The netns is going away, we don't have to worry about the state_by* pointers being changed behind our backs. 
Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f28cbe249c05..1748d374abca 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3316,6 +3316,8 @@ out_bydst: return -ENOMEM; } +#define xfrm_state_deref_netexit(table) \ + rcu_dereference_protected((table), true /* netns is going away */) void xfrm_state_fini(struct net *net) { unsigned int sz; @@ -3328,17 +3330,17 @@ void xfrm_state_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.state_all)); for (i = 0; i <= net->xfrm.state_hmask; i++) { - WARN_ON(!hlist_empty(net->xfrm.state_byseq + i)); - WARN_ON(!hlist_empty(net->xfrm.state_byspi + i)); - WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i)); - WARN_ON(!hlist_empty(net->xfrm.state_bydst + i)); + WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_byseq) + i)); + WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_byspi) + i)); + WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_bysrc) + i)); + WARN_ON(!hlist_empty(xfrm_state_deref_netexit(net->xfrm.state_bydst) + i)); } sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); - xfrm_hash_free(net->xfrm.state_byseq, sz); - xfrm_hash_free(net->xfrm.state_byspi, sz); - xfrm_hash_free(net->xfrm.state_bysrc, sz); - xfrm_hash_free(net->xfrm.state_bydst, sz); + xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_byseq), sz); + xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_byspi), sz); + xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_bysrc), sz); + xfrm_hash_free(xfrm_state_deref_netexit(net->xfrm.state_bydst), sz); free_percpu(net->xfrm.state_cache_input); } -- cgit v1.2.3 From b1f9c67781efd8a0ebd5019f14fbbac981cff7c1 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:40 +0100 Subject: xfrm: policy: fix 
sparse warnings in xfrm_policy_{init,fini} In xfrm_policy_init: add rcu_assign_pointer to fix warning: net/xfrm/xfrm_policy.c:4238:29: warning: incorrect type in assignment (different address spaces) net/xfrm/xfrm_policy.c:4238:29: expected struct hlist_head [noderef] __rcu *table net/xfrm/xfrm_policy.c:4238:29: got struct hlist_head * add rcu_dereference_protected to silence warning: net/xfrm/xfrm_policy.c:4265:36: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_policy.c:4265:36: expected struct hlist_head *n net/xfrm/xfrm_policy.c:4265:36: got struct hlist_head [noderef] __rcu *table The netns is being created, no concurrent access is possible yet. In xfrm_policy_fini, net is going away, there shouldn't be any concurrent changes to the hashtables, so we can use rcu_dereference_protected to silence warnings: net/xfrm/xfrm_policy.c:4291:17: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_policy.c:4291:17: expected struct hlist_head const *h net/xfrm/xfrm_policy.c:4291:17: got struct hlist_head [noderef] __rcu *table net/xfrm/xfrm_policy.c:4292:36: warning: incorrect type in argument 1 (different address spaces) net/xfrm/xfrm_policy.c:4292:36: expected struct hlist_head *n net/xfrm/xfrm_policy.c:4292:36: got struct hlist_head [noderef] __rcu *table Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5428185196a1..49de5a6f4b85 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4242,7 +4242,7 @@ static int __net_init xfrm_policy_init(struct net *net) net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; htab = &net->xfrm.policy_bydst[dir]; - htab->table = xfrm_hash_alloc(sz); + rcu_assign_pointer(htab->table, xfrm_hash_alloc(sz)); if (!htab->table) goto out_bydst; htab->hmask = hmask; 
@@ -4269,7 +4269,7 @@ out_bydst: struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; - xfrm_hash_free(htab->table, sz); + xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: @@ -4295,8 +4295,8 @@ static void xfrm_policy_fini(struct net *net) htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); - WARN_ON(!hlist_empty(htab->table)); - xfrm_hash_free(htab->table, sz); + WARN_ON(!hlist_empty(rcu_dereference_protected(htab->table, true))); + xfrm_hash_free(rcu_dereference_protected(htab->table, true), sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); -- cgit v1.2.3 From 2da6901866e7137f4e1a51a5f0bd1fbd0848a4eb Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:41 +0100 Subject: xfrm: policy: silence sparse warning in xfrm_policy_unregister_afinfo xfrm_policy_afinfo is __rcu, use rcu_access_pointer to silence: net/xfrm/xfrm_policy.c:4152:43: error: incompatible types in comparison expression (different address spaces): net/xfrm/xfrm_policy.c:4152:43: struct xfrm_policy_afinfo const [noderef] __rcu * net/xfrm/xfrm_policy.c:4152:43: struct xfrm_policy_afinfo const * Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 49de5a6f4b85..66ac93e65264 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4156,7 +4156,7 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { - if (xfrm_policy_afinfo[i] != afinfo) + if (rcu_access_pointer(xfrm_policy_afinfo[i]) != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; -- cgit v1.2.3 From 103b4f5b4007cb484f40b1c8095a7e0526e5aff6 Mon Sep 17 00:00:00 
2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:42 +0100 Subject: xfrm: add rcu_access_pointer to silence sparse warning for xfrm_input_afinfo xfrm_input_afinfo is __rcu, we should use rcu_access_pointer to avoid a sparse warning: net/xfrm/xfrm_input.c:78:21: error: incompatible types in comparison expression (different address spaces): net/xfrm/xfrm_input.c:78:21: struct xfrm_input_afinfo const [noderef] __rcu * net/xfrm/xfrm_input.c:78:21: struct xfrm_input_afinfo const * Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 4ed346e682c7..dc1312ed5a09 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -75,7 +75,10 @@ int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) spin_lock_bh(&xfrm_input_afinfo_lock); if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { - if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo)) + const struct xfrm_input_afinfo *cur; + + cur = rcu_access_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family]); + if (unlikely(cur != afinfo)) err = -EINVAL; else RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); -- cgit v1.2.3 From d87f8bc47fbf012a7f115e311d0603d97e47c34c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 9 Mar 2026 11:32:43 +0100 Subject: xfrm: avoid RCU warnings around the per-netns netlink socket net->xfrm.nlsk is used in 2 types of contexts: - fully under RCU, with rcu_read_lock + rcu_dereference and a NULL check - in the netlink handlers, with requests coming from a userspace socket In the 2nd case, net->xfrm.nlsk is guaranteed to stay non-NULL and the object is alive, since we can't enter the netns destruction path while the user socket holds a reference on the netns. 
After adding the __rcu annotation to netns_xfrm.nlsk (which silences sparse warnings in the RCU users and __net_init code), we need to tell sparse that the 2nd case is safe. Add a helper for that. Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 2 +- net/xfrm/xfrm_user.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 23dd647fe024..b73983a17e08 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -59,7 +59,7 @@ struct netns_xfrm { struct list_head inexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4dd8341225bc..1656b487f833 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -35,6 +35,15 @@ #endif #include +static struct sock *xfrm_net_nlsk(const struct net *net, const struct sk_buff *skb) +{ + /* get the source of this request, see netlink_unicast_kernel */ + const struct sock *sk = NETLINK_CB(skb).sk; + + /* sk is refcounted, the netns stays alive and nlsk with it */ + return rcu_dereference_protected(net->xfrm.nlsk, sk->sk_net_refcnt); +} + static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type, struct netlink_ext_ack *extack) { @@ -1727,7 +1736,7 @@ static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_spdinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return nlmsg_unicast(xfrm_net_nlsk(net, skb), r_skb, sportid); } static inline unsigned int xfrm_sadinfo_msgsize(void) @@ -1787,7 +1796,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_sadinfo(r_skb, net, sportid, seq, *flags); BUG_ON(err < 0); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid); + return 
nlmsg_unicast(xfrm_net_nlsk(net, skb), r_skb, sportid); } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1807,7 +1816,7 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(xfrm_net_nlsk(net, skb), resp_skb, NETLINK_CB(skb).portid); } xfrm_state_put(x); out_noput: @@ -1898,7 +1907,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, } } - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(xfrm_net_nlsk(net, skb), resp_skb, NETLINK_CB(skb).portid); out: xfrm_state_put(x); @@ -2543,7 +2552,7 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, r_up->out = net->xfrm.policy_default[XFRM_POLICY_OUT]; nlmsg_end(r_skb, r_nlh); - return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); + return nlmsg_unicast(xfrm_net_nlsk(net, skb), r_skb, portid); } static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2609,7 +2618,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); } else { - err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, + err = nlmsg_unicast(xfrm_net_nlsk(net, skb), resp_skb, NETLINK_CB(skb).portid); } } else { @@ -2782,7 +2791,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, err = build_aevent(r_skb, x, &c); BUG_ON(err < 0); - err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid); + err = nlmsg_unicast(xfrm_net_nlsk(net, skb), r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); xfrm_state_put(x); return err; @@ -3486,7 +3495,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, goto err; } - err = netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c); + err = netlink_dump_start(xfrm_net_nlsk(net, skb), skb, nlh, &c); goto err; } -- cgit v1.2.3 From 
daf8e3b253aa760ff9e96c7768a464bc1d6b3c90 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Wed, 11 Mar 2026 03:16:29 +0900 Subject: xfrm: Fix work re-schedule after cancel in xfrm_nat_keepalive_net_fini() After cancel_delayed_work_sync() is called from xfrm_nat_keepalive_net_fini(), xfrm_state_fini() flushes remaining states via __xfrm_state_delete(), which calls xfrm_nat_keepalive_state_updated() to re-schedule nat_keepalive_work. The following is a simple race scenario: cpu0 cpu1 cleanup_net() [Round 1] ops_undo_list() xfrm_net_exit() xfrm_nat_keepalive_net_fini() cancel_delayed_work_sync(nat_keepalive_work); xfrm_state_fini() xfrm_state_flush() xfrm_state_delete(x) __xfrm_state_delete(x) xfrm_nat_keepalive_state_updated(x) schedule_delayed_work(nat_keepalive_work); rcu_barrier(); net_complete_free(); net_passive_dec(net); llist_add(&net->defer_free_list, &defer_free_list); cleanup_net() [Round 2] rcu_barrier(); net_complete_free() kmem_cache_free(net_cachep, net); nat_keepalive_work() // on freed net To prevent this, cancel_delayed_work_sync() is replaced with disable_delayed_work_sync(). 
Fixes: f531d13bdfe3 ("xfrm: support sending NAT keepalives in ESP in UDP states") Signed-off-by: Hyunwoo Kim Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_nat_keepalive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c index ebf95d48e86c..1856beee0149 100644 --- a/net/xfrm/xfrm_nat_keepalive.c +++ b/net/xfrm/xfrm_nat_keepalive.c @@ -261,7 +261,7 @@ int __net_init xfrm_nat_keepalive_net_init(struct net *net) int xfrm_nat_keepalive_net_fini(struct net *net) { - cancel_delayed_work_sync(&net->xfrm.nat_keepalive_work); + disable_delayed_work_sync(&net->xfrm.nat_keepalive_work); return 0; } -- cgit v1.2.3 From 29fe3a61bcdce398ee3955101c39f89c01a8a77e Mon Sep 17 00:00:00 2001 From: Minwoo Ra Date: Sat, 14 Mar 2026 00:58:44 +0900 Subject: xfrm: prevent policy_hthresh.work from racing with netns teardown An XFRM_MSG_NEWSPDINFO request can queue the per-net work item policy_hthresh.work onto the system workqueue. The queued callback, xfrm_hash_rebuild(), retrieves the enclosing struct net via container_of(). If the net namespace is torn down before that work runs, the associated struct net may already have been freed, and xfrm_hash_rebuild() may then dereference stale memory. xfrm_policy_fini() already flushes policy_hash_work during teardown, but it does not synchronize policy_hthresh.work. Synchronize policy_hthresh.work in xfrm_policy_fini() as well, so the queued work cannot outlive the net namespace teardown and access a freed struct net. 
Fixes: 880a6fab8f6b ("xfrm: configure policy hash table thresholds by netlink") Signed-off-by: Minwoo Ra Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 66ac93e65264..2140ee7b102d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4282,6 +4282,8 @@ static void xfrm_policy_fini(struct net *net) unsigned int sz; int dir; + disable_work_sync(&net->xfrm.policy_hthresh.work); + flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); -- cgit v1.2.3 From eb2d16a7d599dc9d4df391b5e660df9949963786 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 14 Mar 2026 17:02:10 +0000 Subject: af_key: validate families in pfkey_send_migrate() syzbot was able to trigger a crash in skb_put() [1] Issue is that pfkey_send_migrate() does not check old/new families, and that set_ipsecrequest() @family argument was truncated, thus possibly overfilling the skb. Validate families early, do not wait set_ipsecrequest(). [1] skbuff: skb_over_panic: text:ffffffff8a752120 len:392 put:16 head:ffff88802a4ad040 data:ffff88802a4ad040 tail:0x188 end:0x180 dev: kernel BUG at net/core/skbuff.c:214 ! 
Call Trace: skb_over_panic net/core/skbuff.c:219 [inline] skb_put+0x159/0x210 net/core/skbuff.c:2655 skb_put_zero include/linux/skbuff.h:2788 [inline] set_ipsecrequest net/key/af_key.c:3532 [inline] pfkey_send_migrate+0x1270/0x2e50 net/key/af_key.c:3636 km_migrate+0x155/0x260 net/xfrm/xfrm_state.c:2848 xfrm_migrate+0x2140/0x2450 net/xfrm/xfrm_policy.c:4705 xfrm_do_migrate+0x8ff/0xaa0 net/xfrm/xfrm_user.c:3150 Fixes: 08de61beab8a ("[PFKEYV2]: Extension for dynamic update of endpoint address(es)") Reported-by: syzbot+b518dfc8e021988fbd55@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69b5933c.050a0220.248e02.00f2.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 571200433aa9..bc91aeeb74bb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3518,7 +3518,7 @@ static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress * static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, - uint32_t reqid, uint8_t family, + uint32_t reqid, sa_family_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; @@ -3583,12 +3583,17 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { - /* old locator pair */ - size_pol += sizeof(struct sadb_x_ipsecrequest) + - pfkey_sockaddr_pair_size(mp->old_family); - /* new locator pair */ - size_pol += sizeof(struct sadb_x_ipsecrequest) + - pfkey_sockaddr_pair_size(mp->new_family); + int pair_size; + + pair_size = pfkey_sockaddr_pair_size(mp->old_family); + if (!pair_size) + return -EINVAL; + size_pol += sizeof(struct sadb_x_ipsecrequest) + pair_size; + + pair_size = pfkey_sockaddr_pair_size(mp->new_family); 
+ if (!pair_size) + return -EINVAL; + size_pol += sizeof(struct sadb_x_ipsecrequest) + pair_size; } size += sizeof(struct sadb_msg) + size_pol; -- cgit v1.2.3 From d849a2f7309fc0616e79d13b008b0a47e0458b6e Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Mon, 16 Mar 2026 14:56:51 +0000 Subject: xfrm: iptfs: only publish mode_data after clone setup iptfs_clone_state() stores x->mode_data before allocating the reorder window. If that allocation fails, the code frees the cloned state and returns -ENOMEM, leaving x->mode_data pointing at freed memory. The xfrm clone unwind later runs destroy_state() through x->mode_data, so the failed clone path tears down IPTFS state that clone_state() already freed. Keep the cloned IPTFS state private until all allocations succeed so failed clones leave x->mode_data unset. The destroy path already handles a NULL mode_data pointer. Fixes: 6be02e3e4f37 ("xfrm: iptfs: handle reordering of received packets") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 2c87290fe06c..7cd97c1dcd11 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2664,9 +2664,6 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) if (!xtfs) return -ENOMEM; - x->mode_data = xtfs; - xtfs->x = x; - xtfs->ra_newskb = NULL; if (xtfs->cfg.reorder_win_size) { xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, @@ -2677,6 +2674,9 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) } } + x->mode_data = xtfs; + xtfs->x = x; + return 0; } -- cgit v1.2.3 From 46eee1661aa9b49966e6c43d07126fe408edda57 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 18 Mar 2026 18:34:13 +0100 Subject: can: statistics: add missing atomic access in hot path Commit 80b5f90158d1 ("can: statistics: use atomic access in hot 
path") fixed a KCSAN issue in can_receive() but did not convert the 'matches' variable used in can_rcv_filter(). Fixes: 80b5f90158d1 ("can: statistics: use atomic access in hot path") Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260318173413.28235-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 4 ++-- net/can/af_can.h | 2 +- net/can/proc.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index f70e2ba0aadc..7bc86b176b4d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -469,7 +469,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, rcv->can_id = can_id; rcv->mask = mask; - rcv->matches = 0; + atomic_long_set(&rcv->matches, 0); rcv->func = func; rcv->data = data; rcv->ident = ident; @@ -573,7 +573,7 @@ EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); - rcv->matches++; + atomic_long_inc(&rcv->matches); } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) diff --git a/net/can/af_can.h b/net/can/af_can.h index 22f3352c77fe..87887014f562 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -52,7 +52,7 @@ struct receiver { struct hlist_node list; canid_t can_id; canid_t mask; - unsigned long matches; + atomic_long_t matches; void (*func)(struct sk_buff *skb, void *data); void *data; char *ident; diff --git a/net/can/proc.c b/net/can/proc.c index 0938bf7dd646..de4d05ae3459 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -196,7 +196,8 @@ static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list, " %-5s %03x %08x %pK %pK %8ld %s\n"; seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask, - r->func, r->data, r->matches, r->ident); + r->func, r->data, atomic_long_read(&r->matches), + r->ident); } } -- cgit v1.2.3 From b9c310d72783cc2f30d103eed83920a5a29c671a Mon Sep 17 
00:00:00 2001 From: Ali Norouzi Date: Thu, 19 Mar 2026 16:47:44 +0100 Subject: can: gw: fix OOB heap access in cgw_csum_crc8_rel() cgw_csum_crc8_rel() correctly computes bounds-safe indices via calc_idx(): int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); if (from < 0 || to < 0 || res < 0) return; However, the loop and the result write then use the raw s8 fields directly instead of the computed variables: for (i = crc8->from_idx; ...) /* BUG: raw negative index */ cf->data[crc8->result_idx] = ...; /* BUG: raw negative index */ With from_idx = to_idx = result_idx = -64 on a 64-byte CAN FD frame, calc_idx(-64, 64) = 0 so the guard passes, but the loop iterates with i = -64, reading cf->data[-64], and the write goes to cf->data[-64]. This write might land up to 56 (7.0-rc) or 40 (<= 6.19) bytes before the start of the canfd_frame on the heap. The companion function cgw_csum_xor_rel() uses `from`/`to`/`res` correctly throughout; fix cgw_csum_crc8_rel() to match. Confirmed with KASAN on linux-7.0-rc2: BUG: KASAN: slab-out-of-bounds in cgw_csum_crc8_rel+0x515/0x5b0 Read of size 1 at addr ffff8880076619c8 by task poc_cgw_oob/62 CAP_NET_ADMIN is needed to configure the can-gw crc8 checksums. 
Fixes: 456a8a646b25 ("can: gw: add support for CAN FD frames") Cc: stable@vger.kernel.org Reported-by: Ali Norouzi Reviewed-by: Oliver Hartkopp Acked-by: Oliver Hartkopp Signed-off-by: Ali Norouzi Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260319-fix-can-gw-and-can-isotp-v2-1-c45d52c6d2d8@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/gw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index 8ee4d67a07d3..0ec99f68aa45 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -375,10 +375,10 @@ static void cgw_csum_crc8_rel(struct canfd_frame *cf, return; if (from <= to) { - for (i = crc8->from_idx; i <= crc8->to_idx; i++) + for (i = from; i <= to; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { - for (i = crc8->from_idx; i >= crc8->to_idx; i--) + for (i = from; i >= to; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } @@ -397,7 +397,7 @@ static void cgw_csum_crc8_rel(struct canfd_frame *cf, break; } - cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; + cf->data[res] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, -- cgit v1.2.3 From 424e95d62110cdbc8fd12b40918f37e408e35a92 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 19 Mar 2026 16:47:45 +0100 Subject: can: isotp: fix tx.buf use-after-free in isotp_sendmsg() isotp_sendmsg() uses only cmpxchg() on so->tx.state to serialize access to so->tx.buf. isotp_release() waits for ISOTP_IDLE via wait_event_interruptible() and then calls kfree(so->tx.buf). If a signal interrupts the wait_event_interruptible() inside close() while tx.state is ISOTP_SENDING, the loop exits early and release proceeds to force ISOTP_SHUTDOWN and continues to kfree(so->tx.buf) while sendmsg may still be reading so->tx.buf for the final CAN frame in isotp_fill_dataframe(). The so->tx.buf can be allocated once when the standard tx.buf length needs to be extended. 
Move the kfree() of this potentially extended tx.buf to sk_destruct time when both isotp_sendmsg() and isotp_release() are done. Fixes: 96d1c81e6a04 ("can: isotp: add module parameter for maximum pdu size") Cc: stable@vger.kernel.org Reported-by: Ali Norouzi Co-developed-by: Ali Norouzi Signed-off-by: Ali Norouzi Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260319-fix-can-gw-and-can-isotp-v2-2-c45d52c6d2d8@pengutronix.de Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index da3b72e7afcc..2770f43f4951 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1248,12 +1248,6 @@ static int isotp_release(struct socket *sock) so->ifindex = 0; so->bound = 0; - if (so->rx.buf != so->rx.sbuf) - kfree(so->rx.buf); - - if (so->tx.buf != so->tx.sbuf) - kfree(so->tx.buf); - sock_orphan(sk); sock->sk = NULL; @@ -1622,6 +1616,21 @@ static int isotp_notifier(struct notifier_block *nb, unsigned long msg, return NOTIFY_DONE; } +static void isotp_sock_destruct(struct sock *sk) +{ + struct isotp_sock *so = isotp_sk(sk); + + /* do the standard CAN sock destruct work */ + can_sock_destruct(sk); + + /* free potential extended PDU buffers */ + if (so->rx.buf != so->rx.sbuf) + kfree(so->rx.buf); + + if (so->tx.buf != so->tx.sbuf) + kfree(so->tx.buf); +} + static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); @@ -1666,6 +1675,9 @@ static int isotp_init(struct sock *sk) list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); + /* re-assign default can_sock_destruct() reference */ + sk->sk_destruct = isotp_sock_destruct; + return 0; } -- cgit v1.2.3 From 9d87cb22195b2c67405f5485d525190747ad5493 Mon Sep 17 00:00:00 2001 From: Minseo Park Date: Sun, 15 Mar 2026 22:14:37 +0900 Subject: Bluetooth: L2CAP: Fix stack-out-of-bounds read in l2cap_ecred_conn_req Syzbot reported a 
KASAN stack-out-of-bounds read in l2cap_build_cmd() that is triggered by a malformed Enhanced Credit Based Connection Request. The vulnerability stems from l2cap_ecred_conn_req(). The function allocates a local stack buffer (`pdu`) designed to hold a maximum of 5 Source Channel IDs (SCIDs), totaling 18 bytes. When an attacker sends a request with more than 5 SCIDs, the function calculates `rsp_len` based on this unvalidated `cmd_len` before checking if the number of SCIDs exceeds L2CAP_ECRED_MAX_CID. If the SCID count is too high, the function correctly jumps to the `response` label to reject the packet, but `rsp_len` retains the attacker's oversized value. Consequently, l2cap_send_cmd() is instructed to read past the end of the 18-byte `pdu` buffer, triggering a KASAN panic. Fix this by moving the assignment of `rsp_len` to after the `num_scid` boundary check. If the packet is rejected, `rsp_len` will safely remain 0, and the error response will only read the 8-byte base header from the stack. 
Fixes: c28d2bff7044 ("Bluetooth: L2CAP: Fix result of L2CAP_ECRED_CONN_RSP when MTU is too short") Reported-by: syzbot+b7f3e7d9a596bf6a63e3@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b7f3e7d9a596bf6a63e3 Tested-by: syzbot+b7f3e7d9a596bf6a63e3@syzkaller.appspotmail.com Signed-off-by: Minseo Park Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 5deb6c4f1e41..0882b5ac2ecc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5081,14 +5081,14 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); - /* Always respond with the same number of scids as in the request */ - rsp_len = cmd_len; - if (num_scid > L2CAP_ECRED_MAX_CID) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } + /* Always respond with the same number of scids as in the request */ + rsp_len = cmd_len; + mtu = __le16_to_cpu(req->mtu); mps = __le16_to_cpu(req->mps); -- cgit v1.2.3 From c65bd945d1c08c3db756821b6bf9f1c4a77b29c6 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 13 Mar 2026 05:22:39 +0900 Subject: Bluetooth: L2CAP: Validate PDU length before reading SDU length in l2cap_ecred_data_rcv() l2cap_ecred_data_rcv() reads the SDU length field from skb->data using get_unaligned_le16() without first verifying that skb contains at least L2CAP_SDULEN_SIZE (2) bytes. When skb->len is less than 2, this reads past the valid data in the skb. The ERTM reassembly path correctly calls pskb_may_pull() before reading the SDU length (l2cap_reassemble_sdu, L2CAP_SAR_START case). Apply the same validation to the Enhanced Credit Based Flow Control data path. 
Fixes: aac23bf63659 ("Bluetooth: Implement LE L2CAP reassembly") Signed-off-by: Hyunwoo Kim Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 0882b5ac2ecc..30fd6848938e 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6690,6 +6690,11 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) if (!chan->sdu) { u16 sdu_len; + if (!pskb_may_pull(skb, L2CAP_SDULEN_SIZE)) { + err = -EINVAL; + goto failed; + } + sdu_len = get_unaligned_le16(skb->data); skb_pull(skb, L2CAP_SDULEN_SIZE); -- cgit v1.2.3 From 598dbba9919c5e36c54fe1709b557d64120cb94b Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 13 Mar 2026 05:26:16 +0900 Subject: Bluetooth: SCO: Fix use-after-free in sco_recv_frame() due to missing sock_hold sco_recv_frame() reads conn->sk under sco_conn_lock() but immediately releases the lock without holding a reference to the socket. A concurrent close() can free the socket between the lock release and the subsequent sk->sk_state access, resulting in a use-after-free. Other functions in the same file (sco_sock_timeout(), sco_conn_del()) correctly use sco_sock_hold() to safely hold a reference under the lock. Fix by using sco_sock_hold() to take a reference before releasing the lock, and adding sock_put() on all exit paths. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Hyunwoo Kim Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index e7db50165879..584e059de20a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -401,7 +401,7 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) struct sock *sk; sco_conn_lock(conn); - sk = conn->sk; + sk = sco_sock_hold(conn); sco_conn_unlock(conn); if (!sk) @@ -410,11 +410,15 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) BT_DBG("sk %p len %u", sk, skb->len); if (sk->sk_state != BT_CONNECTED) - goto drop; + goto drop_put; - if (!sock_queue_rcv_skb(sk, skb)) + if (!sock_queue_rcv_skb(sk, skb)) { + sock_put(sk); return; + } +drop_put: + sock_put(sk); drop: kfree_skb(skb); } -- cgit v1.2.3 From 5f5fa4cd35f707344f65ce9e225b6528691dbbaa Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 16 Mar 2026 15:03:27 -0400 Subject: Bluetooth: MGMT: Fix dangling pointer on mgmt_add_adv_patterns_monitor_complete This fixes the condition check so that mgmt_pending_valid() is executed whenever status != -ECANCELED; otherwise, calling mgmt_pending_free(cmd) would kfree(cmd) without unlinking it from the list first, leaving a dangling pointer. Any subsequent list traversal (e.g., mgmt_pending_foreach during __mgmt_power_off, or another mgmt_pending_valid call) would dereference freed memory. 
Link: https://lore.kernel.org/linux-bluetooth/20260315132013.75ab40c5@kernel.org/T/#m1418f9c82eeff8510c1beaa21cf53af20db96c06 Fixes: 302a1f674c00 ("Bluetooth: MGMT: Fix possible UAFs") Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d52238ce6a9a..e5f9287fb826 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5355,7 +5355,7 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, * hci_adv_monitors_clear is about to be called which will take care of * freeing the adv_monitor instances. */ - if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd)) + if (status == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; monitor = cmd->user_data; -- cgit v1.2.3 From b6807cfc195ef99e1ac37b2e1e60df40295daa8c Mon Sep 17 00:00:00 2001 From: Cen Zhang Date: Sun, 15 Mar 2026 20:07:26 +0800 Subject: Bluetooth: hci_sync: annotate data-races around hdev->req_status __hci_cmd_sync_sk() sets hdev->req_status under hdev->req_lock: hdev->req_status = HCI_REQ_PEND; However, several other functions read or write hdev->req_status without holding any lock: - hci_send_cmd_sync() reads req_status in hci_cmd_work (workqueue) - hci_cmd_sync_complete() reads/writes from HCI event completion - hci_cmd_sync_cancel() / hci_cmd_sync_cancel_sync() read/write - hci_abort_conn() reads in connection abort path Since __hci_cmd_sync_sk() runs on hdev->req_workqueue while hci_send_cmd_sync() runs on hdev->workqueue, these are different workqueues that can execute concurrently on different CPUs. The plain C accesses constitute a data race. Add READ_ONCE()/WRITE_ONCE() annotations on all concurrent accesses to hdev->req_status to prevent potential compiler optimizations that could affect correctness (e.g., load fusing in the wait_event condition or store reordering). 
Signed-off-by: Cen Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 2 +- net/bluetooth/hci_sync.c | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6eb59e9f2aa8..e6393f17576b 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -3095,7 +3095,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) * hci_connect_le serializes the connection attempts so only one * connection can be in BT_CONNECT at time. */ - if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { + if (conn->state == BT_CONNECT && READ_ONCE(hdev->req_status) == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 31308c1de4ec..01f8ceeb1c0c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4126,7 +4126,7 @@ static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb); } - if (hdev->req_status == HCI_REQ_PEND && + if (READ_ONCE(hdev->req_status) == HCI_REQ_PEND && !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 3166914b0d6c..45d16639874a 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -25,11 +25,11 @@ static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, { bt_dev_dbg(hdev, "result 0x%2.2x", result); - if (hdev->req_status != HCI_REQ_PEND) + if (READ_ONCE(hdev->req_status) != HCI_REQ_PEND) return; hdev->req_result = result; - hdev->req_status = HCI_REQ_DONE; + WRITE_ONCE(hdev->req_status, HCI_REQ_DONE); /* Free the request command so it is not used as response */ kfree_skb(hdev->req_skb); @@ -167,20 +167,20 @@ struct 
sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, hci_cmd_sync_add(&req, opcode, plen, param, event, sk); - hdev->req_status = HCI_REQ_PEND; + WRITE_ONCE(hdev->req_status, HCI_REQ_PEND); err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); err = wait_event_interruptible_timeout(hdev->req_wait_q, - hdev->req_status != HCI_REQ_PEND, + READ_ONCE(hdev->req_status) != HCI_REQ_PEND, timeout); if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); - switch (hdev->req_status) { + switch (READ_ONCE(hdev->req_status)) { case HCI_REQ_DONE: err = -bt_to_errno(hdev->req_result); break; @@ -194,7 +194,7 @@ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, break; } - hdev->req_status = 0; + WRITE_ONCE(hdev->req_status, 0); hdev->req_result = 0; skb = hdev->req_rsp; hdev->req_rsp = NULL; @@ -665,9 +665,9 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); - if (hdev->req_status == HCI_REQ_PEND) { + if (READ_ONCE(hdev->req_status) == HCI_REQ_PEND) { hdev->req_result = err; - hdev->req_status = HCI_REQ_CANCELED; + WRITE_ONCE(hdev->req_status, HCI_REQ_CANCELED); queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } @@ -683,12 +683,12 @@ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); - if (hdev->req_status == HCI_REQ_PEND) { + if (READ_ONCE(hdev->req_status) == HCI_REQ_PEND) { /* req_result is __u32 so error must be positive to be properly * propagated. */ hdev->req_result = err < 0 ? -err : err; - hdev->req_status = HCI_REQ_CANCELED; + WRITE_ONCE(hdev->req_status, HCI_REQ_CANCELED); wake_up_interruptible(&hdev->req_wait_q); } -- cgit v1.2.3 From b6552e0503973daf6f23bd6ed9273ef131ee364f Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Thu, 19 Mar 2026 08:58:01 -0300 Subject: Bluetooth: L2CAP: Fix null-ptr-deref on l2cap_sock_ready_cb Before using sk pointer, check if it is null. 
Fix the following: KASAN: null-ptr-deref in range [0x0000000000000260-0x0000000000000267] CPU: 0 UID: 0 PID: 5985 Comm: kworker/0:5 Not tainted 7.0.0-rc4-00029-ga989fde763f4 #1 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-9.fc43 06/10/2025 Workqueue: events l2cap_info_timeout RIP: 0010:kasan_byte_accessible+0x12/0x30 Code: 79 ff ff ff 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 40 d6 48 c1 ef 03 48 b8 00 00 00 00 00 fc ff df <0f> b6 04 07 3c 08 0f 92 c0 c3 cc cce veth0_macvtap: entered promiscuous mode RSP: 0018:ffffc90006e0f808 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffffffff89746018 RCX: 0000000080000001 RDX: 0000000000000000 RSI: ffffffff89746018 RDI: 000000000000004c RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: dffffc0000000000 R11: ffffffff8aae3e70 R12: 0000000000000000 R13: 0000000000000260 R14: 0000000000000260 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880983c2000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005582615a5008 CR3: 000000007007e000 CR4: 0000000000752ef0 PKRU: 55555554 Call Trace: __kasan_check_byte+0x12/0x40 lock_acquire+0x79/0x2e0 lock_sock_nested+0x48/0x100 ? l2cap_sock_ready_cb+0x46/0x160 l2cap_sock_ready_cb+0x46/0x160 l2cap_conn_start+0x779/0xff0 ? __pfx_l2cap_conn_start+0x10/0x10 ? l2cap_info_timeout+0x60/0xa0 ? __pfx___mutex_lock+0x10/0x10 l2cap_info_timeout+0x68/0xa0 ? process_scheduled_works+0xa8d/0x18c0 process_scheduled_works+0xb6e/0x18c0 ? __pfx_process_scheduled_works+0x10/0x10 ? assign_work+0x3d5/0x5e0 worker_thread+0xa53/0xfc0 kthread+0x388/0x470 ? __pfx_worker_thread+0x10/0x10 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x51e/0xb90 ? __pfx_ret_from_fork+0x10/0x10 veth1_macvtap: entered promiscuous mode ? __switch_to+0xc7d/0x1450 ? 
__pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Modules linked in: ---[ end trace 0000000000000000 ]--- batman_adv: batadv0: Interface activated: batadv_slave_0 batman_adv: batadv0: Interface activated: batadv_slave_1 netdevsim netdevsim7 netdevsim0: set [1, 0] type 2 family 0 port 6081 - 0 netdevsim netdevsim7 netdevsim1: set [1, 0] type 2 family 0 port 6081 - 0 netdevsim netdevsim7 netdevsim2: set [1, 0] type 2 family 0 port 6081 - 0 netdevsim netdevsim7 netdevsim3: set [1, 0] type 2 family 0 port 6081 - 0 RIP: 0010:kasan_byte_accessible+0x12/0x30 Code: 79 ff ff ff 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 40 d6 48 c1 ef 03 48 b8 00 00 00 00 00 fc ff df <0f> b6 04 07 3c 08 0f 92 c0 c3 cc cce ieee80211 phy39: Selected rate control algorithm 'minstrel_ht' RSP: 0018:ffffc90006e0f808 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffffffff89746018 RCX: 0000000080000001 RDX: 0000000000000000 RSI: ffffffff89746018 RDI: 000000000000004c RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: dffffc0000000000 R11: ffffffff8aae3e70 R12: 0000000000000000 R13: 0000000000000260 R14: 0000000000000260 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880983c2000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7e16139e9c CR3: 000000000e74e000 CR4: 0000000000752ef0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Fixes: 54a59aa2b562 ("Bluetooth: Add l2cap_chan->ops->ready()") Signed-off-by: Helen Koike Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 597686790371..71e8c1b45bce 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1698,6 +1698,9 @@ static void l2cap_sock_ready_cb(struct l2cap_chan *chan) struct sock *sk = chan->data; struct sock *parent; + if (!sk) + return; + lock_sock(sk); parent = 
bt_sk(sk)->parent; -- cgit v1.2.3 From 761fb8ec8778f0caf2bba5a41e3cff1ea86974f3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 17 Mar 2026 11:54:01 -0400 Subject: Bluetooth: L2CAP: Fix regressions caused by reusing ident This attempts to fix regressions caused by reusing ident, which apparently is not handled well on certain stacks, causing the stack to not respond to requests. So instead of simply returning the first unallocated id, this stores the last used tx_ident and then attempts to use the next one until all available ids are exhausted, and then cycles, starting over at 1. Link: https://bugzilla.kernel.org/show_bug.cgi?id=221120 Link: https://bugzilla.kernel.org/show_bug.cgi?id=221177 Fixes: 6c3ea155e5ee ("Bluetooth: L2CAP: Fix not tracking outstanding TX ident") Signed-off-by: Luiz Augusto von Dentz Tested-by: Christian Eggers --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_core.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 010f1a8fd15f..5172afee5494 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -658,6 +658,7 @@ struct l2cap_conn { struct sk_buff *rx_skb; __u32 rx_len; struct ida tx_ida; + __u8 tx_ident; struct sk_buff_head pending_rx; struct work_struct pending_rx_work; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 30fd6848938e..3de3e3c8e966 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -926,16 +926,39 @@ int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator) static int l2cap_get_ident(struct l2cap_conn *conn) { + u8 max; + int ident; + /* LE link does not support tools like l2ping so use the full range */ if (conn->hcon->type == LE_LINK) - return ida_alloc_range(&conn->tx_ida, 1, 255, GFP_ATOMIC); - + max = 255; /* Get next available identificator. * 1 - 128 are used by kernel.
* 129 - 199 are reserved. * 200 - 254 are used by utilities like l2ping, etc. */ - return ida_alloc_range(&conn->tx_ida, 1, 128, GFP_ATOMIC); + else + max = 128; + + /* Allocate ident using min as last used + 1 (cyclic) */ + ident = ida_alloc_range(&conn->tx_ida, READ_ONCE(conn->tx_ident) + 1, + max, GFP_ATOMIC); + /* Force min 1 to start over */ + if (ident <= 0) { + ident = ida_alloc_range(&conn->tx_ida, 1, max, GFP_ATOMIC); + if (ident <= 0) { + /* If all idents are in use, log an error, this is + * extremely unlikely to happen and would indicate a bug + * in the code that idents are not being freed properly. + */ + BT_ERR("Unable to allocate ident: %d", ident); + return 0; + } + } + + WRITE_ONCE(conn->tx_ident, ident); + + return ident; } static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, -- cgit v1.2.3 From 4527025d440ce84bf56e75ce1df2e84cb8178616 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Mar 2026 12:33:34 -0700 Subject: nfc: nci: fix circular locking dependency in nci_close_device nci_close_device() flushes rx_wq and tx_wq while holding req_lock. This causes a circular locking dependency because nci_rx_work() running on rx_wq can end up taking req_lock too: nci_rx_work -> nci_rx_data_packet -> nci_data_exchange_complete -> __sk_destruct -> rawsock_destruct -> nfc_deactivate_target -> nci_deactivate_target -> nci_request -> mutex_lock(&ndev->req_lock) Move the flush of rx_wq after req_lock has been released. This should be safe (I think) because NCI_UP has already been cleared and the transport is closed, so the work will see it and return -ENETDOWN. NIPA has been hitting this running the nci selftest with a debug kernel on roughly 4% of the runs.
Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reviewed-by: Ian Ray Link: https://patch.msgid.link/20260317193334.988609-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 43d871525dbc..5f46c4b5720f 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -579,8 +579,7 @@ static int nci_close_device(struct nci_dev *ndev) skb_queue_purge(&ndev->rx_q); skb_queue_purge(&ndev->tx_q); - /* Flush RX and TX wq */ - flush_workqueue(ndev->rx_wq); + /* Flush TX wq, RX wq flush can't be under the lock */ flush_workqueue(ndev->tx_wq); /* Reset device */ @@ -592,13 +591,13 @@ static int nci_close_device(struct nci_dev *ndev) msecs_to_jiffies(NCI_RESET_TIMEOUT)); /* After this point our queues are empty - * and no works are scheduled. + * rx work may be running but will see that NCI_UP was cleared */ ndev->ops->close(ndev); clear_bit(NCI_INIT, &ndev->flags); - /* Flush cmd wq */ + /* Flush cmd and tx wq */ flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); @@ -613,6 +612,9 @@ static int nci_close_device(struct nci_dev *ndev) mutex_unlock(&ndev->req_lock); + /* rx_work may take req_lock via nci_deactivate_target */ + flush_workqueue(ndev->rx_wq); + return 0; } -- cgit v1.2.3 From 7c770dadfda5cbbde6aa3c4363ed513f1d212bf8 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 18 Mar 2026 16:55:51 +0100 Subject: net: openvswitch: Avoid releasing netdev before teardown completes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch cited in the Fixes tag below changed the teardown code for OVS ports to no longer unconditionally take the RTNL. After this change, the netdev_destroy() callback can proceed immediately to the call_rcu() invocation if the IFF_OVS_DATAPATH flag is already cleared on the netdev. 
The ovs_netdev_detach_dev() function clears the flag before completing the unregistration, and if it gets preempted after clearing the flag (as can happen on an -rt kernel), netdev_destroy() can complete and the device can be freed before the unregistration completes. This leads to a splat like: [ 998.393867] Oops: general protection fault, probably for non-canonical address 0xff00000001000239: 0000 [#1] SMP PTI [ 998.393877] CPU: 42 UID: 0 PID: 55177 Comm: ip Kdump: loaded Not tainted 6.12.0-211.1.1.el10_2.x86_64+rt #1 PREEMPT_RT [ 998.393886] Hardware name: Dell Inc. PowerEdge R740/0JMK61, BIOS 2.24.0 03/27/2025 [ 998.393889] RIP: 0010:dev_set_promiscuity+0x8d/0xa0 [ 998.393901] Code: 00 00 75 d8 48 8b 53 08 48 83 ba b0 02 00 00 00 75 ca 48 83 c4 08 5b c3 cc cc cc cc 48 83 bf 48 09 00 00 00 75 91 48 8b 47 08 <48> 83 b8 b0 02 00 00 00 74 97 eb 81 0f 1f 80 00 00 00 00 90 90 90 [ 998.393906] RSP: 0018:ffffce5864a5f6a0 EFLAGS: 00010246 [ 998.393912] RAX: ff00000000ffff89 RBX: ffff894d0adf5a05 RCX: 0000000000000000 [ 998.393917] RDX: 0000000000000000 RSI: 00000000ffffffff RDI: ffff894d0adf5a05 [ 998.393921] RBP: ffff894d19252000 R08: ffff894d19252000 R09: 0000000000000000 [ 998.393924] R10: ffff894d19252000 R11: ffff894d192521b8 R12: 0000000000000006 [ 998.393927] R13: ffffce5864a5f738 R14: 00000000ffffffe2 R15: 0000000000000000 [ 998.393931] FS: 00007fad61971800(0000) GS:ffff894cc0140000(0000) knlGS:0000000000000000 [ 998.393936] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 998.393940] CR2: 000055df0a2a6e40 CR3: 000000011c7fe003 CR4: 00000000007726f0 [ 998.393944] PKRU: 55555554 [ 998.393946] Call Trace: [ 998.393949] [ 998.393952] ? show_trace_log_lvl+0x1b0/0x2f0 [ 998.393961] ? show_trace_log_lvl+0x1b0/0x2f0 [ 998.393975] ? dp_device_event+0x41/0x80 [openvswitch] [ 998.394009] ? __die_body.cold+0x8/0x12 [ 998.394016] ? die_addr+0x3c/0x60 [ 998.394027] ? exc_general_protection+0x16d/0x390 [ 998.394042] ? asm_exc_general_protection+0x26/0x30 [ 998.394058] ? 
dev_set_promiscuity+0x8d/0xa0 [ 998.394066] ? ovs_netdev_detach_dev+0x3a/0x80 [openvswitch] [ 998.394092] dp_device_event+0x41/0x80 [openvswitch] [ 998.394102] notifier_call_chain+0x5a/0xd0 [ 998.394106] unregister_netdevice_many_notify+0x51b/0xa60 [ 998.394110] rtnl_dellink+0x169/0x3e0 [ 998.394121] ? rt_mutex_slowlock.constprop.0+0x95/0xd0 [ 998.394125] rtnetlink_rcv_msg+0x142/0x3f0 [ 998.394128] ? avc_has_perm_noaudit+0x69/0xf0 [ 998.394130] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 998.394132] netlink_rcv_skb+0x50/0x100 [ 998.394138] netlink_unicast+0x292/0x3f0 [ 998.394141] netlink_sendmsg+0x21b/0x470 [ 998.394145] ____sys_sendmsg+0x39d/0x3d0 [ 998.394149] ___sys_sendmsg+0x9a/0xe0 [ 998.394156] __sys_sendmsg+0x7a/0xd0 [ 998.394160] do_syscall_64+0x7f/0x170 [ 998.394162] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 998.394165] RIP: 0033:0x7fad61bf4724 [ 998.394188] Code: 89 02 b8 ff ff ff ff eb bb 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 80 3d c5 e9 0c 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 48 83 ec 28 89 54 24 1c 48 89 [ 998.394189] RSP: 002b:00007ffd7e2f7cb8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e [ 998.394191] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fad61bf4724 [ 998.394193] RDX: 0000000000000000 RSI: 00007ffd7e2f7d20 RDI: 0000000000000003 [ 998.394194] RBP: 00007ffd7e2f7d90 R08: 0000000000000010 R09: 000000000000003f [ 998.394195] R10: 000055df11558010 R11: 0000000000000202 R12: 00007ffd7e2f8380 [ 998.394196] R13: 0000000069b233d7 R14: 000055df0a256040 R15: 0000000000000000 [ 998.394200] To fix this, reorder the operations in ovs_netdev_detach_dev() to only clear the flag after completing the other operations, and introduce an smp_wmb() to make the ordering requirement explicit. The smp_wmb() is paired with a full smp_mb() in netdev_destroy() to make sure the call_rcu() invocation does not happen before the unregister operations are visible. 
Reported-by: Minxi Hou Tested-by: Minxi Hou Fixes: 549822767630 ("net: openvswitch: Avoid needlessly taking the RTNL on vport destroy") Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260318155554.1133405-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/openvswitch/vport-netdev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 6574f9bcdc02..c688dee96503 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -151,11 +151,15 @@ static void vport_netdev_free(struct rcu_head *rcu) void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); - vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); + + /* paired with smp_mb() in netdev_destroy() */ + smp_wmb(); + + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; } static void netdev_destroy(struct vport *vport) @@ -174,6 +178,9 @@ static void netdev_destroy(struct vport *vport) rtnl_unlock(); } + /* paired with smp_wmb() in ovs_netdev_detach_dev() */ + smp_mb(); + call_rcu(&vport->rcu, vport_netdev_free); } -- cgit v1.2.3 From 6931d21f87bc6d657f145798fad0bf077b82486c Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 19 Mar 2026 07:42:41 +0000 Subject: openvswitch: defer tunnel netdev_put to RCU release ovs_netdev_tunnel_destroy() may run after NETDEV_UNREGISTER already detached the device. Dropping the netdev reference in destroy can race with concurrent readers that still observe vport->dev. Do not release vport->dev in ovs_netdev_tunnel_destroy(). Instead, let vport_netdev_free() drop the reference from the RCU callback, matching the non-tunnel destroy path and avoiding additional synchronization under RTNL. 
Fixes: a9020fde67a6 ("openvswitch: Move tunnel destroy function to oppenvswitch module.") Reported-by: Yifan Wu Reported-by: Juefei Pu Tested-by: Ao Zhou Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Yang Yang Reviewed-by: Ilya Maximets Link: https://patch.msgid.link/20260319074241.3405262-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/openvswitch/vport-netdev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index c688dee96503..12055af832dc 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -196,8 +196,6 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); - netdev_put(vport->dev, &vport->dev_tracker); - vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); -- cgit v1.2.3 From 546b68ac893595877ffbd7751e5c55fd1c43ede6 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 19 Mar 2026 08:02:27 +0000 Subject: openvswitch: validate MPLS set/set_masked payload length validate_set() accepted OVS_KEY_ATTR_MPLS as variable-sized payload for SET/SET_MASKED actions. In action handling, OVS expects fixed-size MPLS key data (struct ovs_key_mpls). Use the already normalized key_len (masked case included) and reject non-matching MPLS action key sizes. Reject invalid MPLS action payload lengths early. 
Fixes: fbdcdd78da7c ("Change in Openvswitch to support MPLS label depth of 3 in ingress direction") Reported-by: Yifan Wu Reported-by: Juefei Pu Tested-by: Ao Zhou Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Yang Yang Reviewed-by: Ilya Maximets Link: https://patch.msgid.link/20260319080228.3423307-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 67fbf6e48a30..13052408a132 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2953,6 +2953,8 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_MPLS: if (!eth_p_mpls(eth_type)) return -EINVAL; + if (key_len != sizeof(struct ovs_key_mpls)) + return -EINVAL; break; case OVS_KEY_ATTR_SCTP: -- cgit v1.2.3 From 24dd586bb4cbba1889a50abe74143817a095c1c9 Mon Sep 17 00:00:00 2001 From: Qi Tang Date: Wed, 18 Mar 2026 14:48:47 +0800 Subject: net/smc: fix double-free of smc_spd_priv when tee() duplicates splice pipe buffer smc_rx_splice() allocates one smc_spd_priv per pipe_buffer and stores the pointer in pipe_buffer.private. The pipe_buf_operations for these buffers used .get = generic_pipe_buf_get, which only increments the page reference count when tee(2) duplicates a pipe buffer. The smc_spd_priv pointer itself was not handled, so after tee() both the original and the cloned pipe_buffer share the same smc_spd_priv *. 
When both pipes are subsequently released, smc_rx_pipe_buf_release() is called twice against the same object: 1st call: kfree(priv) sock_put(sk) smc_rx_update_cons() [correct] 2nd call: kfree(priv) sock_put(sk) smc_rx_update_cons() [UAF] KASAN reports a slab-use-after-free in smc_rx_pipe_buf_release(), which then escalates to a NULL-pointer dereference and kernel panic via smc_rx_update_consumer() when it chases the freed priv->smc pointer: BUG: KASAN: slab-use-after-free in smc_rx_pipe_buf_release+0x78/0x2a0 Read of size 8 at addr ffff888004a45740 by task smc_splice_tee_/74 Call Trace: dump_stack_lvl+0x53/0x70 print_report+0xce/0x650 kasan_report+0xc6/0x100 smc_rx_pipe_buf_release+0x78/0x2a0 free_pipe_info+0xd4/0x130 pipe_release+0x142/0x160 __fput+0x1c6/0x490 __x64_sys_close+0x4f/0x90 do_syscall_64+0xa6/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f BUG: kernel NULL pointer dereference, address: 0000000000000020 RIP: 0010:smc_rx_update_consumer+0x8d/0x350 Call Trace: smc_rx_pipe_buf_release+0x121/0x2a0 free_pipe_info+0xd4/0x130 pipe_release+0x142/0x160 __fput+0x1c6/0x490 __x64_sys_close+0x4f/0x90 do_syscall_64+0xa6/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Kernel panic - not syncing: Fatal exception Beyond the memory-safety problem, duplicating an SMC splice buffer is semantically questionable: smc_rx_update_cons() would advance the consumer cursor twice for the same data, corrupting receive-window accounting. A refcount on smc_spd_priv could fix the double-free, but the cursor-accounting issue would still need to be addressed separately. The .get callback is invoked by both tee(2) and splice_pipe_to_pipe() for partial transfers; both will now return -EFAULT. Users who need to duplicate SMC socket data must use a copy-based read path. 
Fixes: 9014db202cb7 ("smc: add support for splice()") Signed-off-by: Qi Tang Link: https://patch.msgid.link/20260318064847.23341-1-tpluszz77@gmail.com Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index d833e36f7fd4..c1d9b923938d 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -135,9 +135,16 @@ out: sock_put(sk); } +static bool smc_rx_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + /* smc_spd_priv in buf->private is not shareable; disallow cloning. */ + return false; +} + static const struct pipe_buf_operations smc_pipe_ops = { .release = smc_rx_pipe_buf_release, - .get = generic_pipe_buf_get + .get = smc_rx_pipe_buf_get, }; static void smc_rx_spd_release(struct splice_pipe_desc *spd, -- cgit v1.2.3 From 52501989c76206462d9b11a8485beef40ef41821 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 20 Mar 2026 00:02:52 +0100 Subject: rtnetlink: count IFLA_PARENT_DEV_{NAME,BUS_NAME} in if_nlmsg_size Commit 00e77ed8e64d ("rtnetlink: add IFLA_PARENT_[DEV|DEV_BUS]_NAME") added those attributes to rtnl_fill_ifinfo, but forgot to extend if_nlmsg_size. 
Fixes: 00e77ed8e64d ("rtnetlink: add IFLA_PARENT_[DEV|DEV_BUS]_NAME") Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/0b849da95562af45487080528d60f578636aba5c.1773919462.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dad4b1054955..0fe279432d82 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1267,6 +1267,21 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) return size; } +static size_t rtnl_dev_parent_size(const struct net_device *dev) +{ + size_t size = 0; + + /* IFLA_PARENT_DEV_NAME */ + if (dev->dev.parent) + size += nla_total_size(strlen(dev_name(dev->dev.parent)) + 1); + + /* IFLA_PARENT_DEV_BUS_NAME */ + if (dev->dev.parent && dev->dev.parent->bus) + size += nla_total_size(strlen(dev->dev.parent->bus->name) + 1); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1328,6 +1343,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + nla_total_size(2) /* IFLA_HEADROOM */ + nla_total_size(2) /* IFLA_TAILROOM */ + + rtnl_dev_parent_size(dev) + 0; if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) -- cgit v1.2.3 From ee00a12593ffb69db4dd1a1c00ecb0253376874a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 20 Mar 2026 00:02:53 +0100 Subject: rtnetlink: count IFLA_INFO_SLAVE_KIND in if_nlmsg_size rtnl_link_get_slave_info_data_size counts IFLA_INFO_SLAVE_DATA, but rtnl_link_slave_info_fill adds both IFLA_INFO_SLAVE_DATA and IFLA_INFO_SLAVE_KIND. 
Fixes: ba7d49b1f0f8 ("rtnetlink: provide api for getting and setting slave info") Reviewed-by: Jiri Pirko Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/049843b532e23cde7ddba263c0bbe35ba6f0d26d.1773919462.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0fe279432d82..4a2278614250 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -707,11 +707,14 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) goto out; ops = master_dev->rtnl_link_ops; - if (!ops || !ops->get_slave_size) + if (!ops) + goto out; + size += nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_SLAVE_KIND */ + if (!ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ - size = nla_total_size(sizeof(struct nlattr)) + - ops->get_slave_size(master_dev, dev); + size += nla_total_size(sizeof(struct nlattr)) + + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); -- cgit v1.2.3 From 6af51e9f31336632263c4680b2a3712295103e1f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 20 Mar 2026 07:22:59 +0000 Subject: ipv6: Remove permanent routes from tb6_gc_hlist when all exceptions expire. Commit 5eb902b8e719 ("net/ipv6: Remove expired routes with a separated list of routes.") introduced a per-table GC list and changed GC to iterate over that list instead of traversing the entire route table. However, it forgot to add permanent routes to tb6_gc_hlist when exception routes are added. Commit cfe82469a00f ("ipv6: add exception routes to GC list in rt6_insert_exception") fixed that issue but introduced another one. Even after all exception routes expire, the permanent routes remain in tb6_gc_hlist, potentially negating the performance benefits intended by the initial change. 
Let's count gc_args->more before and after rt6_age_exceptions() and remove the permanent route when the delta is 0. Note that the next patch will reuse fib6_age_exceptions(). Fixes: cfe82469a00f ("ipv6: add exception routes to GC list in rt6_insert_exception") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Xin Long Reviewed-by: David Ahern Link: https://patch.msgid.link/20260320072317.2561779-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9058e71241dc..fadfca49d6b1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2348,6 +2348,17 @@ static void fib6_flush_trees(struct net *net) /* * Garbage collection */ +static void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now) +{ + bool may_expire = rt->fib6_flags & RTF_EXPIRES && rt->expires; + int old_more = gc_args->more; + + rt6_age_exceptions(rt, gc_args, now); + + if (!may_expire && old_more == gc_args->more) + fib6_remove_gc_list(rt); +} static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { @@ -2370,7 +2381,7 @@ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) * Note, that clones are aged out * only if they are not in use now. */ - rt6_age_exceptions(rt, gc_args, now); + fib6_age_exceptions(rt, gc_args, now); return 0; } -- cgit v1.2.3 From 4be7b99c253f0c85a255cc1db7127ba3232dfa30 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 20 Mar 2026 07:23:00 +0000 Subject: ipv6: Don't remove permanent routes with exceptions from tb6_gc_hlist. The cited commit mechanically put fib6_remove_gc_list() just after every fib6_clean_expires() call. When a temporary route is promoted to a permanent route, there may already be exception routes tied to it. If fib6_remove_gc_list() removes the route from tb6_gc_hlist, such exception routes will no longer be aged. 
Let's replace fib6_remove_gc_list() with a new helper fib6_may_remove_gc_list() and use fib6_age_exceptions() there. Note that net->ipv6 is only compiled when CONFIG_IPV6 is enabled, so fib6_{add,remove,may_remove}_gc_list() are guarded. Fixes: 5eb902b8e719 ("net/ipv6: Remove expired routes with a separated list of routes.") Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20260320072317.2561779-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 21 ++++++++++++++++++++- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6_fib.c | 6 +++--- net/ipv6/route.c | 2 +- 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 88b0dd4d8e09..9f8b6814a96a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -507,12 +507,14 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); +void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); void fib6_run_gc(unsigned long expires, struct net *net, bool force); - void fib6_gc_cleanup(void); int fib6_init(void); +#if IS_ENABLED(CONFIG_IPV6) /* Add the route to the gc list if it is not already there * * The callers should hold f6i->fib6_table->tb6_lock. 
@@ -545,6 +547,23 @@ static inline void fib6_remove_gc_list(struct fib6_info *f6i) hlist_del_init(&f6i->gc_link); } +static inline void fib6_may_remove_gc_list(struct net *net, + struct fib6_info *f6i) +{ + struct fib6_gc_args gc_args; + + if (hlist_unhashed(&f6i->gc_link)) + return; + + gc_args.timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); + gc_args.more = 0; + + rcu_read_lock(); + fib6_age_exceptions(f6i, &gc_args, jiffies); + rcu_read_unlock(); +} +#endif + struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0e55f139e05d..f4e23b543585 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2862,7 +2862,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) fib6_add_gc_list(rt); } else { fib6_clean_expires(rt); - fib6_remove_gc_list(rt); + fib6_may_remove_gc_list(net, rt); } spin_unlock_bh(&table->tb6_lock); @@ -4840,7 +4840,7 @@ static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, if (!(flags & RTF_EXPIRES)) { fib6_clean_expires(f6i); - fib6_remove_gc_list(f6i); + fib6_may_remove_gc_list(net, f6i); } else { fib6_set_expires(f6i, expires); fib6_add_gc_list(f6i); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index fadfca49d6b1..dd26657b6a4a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1133,7 +1133,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); - fib6_remove_gc_list(iter); + fib6_may_remove_gc_list(info->nl_net, iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); @@ -2348,8 +2348,8 @@ static void fib6_flush_trees(struct net *net) /* * Garbage collection */ -static void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, - unsigned long now) +void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now) { 
bool may_expire = rt->fib6_flags & RTF_EXPIRES && rt->expires; int old_more = gc_args->more; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 08cd86f49bf9..cb521700cee7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1033,7 +1033,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); - fib6_remove_gc_list(rt); + fib6_may_remove_gc_list(net, rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); -- cgit v1.2.3 From 42156f93d123436f2a27c468f18c966b7e5db796 Mon Sep 17 00:00:00 2001 From: Yochai Eisenrich Date: Thu, 19 Mar 2026 22:06:10 +0200 Subject: net: fix fanout UAF in packet_release() via NETDEV_UP race `packet_release()` has a race window where `NETDEV_UP` can re-register a socket into a fanout group's `arr[]` array. The re-registration is not cleaned up by `fanout_release()`, leaving a dangling pointer in the fanout array. `packet_release()` does NOT zero `po->num` in its `bind_lock` section. After releasing `bind_lock`, `po->num` is still non-zero and `po->ifindex` still matches the bound device. A concurrent `packet_notifier(NETDEV_UP)` that already found the socket in `sklist` can re-register the hook. For fanout sockets, this re-registration calls `__fanout_link(sk, po)` which adds the socket back into `f->arr[]` and increments `f->num_members`, but does NOT increment `f->sk_ref`. The fix sets `po->num` to zero in `packet_release` while `bind_lock` is held to prevent NETDEV_UP from linking, preventing the race window. This bug was found following an additional audit with Claude Code based on CVE-2025-38617. 
Fixes: ce06b03e60fc ("packet: Add helpers to register/unregister ->prot_hook") Link: https://blog.calif.io/p/a-race-within-a-race-exploiting-cve Signed-off-by: Yochai Eisenrich Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260319200610.25101-1-echelonh@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 72d0935139f0..bb2d88205e5a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3135,6 +3135,7 @@ static int packet_release(struct socket *sock) spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); + WRITE_ONCE(po->num, 0); packet_cached_dev_reset(po); if (po->prot_hook.dev) { -- cgit v1.2.3 From e537dd15d0d4ad989d56a1021290f0c674dd8b28 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 19 Mar 2026 11:18:17 -0700 Subject: udp: Fix wildcard bind conflict check when using hash2 When binding a udp_sock to a local address and port, UDP uses two hashes (udptable->hash and udptable->hash2) for collision detection. The current code switches to "hash2" when hslot->count > 10. "hash2" is keyed by local address and local port. "hash" is keyed by local port only. The issue can be shown in the following bind sequence (pseudo code): bind(fd1, "[fd00::1]:8888") bind(fd2, "[fd00::2]:8888") bind(fd3, "[fd00::3]:8888") bind(fd4, "[fd00::4]:8888") bind(fd5, "[fd00::5]:8888") bind(fd6, "[fd00::6]:8888") bind(fd7, "[fd00::7]:8888") bind(fd8, "[fd00::8]:8888") bind(fd9, "[fd00::9]:8888") bind(fd10, "[fd00::10]:8888") /* Correctly return -EADDRINUSE because "hash" is used * instead of "hash2". udp_lib_lport_inuse() detects the * conflict. */ bind(fail_fd, "[::]:8888") /* After one more socket is bound to "[fd00::11]:8888", * hslot->count exceeds 10 and "hash2" is used instead. 
*/ bind(fd11, "[fd00::11]:8888") bind(fail_fd, "[::]:8888") /* succeeds unexpectedly */ The same issue applies to the IPv4 wildcard address "0.0.0.0" and the IPv4-mapped wildcard address "::ffff:0.0.0.0". For example, if there are existing sockets bound to "192.168.1.[1-11]:8888", then binding "0.0.0.0:8888" or "[::ffff:0.0.0.0]:8888" can also miss the conflict when hslot->count > 10. TCP inet_csk_get_port() already has the correct check in inet_use_bhash2_on_bind(). Rename it to inet_use_hash2_on_bind() and move it to inet_hashtables.h so udp.c can reuse it in this fix. Fixes: 30fff9231fad ("udp: bind() optimisation") Reported-by: Andrew Onyshchuk Signed-off-by: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260319181817.1901357-1-martin.lau@linux.dev Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 14 ++++++++++++++ net/ipv4/inet_connection_sock.c | 20 +++----------------- net/ipv4/udp.c | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5a979dcab538..6d936e9f2fd3 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -264,6 +264,20 @@ inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } +static inline bool inet_use_hash2_on_bind(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return false; + + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return true; + } +#endif + return sk->sk_rcv_saddr != htonl(INADDR_ANY); +} + struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5dfac6ce1110..e961936b6be7 100644 --- a/net/ipv4/inet_connection_sock.c +++ 
b/net/ipv4/inet_connection_sock.c @@ -154,20 +154,6 @@ bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) } EXPORT_SYMBOL(inet_sk_get_local_port_range); -static bool inet_use_bhash2_on_bind(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - return false; - - if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return true; - } -#endif - return sk->sk_rcv_saddr != htonl(INADDR_ANY); -} - static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) @@ -259,7 +245,7 @@ static int inet_csk_bind_conflict(const struct sock *sk, * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ - if (inet_use_bhash2_on_bind(sk)) + if (inet_use_hash2_on_bind(sk)) return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); @@ -376,7 +362,7 @@ other_parity_scan: head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); - if (inet_use_bhash2_on_bind(sk)) { + if (inet_use_hash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } @@ -562,7 +548,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) check_bind_conflict = false; } - if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { + if (check_bind_conflict && inet_use_hash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b60fad393e18..cb99a3c27053 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -287,7 +287,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); - if (hslot->count > 10) { + if (inet_use_hash2_on_bind(sk) && hslot->count > 10) { int exist; unsigned 
int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; -- cgit v1.2.3 From c4336a07eb6b2526dc2b62928b5104b41a7f81f5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 20 Mar 2026 15:01:46 -0400 Subject: net: correctly handle tunneled traffic on IPV6_CSUM GSO fallback NETIF_F_IPV6_CSUM only advertises support for checksum offload of packets without IPv6 extension headers. Packets with extension headers must fall back onto software checksumming. Since TSO depends on checksum offload, those must revert to GSO. The below commit introduces that fallback. It always checks network header length. For tunneled packets, the inner header length must be checked instead. Extend the check accordingly. A special case is tunneled packets without inner IP protocol. Such as RFC 6951 SCTP in UDP. Those are not standard IPv6 followed by transport header either, so also must revert to the software GSO path. Cc: stable@vger.kernel.org Fixes: 864e3396976e ("net: gso: Forbid IPv6 TSO with extensions on devices with only IPV6_CSUM") Reported-by: Tangxin Xie Closes: https://lore.kernel.org/netdev/0414e7e2-9a1c-4d7c-a99d-b9039cf68f40@yeah.net/ Suggested-by: Paolo Abeni Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20260320190148.2409107-1-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 14a83f2035b9..fc5557062414 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3769,6 +3769,22 @@ static netdev_features_t dflt_features_check(struct sk_buff *skb, return vlan_features_check(skb, features); } +static bool skb_gso_has_extension_hdr(const struct sk_buff *skb) +{ + if (!skb->encapsulation) + return ((skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + 
skb_network_header_len(skb) != sizeof(struct ipv6hdr)); + else + return (!skb_inner_network_header_was_set(skb) || + ((skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + inner_ip_hdr(skb)->version == 6)) && + skb_inner_network_header_len(skb) != sizeof(struct ipv6hdr))); +} + static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) @@ -3816,11 +3832,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, * so neither does TSO that depends on it. */ if (features & NETIF_F_IPV6_CSUM && - (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || - (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && - vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && - skb_transport_header_was_set(skb) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_gso_has_extension_hdr(skb)) features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); return features; -- cgit v1.2.3 From 09474055f2619be9445ba4245e4013741ed01a5e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 23 Mar 2026 16:19:43 +0100 Subject: rtnetlink: fix leak of SRCU struct in rtnl_link_register Commit 6b57ff21a310 ("rtnetlink: Protect link_ops by mutex.") swapped the EEXIST check with the init_srcu_struct, but didn't add cleanup of the SRCU struct we just allocated in case of error. 
Fixes: 6b57ff21a310 ("rtnetlink: Protect link_ops by mutex.") Signed-off-by: Sabrina Dubroca Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/e77fe499f9a58c547b33b5212b3596dad417cec6.1774025341.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4a2278614250..fae8034efbff 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -629,6 +629,9 @@ int rtnl_link_register(struct rtnl_link_ops *ops) unlock: mutex_unlock(&link_ops_mutex); + if (err) + cleanup_srcu_struct(&ops->srcu); + return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); -- cgit v1.2.3 From f39f905e55f529b036321220af1ba4f4085564a5 Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Thu, 19 Mar 2026 17:32:11 +0800 Subject: Bluetooth: L2CAP: Fix send LE flow credits in ACL link When the L2CAP channel mode is L2CAP_MODE_ERTM/L2CAP_MODE_STREAMING, l2cap_publish_rx_avail will be called and le flow credits will be sent in l2cap_chan_rx_avail, even though the link type is ACL. 
The logs in question as follows: > ACL Data RX: Handle 129 flags 0x02 dlen 12 L2CAP: Unknown (0x16) ident 4 len 4 40 00 ed 05 < ACL Data TX: Handle 129 flags 0x00 dlen 10 L2CAP: Command Reject (0x01) ident 4 len 2 Reason: Command not understood (0x0000) Bluetooth: Unknown BR/EDR signaling command 0x16 Bluetooth: Wrong link type (-22) Fixes: ce60b9231b66 ("Bluetooth: compute LE flow credits based on recvbuf space") Signed-off-by: Zhang Chen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 3de3e3c8e966..6fd884203dc6 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6630,6 +6630,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) struct l2cap_le_credits pkt; u16 return_credits = l2cap_le_rx_credits(chan); + if (chan->mode != L2CAP_MODE_LE_FLOWCTL && + chan->mode != L2CAP_MODE_EXT_FLOWCTL) + return; + if (chan->rx_credits >= return_credits) return; -- cgit v1.2.3 From 00fdebbbc557a2fc21321ff2eaa22fd70c078608 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 20 Mar 2026 20:01:26 +0900 Subject: Bluetooth: L2CAP: Fix deadlock in l2cap_conn_del() l2cap_conn_del() calls cancel_delayed_work_sync() for both info_timer and id_addr_timer while holding conn->lock. However, the work functions l2cap_info_timeout() and l2cap_conn_update_id_addr() both acquire conn->lock, creating a potential AB-BA deadlock if the work is already executing when l2cap_conn_del() takes the lock. Move the work cancellations before acquiring conn->lock and use disable_delayed_work_sync() to additionally prevent the works from being rearmed after cancellation, consistent with the pattern used in hci_conn_del(). 
Fixes: ab4eedb790ca ("Bluetooth: L2CAP: Fix corrupted list in hci_chan_del") Signed-off-by: Hyunwoo Kim Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 6fd884203dc6..2603c98d7ed1 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1771,6 +1771,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + disable_delayed_work_sync(&conn->info_timer); + disable_delayed_work_sync(&conn->id_addr_timer); + mutex_lock(&conn->lock); kfree_skb(conn->rx_skb); @@ -1786,8 +1789,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) ida_destroy(&conn->tx_ida); - cancel_delayed_work_sync(&conn->id_addr_timer); - l2cap_unregister_all_users(conn); /* Force the connection to be immediately dropped */ @@ -1806,9 +1807,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) - cancel_delayed_work_sync(&conn->info_timer); - hci_chan_del(conn->hchan); conn->hchan = NULL; -- cgit v1.2.3 From 25f420a0d4cfd61d3d23ec4b9c56d9f443d91377 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 20 Mar 2026 20:23:10 +0900 Subject: Bluetooth: L2CAP: Fix ERTM re-init and zero pdu_len infinite loop l2cap_config_req() processes CONFIG_REQ for channels in BT_CONNECTED state to support L2CAP reconfiguration (e.g. MTU changes). However, since both CONF_INPUT_DONE and CONF_OUTPUT_DONE are already set from the initial configuration, the reconfiguration path falls through to l2cap_ertm_init(), which re-initializes tx_q, srej_q, srej_list, and retrans_list without freeing the previous allocations and sets chan->sdu to NULL without freeing the existing skb. This leaks all previously allocated ERTM resources. 
Additionally, l2cap_parse_conf_req() does not validate the minimum value of remote_mps derived from the RFC max_pdu_size option. A zero value propagates to l2cap_segment_sdu() where pdu_len becomes zero, causing the while loop to never terminate since len is never decremented, exhausting all available memory. Fix the double-init by skipping l2cap_ertm_init() and l2cap_chan_ready() when the channel is already in BT_CONNECTED state, while still allowing the reconfiguration parameters to be updated through l2cap_parse_conf_req(). Also add a pdu_len zero check in l2cap_segment_sdu() as a safeguard. Fixes: 96298f640104 ("Bluetooth: L2CAP: handle l2cap config request during open state") Signed-off-by: Hyunwoo Kim Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2603c98d7ed1..95c65fece39b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2398,6 +2398,9 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, /* Remote device may have requested smaller PDUs */ pdu_len = min_t(size_t, pdu_len, chan->remote_mps); + if (!pdu_len) + return -EINVAL; + if (len <= pdu_len) { sar = L2CAP_SAR_UNSEGMENTED; sdu_len = 0; @@ -4333,14 +4336,16 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) { set_default_fcs(chan); - if (chan->mode == L2CAP_MODE_ERTM || - chan->mode == L2CAP_MODE_STREAMING) - err = l2cap_ertm_init(chan); + if (chan->state != BT_CONNECTED) { + if (chan->mode == L2CAP_MODE_ERTM || + chan->mode == L2CAP_MODE_STREAMING) + err = l2cap_ertm_init(chan); - if (err < 0) - l2cap_send_disconn_req(chan, -err); - else - l2cap_chan_ready(chan); + if (err < 0) + l2cap_send_disconn_req(chan, -err); + else + l2cap_chan_ready(chan); + } goto unlock; } -- cgit v1.2.3 From d3c0037ffe1273fa1961e779ff6906234d6cf53c 
Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 25 Mar 2026 14:10:55 +0100 Subject: netfilter: nft_set_pipapo_avx2: don't return non-matching entry on expiry New test case fails unexpectedly when avx2 matching functions are used. The test first loads a randomly generated pipapo set with 'ipv4 . port' key, i.e. nft -f foo. This works. Then, it reloads the set after a flush: (echo flush set t s; cat foo) | nft -f - This is expected to work, because it's the same set after all and it was already loaded once. But with avx2, this fails: nft reports a clashing element. The reported clash is of the following form: We successfully re-inserted a . b c . d Then we try to insert a . d avx2 finds the already existing a . d, which (due to 'flush set') is marked as invalid in the new generation. It skips the element and moves to the next. Due to incorrect masking, the skip-step finds the next matching element *only considering the first field*, i.e. we return the already reinserted "a . b", even though the last field is different and the entry should not have been matched. No such error is reported for the generic c implementation (no avx2) or when the last field has to use the 'nft_pipapo_avx2_lookup_slow' fallback. Bisection points to 7711f4bb4b36 ("netfilter: nft_set_pipapo: fix range overlap detection") but that fix merely uncovers this bug. Before this commit, the wrong element is returned, but erroneously reported as a full, identical duplicate. The root cause is a too-early return in the avx2 match functions. When we process the last field, we should continue to process data until the entire input size has been consumed to make sure no stale bits remain in the map.
Link: https://lore.kernel.org/netfilter-devel/20260321152506.037f68c0@elisabeth/ Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 7ff90325c97f..6395982e4d95 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -242,7 +242,7 @@ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -319,7 +319,7 @@ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -414,7 +414,7 @@ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -505,7 +505,7 @@ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -641,7 +641,7 @@ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -699,7 +699,7 @@ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / 
XSAVE_YMM_SIZE; @@ -764,7 +764,7 @@ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -839,7 +839,7 @@ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -925,7 +925,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; @@ -1019,7 +1019,7 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) - return b; + ret = b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; -- cgit v1.2.3 From 84a8335d8300576f1b377ae24abca1d9f197807f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Mar 2026 08:53:23 -0400 Subject: tls: Purge async_hold in tls_decrypt_async_wait() The async_hold queue pins encrypted input skbs while the AEAD engine references their scatterlist data. Once tls_decrypt_async_wait() returns, every AEAD operation has completed and the engine no longer references those skbs, so they can be freed unconditionally. A subsequent patch adds batch async decryption to tls_sw_read_sock(), introducing a new call site that must drain pending AEAD operations and release held skbs. Move __skb_queue_purge(&ctx->async_hold) into tls_decrypt_async_wait() so the purge is centralized and every caller -- recvmsg's drain path, the -EBUSY fallback in tls_do_decryption(), and the new read_sock batch path -- releases held skbs on synchronization without each site managing the purge independently. 
This fixes a leak when tls_strp_msg_hold() fails part-way through, after having added some cloned skbs to the async_hold queue. tls_decrypt_sg() will then call tls_decrypt_async_wait() to process all pending decrypts, and drop back to synchronous mode, but tls_sw_recvmsg() only flushes the async_hold queue when one record has been processed in "fully-async" mode, which may not be the case here. Signed-off-by: Chuck Lever Reported-by: Yiming Qian Fixes: b8a6ff84abbc ("tls: wait for pending async decryptions if tls_strp_msg_hold fails") Link: https://patch.msgid.link/20260324-tls-read-sock-v5-1-5408befe5774@oracle.com [pabeni@redhat.com: added leak comment] Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5fe07f110fe8..dd9dda759bbb 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -246,6 +246,7 @@ static int tls_decrypt_async_wait(struct tls_sw_context_rx *ctx) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); atomic_inc(&ctx->decrypt_pending); + __skb_queue_purge(&ctx->async_hold); return ctx->async_wait.err; } @@ -2225,7 +2226,6 @@ recv_end: /* Wait for all previously submitted records to be decrypted */ ret = tls_decrypt_async_wait(ctx); - __skb_queue_purge(&ctx->async_hold); if (ret) { if (err >= 0 || err == -EINPROGRESS) -- cgit v1.2.3 From 52025ebaa29f4eb4ed8bf92ce83a68f24ab7fdf7 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 25 Mar 2026 14:10:58 +0100 Subject: netfilter: nfnetlink_log: fix uninitialized padding leak in NFULA_PAYLOAD __build_packet_message() manually constructs the NFULA_PAYLOAD netlink attribute using skb_put() and skb_copy_bits(), bypassing the standard nla_reserve()/nla_put() helpers. While nla_total_size(data_len) bytes are allocated (including NLA alignment padding), only data_len bytes of actual packet data are copied. 
The trailing nla_padlen(data_len) bytes (1-3 when data_len is not 4-byte aligned) are never initialized, leaking stale heap contents to userspace via the NFLOG netlink socket. Replace the manual attribute construction with nla_reserve(), which handles the tailroom check, header setup, and padding zeroing via __nla_reserve(). The subsequent skb_copy_bits() fills in the payload data on top of the properly initialized attribute. Fixes: df6fb868d611 ("[NETFILTER]: nfnetlink: convert to generic netlink attribute functions") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b35a90955e2e..fcbe54940b2e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -647,15 +647,11 @@ __build_packet_message(struct nfnl_log_net *log, if (data_len) { struct nlattr *nla; - int size = nla_attr_size(data_len); - if (skb_tailroom(inst->skb) < nla_total_size(data_len)) + nla = nla_reserve(inst->skb, NFULA_PAYLOAD, data_len); + if (!nla) goto nla_put_failure; - nla = skb_put(inst->skb, nla_total_size(data_len)); - nla->nla_type = NFULA_PAYLOAD; - nla->nla_len = size; - if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) BUG(); } -- cgit v1.2.3 From 9d3f027327c2fa265f7f85ead41294792c3296ed Mon Sep 17 00:00:00 2001 From: Ren Wei Date: Wed, 25 Mar 2026 14:11:00 +0100 Subject: netfilter: ip6t_rt: reject oversized addrnr in rt_mt6_check() Reject rt match rules whose addrnr exceeds IP6T_RT_HOPS. rt_mt6() expects addrnr to stay within the bounds of rtinfo->addrs[]. Validate addrnr during rule installation so malformed rules are rejected before the match logic can use an out-of-range value. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Yuhang Zheng Signed-off-by: Ren Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_rt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 4ad8b2032f1f..5561bd9cea81 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -157,6 +157,10 @@ static int rt_mt6_check(const struct xt_mtchk_param *par) pr_debug("unknown flags %X\n", rtinfo->invflags); return -EINVAL; } + if (rtinfo->addrnr > IP6T_RT_HOPS) { + pr_debug("too many addresses specified\n"); + return -EINVAL; + } if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && (!(rtinfo->flags & IP6T_RT_TYP) || (rtinfo->rt_type != 0) || -- cgit v1.2.3 From fafdd92b9e30fe057740c5bb5cd4f92ecea9bf26 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 14:11:01 +0100 Subject: netfilter: nft_set_rbtree: revisit array resize logic Chris Arges reports high memory consumption with thousands of containers, this patch revisits the array allocation logic. For anonymous sets, start by 16 slots (which takes 256 bytes on x86_64). Expand it by x2 until threshold of 512 slots is reached, over that threshold, expand it by x1.5. For non-anonymous set, start by 1024 slots in the array (which takes 16 Kbytes initially on x86_64). Expand it by x1.5. Use set->ndeact to subtract deactivated elements when calculating the number of the slots in the array, otherwise the array size gets increased artificially. Add special case shrink logic to deal with flush set too. The shrink logic is skipped by anonymous sets. Use check_add_overflow() to calculate the new array size. Add a WARN_ON_ONCE check to make sure elements fit into the new array size.
Reported-by: Chris Arges Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 92 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index fe8bd497d74a..737c339decd0 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -572,14 +572,12 @@ static struct nft_array *nft_array_alloc(u32 max_intervals) return array; } -#define NFT_ARRAY_EXTRA_SIZE 10240 - /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider * packed representation coming from userspace for anonymous sets too. */ static u32 nft_array_elems(const struct nft_set *set) { - u32 nelems = atomic_read(&set->nelems); + u32 nelems = atomic_read(&set->nelems) - set->ndeact; /* Adjacent intervals are represented with a single start element in * anonymous sets, use the current element counter as is. 
@@ -595,27 +593,87 @@ static u32 nft_array_elems(const struct nft_set *set) return (nelems / 2) + 2; } -static int nft_array_may_resize(const struct nft_set *set) +#define NFT_ARRAY_INITIAL_SIZE 1024 +#define NFT_ARRAY_INITIAL_ANON_SIZE 16 +#define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval)) + +static int nft_array_may_resize(const struct nft_set *set, bool flush) { - u32 nelems = nft_array_elems(set), new_max_intervals; + u32 initial_intervals, max_intervals, new_max_intervals, delta; + u32 shrinked_max_intervals, nelems = nft_array_elems(set); struct nft_rbtree *priv = nft_set_priv(set); struct nft_array *array; - if (!priv->array_next) { - array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); - if (!array) - return -ENOMEM; + if (nft_set_is_anonymous(set)) + initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE; + else + initial_intervals = NFT_ARRAY_INITIAL_SIZE; + + if (priv->array_next) { + max_intervals = priv->array_next->max_intervals; + new_max_intervals = priv->array_next->max_intervals; + } else { + if (priv->array) { + max_intervals = priv->array->max_intervals; + new_max_intervals = priv->array->max_intervals; + } else { + max_intervals = 0; + new_max_intervals = initial_intervals; + } + } - priv->array_next = array; + if (nft_set_is_anonymous(set)) + goto maybe_grow; + + if (flush) { + /* Set flush just started, nelems still report elements.*/ + nelems = 0; + new_max_intervals = NFT_ARRAY_INITIAL_SIZE; + goto realloc_array; } - if (nelems < priv->array_next->max_intervals) - return 0; + if (check_add_overflow(new_max_intervals, new_max_intervals, + &shrinked_max_intervals)) + return -EOVERFLOW; + + shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3); - new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; - if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) + if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE && + nelems < shrinked_max_intervals) { + new_max_intervals 
= shrinked_max_intervals; + goto realloc_array; + } +maybe_grow: + if (nelems > new_max_intervals) { + if (nft_set_is_anonymous(set) && + new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) { + new_max_intervals <<= 1; + } else { + delta = new_max_intervals >> 1; + if (check_add_overflow(new_max_intervals, delta, + &new_max_intervals)) + return -EOVERFLOW; + } + } + +realloc_array: + if (WARN_ON_ONCE(nelems > new_max_intervals)) return -ENOMEM; + if (priv->array_next) { + if (max_intervals == new_max_intervals) + return 0; + + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) + return -ENOMEM; + } else { + array = nft_array_alloc(new_max_intervals); + if (!array) + return -ENOMEM; + + priv->array_next = array; + } + return 0; } @@ -630,7 +688,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_rbtree_maybe_reset_start_cookie(priv, tstamp); - if (nft_array_may_resize(set) < 0) + if (nft_array_may_resize(set, false) < 0) return -ENOMEM; do { @@ -741,7 +799,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, nft_rbtree_interval_null(set, this)) priv->start_rbe_cookie = 0; - if (nft_array_may_resize(set) < 0) + if (nft_array_may_resize(set, false) < 0) return NULL; while (parent != NULL) { @@ -811,7 +869,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, switch (iter->type) { case NFT_ITER_UPDATE_CLONE: - if (nft_array_may_resize(set) < 0) { + if (nft_array_may_resize(set, true) < 0) { iter->err = -ENOMEM; break; } -- cgit v1.2.3 From 9c42bc9db90a154bc61ae337a070465f3393485a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 14:11:02 +0100 Subject: netfilter: nf_conntrack_expect: honor expectation helper field The expectation helper field is mostly unused. As a result, the netfilter codebase relies on accessing the helper through exp->master. Always set on the expectation helper field so it can be used to reach the helper. 
nf_ct_expect_init() is called from the packet path, where the skb owns the ct object; therefore accessing exp->master for the newly created expectation is safe. This avoids having to update every callsite to pass the ct object as a parameter to nf_ct_expect_init(). This is a preparatory patch for follow-up fixes. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 2 +- net/netfilter/nf_conntrack_broadcast.c | 2 +- net/netfilter/nf_conntrack_expect.c | 14 +++++++++++++- net/netfilter/nf_conntrack_h323_main.c | 12 ++++++------ net/netfilter/nf_conntrack_helper.c | 7 ++++++- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nf_conntrack_sip.c | 2 +- 7 files changed, 29 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 165e7a03b8e9..1b01400b10bd 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -40,7 +40,7 @@ struct nf_conntrack_expect { struct nf_conntrack_expect *this); /* Helper to assign to new connection */ - struct nf_conntrack_helper *helper; + struct nf_conntrack_helper __rcu *helper; /* The conntrack of the master connection */ struct nf_conn *master; diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index a7552a46d6ac..1964c596c646 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -70,7 +70,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->expectfn = NULL; exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; - exp->helper = NULL; + rcu_assign_pointer(exp->helper, helper); nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index cfc2daa3fc7f..841e316240da 100644 --- a/net/netfilter/nf_conntrack_expect.c +++
b/net/netfilter/nf_conntrack_expect.c @@ -309,12 +309,19 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); +/* This function can only be used from packet path, where accessing + * master's helper is safe, because the packet holds a reference on + * the conntrack object. Never use it from control plane. + */ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { + struct nf_conntrack_helper *helper = NULL; + struct nf_conn *ct = exp->master; + struct nf_conn_help *help; int len; if (family == AF_INET) @@ -325,7 +332,12 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, exp->flags = 0; exp->class = class; exp->expectfn = NULL; - exp->helper = NULL; + + help = nfct_help(ct); + if (help) + helper = rcu_dereference(help->helper); + + rcu_assign_pointer(exp->helper, helper); exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index a2a0e22ccee1..3f5c50455b71 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - exp->helper = &nf_conntrack_helper_h245; + rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - exp->helper = nf_conntrack_helper_q931; + rcu_assign_pointer(exp->helper, 
nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - exp->helper = nf_conntrack_helper_q931; + rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - exp->helper = nf_conntrack_helper_ras; + rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - exp->helper = nf_conntrack_helper_q931; + rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - exp->helper = nf_conntrack_helper_q931; + rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ceb48c3ca0a4..294a6ffcbccd 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -399,7 +399,7 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) const struct 
nf_conntrack_helper *me = data; const struct nf_conntrack_helper *this; - if (exp->helper == me) + if (rcu_access_pointer(exp->helper) == me) return true; this = rcu_dereference_protected(help->helper, @@ -421,6 +421,11 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); + + /* nf_ct_iterate_destroy() does an unconditional synchronize_rcu() as + * last step, this ensures rcu readers of exp->helper are done. + * No need for another synchronize_rcu() here. + */ } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c156574e1273..a42d14290786 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3573,7 +3573,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->class = class; exp->master = ct; - exp->helper = helper; + rcu_assign_pointer(exp->helper, helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 4ab5ef71d96d..106b2f419e19 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1297,7 +1297,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - exp->helper = helper; + rcu_assign_pointer(exp->helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); -- cgit v1.2.3 From f01794106042ee27e54af6fdf5b319a2fe3df94d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 14:11:03 +0100 Subject: netfilter: nf_conntrack_expect: use expect->helper Use expect->helper in ctnetlink and /proc to 
dump the helper name. Using nfct_help() without holding a reference to the master conntrack is unsafe. Use exp->master->helper in ctnetlink path if userspace does not provide an explicit helper when creating an expectation to retain the existing behaviour. The ctnetlink expectation path holds the reference on the master conntrack and nf_conntrack_expect lock and the nfnetlink glue path refers to the master ct that is attached to the skb. Reported-by: Hyunwoo Kim Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_helper.c | 6 +----- net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++-------------- net/netfilter/nf_conntrack_sip.c | 2 +- 4 files changed, 13 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 841e316240da..64977db12b1d 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -666,7 +666,7 @@ static int exp_seq_show(struct seq_file *s, void *v) if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); - helper = rcu_dereference(nfct_help(expect->master)->helper); + helper = rcu_dereference(expect->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? 
" " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 294a6ffcbccd..1b330ba6613b 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -395,14 +395,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { - struct nf_conn_help *help = nfct_help(exp->master); const struct nf_conntrack_helper *me = data; const struct nf_conntrack_helper *this; - if (rcu_access_pointer(exp->helper) == me) - return true; - - this = rcu_dereference_protected(help->helper, + this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a42d14290786..8477c3736432 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3012,7 +3012,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, { struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; - struct nf_conn_help *help; + struct nf_conntrack_helper *helper; #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *nest_parms; struct nf_conntrack_tuple nat_tuple = {}; @@ -3057,15 +3057,12 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; - help = nfct_help(master); - if (help) { - struct nf_conntrack_helper *helper; - helper = rcu_dereference(help->helper); - if (helper && - nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) - goto nla_put_failure; - } + helper = rcu_dereference(exp->helper); + if (helper && + nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) + goto nla_put_failure; + expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); if (expfn != 
NULL && nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) @@ -3394,12 +3391,9 @@ static int ctnetlink_get_expect(struct sk_buff *skb, static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) { struct nf_conntrack_helper *helper; - const struct nf_conn_help *m_help; const char *name = data; - m_help = nfct_help(exp->master); - - helper = rcu_dereference(m_help->helper); + helper = rcu_dereference(exp->helper); if (!helper) return false; @@ -3534,9 +3528,9 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { - u_int32_t class = 0; struct nf_conntrack_expect *exp; struct nf_conn_help *help; + u32 class = 0; int err; help = nfct_help(ct); @@ -3573,6 +3567,8 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->class = class; exp->master = ct; + if (!helper) + helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 106b2f419e19..20e57cf5c83a 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -924,7 +924,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); if (!exp || exp->master == ct || - nfct_help(exp->master)->helper != nfct_help(ct)->helper || + exp->helper != nfct_help(ct)->helper || exp->class != class) break; #if IS_ENABLED(CONFIG_NF_NAT) -- cgit v1.2.3 From bffcaad9afdfe45d7fc777397d3b83c1e3ebffe5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 14:11:04 +0100 Subject: netfilter: ctnetlink: ensure safe access to master conntrack Holding reference on the expectation is not sufficient, the master conntrack object can just go away, making exp->master invalid. 
To access exp->master safely: - Grab the nf_conntrack_expect_lock; this serializes with clean_from_lists(), which also holds this lock when the master conntrack goes away. - Hold a reference on the master conntrack via nf_conntrack_find_get(). This is not so easy, since the master tuple needed to look up the master conntrack is not available in the existing problematic paths. For simplicity, this patch extends the nf_conntrack_expect_lock section to address this issue; in the cases described below, this only slightly extends the lock section. The add expectation command already holds a reference to the master conntrack from ctnetlink_create_expect(). However, the delete expectation command needs to grab the spinlock before looking up the expectation. Expand the existing spinlock section to cover the expectation lookup. Note that the nf_ct_expect_iterate_net() calls already grab the spinlock while iterating over the expectation table, which is correct. The get expectation command needs to grab the spinlock to ensure the master conntrack does not go away. This also expands the existing spinlock section to cover the expectation lookup. I needed to move the netlink skb allocation out of the spinlock to keep it GFP_KERNEL. For the expectation events, the IPEXP_DESTROY event is already delivered under the spinlock; just move the delivery of IPEXP_NEW under the spinlock too, because the master conntrack event cache is reached through exp->master. While at it, add lockdep annotations to help identify which codepaths need to grab the spinlock.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 5 +++++ net/netfilter/nf_conntrack_ecache.c | 2 ++ net/netfilter/nf_conntrack_expect.c | 10 +++++++++- net/netfilter/nf_conntrack_netlink.c | 28 +++++++++++++++++++--------- 4 files changed, 35 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3384859a8921..8883575adcc1 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; +static inline void lockdep_nfct_expect_lock_held(void) +{ + lockdep_assert_held(&nf_conntrack_expect_lock); +} + /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 81baf2082604..9df159448b89 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -247,6 +247,8 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; + lockdep_nfct_expect_lock_held(); + rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 64977db12b1d..1cbe5f1108c2 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -51,6 +51,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; + lockdep_nfct_expect_lock_held(); WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); @@ -118,6 +119,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, bool 
nf_ct_remove_expect(struct nf_conntrack_expect *exp) { + lockdep_nfct_expect_lock_held(); + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); @@ -177,6 +180,8 @@ nf_ct_find_expectation(struct net *net, struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; + lockdep_nfct_expect_lock_held(); + if (!cnet->expect_count) return NULL; @@ -454,6 +459,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int h; int ret = 0; + lockdep_nfct_expect_lock_held(); + if (!master_help) { ret = -ESHUTDOWN; goto out; @@ -510,8 +517,9 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, nf_ct_expect_insert(expect); - spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); + spin_unlock_bh(&nf_conntrack_expect_lock); + return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8477c3736432..89540112d165 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3355,31 +3355,37 @@ static int ctnetlink_get_expect(struct sk_buff *skb, if (err < 0) return err; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_expect_find_get(info->net, &zone, &tuple); - if (!exp) + if (!exp) { + spin_unlock_bh(&nf_conntrack_expect_lock); + kfree_skb(skb2); return -ENOENT; + } if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); + spin_unlock_bh(&nf_conntrack_expect_lock); + kfree_skb(skb2); return -ENOENT; } } - skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb2) { - nf_ct_expect_put(exp); - return -ENOMEM; - } - rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); 
nf_ct_expect_put(exp); + spin_unlock_bh(&nf_conntrack_expect_lock); + if (err <= 0) { kfree_skb(skb2); return -ENOMEM; @@ -3426,22 +3432,26 @@ static int ctnetlink_del_expect(struct sk_buff *skb, if (err < 0) return err; + spin_lock_bh(&nf_conntrack_expect_lock); + /* bump usage count to 2 */ exp = nf_ct_expect_find_get(info->net, &zone, &tuple); - if (!exp) + if (!exp) { + spin_unlock_bh(&nf_conntrack_expect_lock); return -ENOENT; + } if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); + spin_unlock_bh(&nf_conntrack_expect_lock); return -ENOENT; } } /* after list removal, usage count == 1 */ - spin_lock_bh(&nf_conntrack_expect_lock); if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); -- cgit v1.2.3 From 02a3231b6d82efe750da6554ebf280e4a6f78756 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 22:39:55 +0100 Subject: netfilter: nf_conntrack_expect: store netns and zone in expectation __nf_ct_expect_find() and nf_ct_expect_find_get() are called under rcu_read_lock() but they dereference the master conntrack via exp->master. Since the expectation does not hold a reference on the master conntrack, this could be dying conntrack or different recycled conntrack than the real master due to SLAB_TYPESAFE_RCU. Store the netns, the master_tuple and the zone in struct nf_conntrack_expect as a safety measure. This patch is required by the follow up fix not to dump expectations that do not belong to this netns. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 18 +++++++++++++++++- net/netfilter/nf_conntrack_broadcast.c | 6 +++++- net/netfilter/nf_conntrack_expect.c | 9 +++++++-- net/netfilter/nf_conntrack_netlink.c | 5 +++++ 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 1b01400b10bd..e9a8350e7ccf 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -22,10 +22,16 @@ struct nf_conntrack_expect { /* Hash member */ struct hlist_node hnode; + /* Network namespace */ + possible_net_t net; + /* We expect this tuple, with the following mask */ struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone zone; +#endif /* Usage count. */ refcount_t use; @@ -62,7 +68,17 @@ struct nf_conntrack_expect { static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { - return nf_ct_net(exp->master); + return read_pnet(&exp->net); +} + +static inline bool nf_ct_exp_zone_equal_any(const struct nf_conntrack_expect *a, + const struct nf_conntrack_zone *b) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + return a->zone.id == b->id; +#else + return true; +#endif } #define NF_CT_EXP_POLICY_NAME_LEN 16 diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 1964c596c646..4f39bf7c843f 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -21,6 +21,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, unsigned int timeout) { const struct nf_conntrack_helper *helper; + struct net *net = read_pnet(&ct->ct_net); struct nf_conntrack_expect *exp; struct iphdr *iph = ip_hdr(skb); struct rtable *rt = skb_rtable(skb); @@ -71,7 +72,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, 
exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); - + write_pnet(&exp->net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + exp->zone = ct->zone; +#endif nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 1cbe5f1108c2..db28801b1688 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -113,8 +113,8 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - net_eq(net, nf_ct_net(i->master)) && - nf_ct_zone_equal_any(i->master, zone); + net_eq(net, read_pnet(&i->net)) && + nf_ct_exp_zone_equal_any(i, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) @@ -326,6 +326,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, { struct nf_conntrack_helper *helper = NULL; struct nf_conn *ct = exp->master; + struct net *net = read_pnet(&ct->ct_net); struct nf_conn_help *help; int len; @@ -343,6 +344,10 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + write_pnet(&exp->net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + exp->zone = ct->zone; +#endif exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 89540112d165..6e6aeb0ab0a1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3538,6 +3538,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { + struct net *net = read_pnet(&ct->ct_net); struct nf_conntrack_expect *exp; struct nf_conn_help *help; u32 class = 0; @@ -3577,6 +3578,10 @@ 
ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->class = class; exp->master = ct; + write_pnet(&exp->net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + exp->zone = ct->zone; +#endif if (!helper) helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); -- cgit v1.2.3 From 3db5647984de03d9cae0dcddb509b058351f0ee4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Mar 2026 14:11:06 +0100 Subject: netfilter: nf_conntrack_expect: skip expectations in other netns via proc Skip expectations that do not reside in this netns. Similar to e77e6ff502ea ("netfilter: conntrack: do not dump other netns's conntrack entries via proc"). Fixes: 9b03f38d0487 ("netfilter: netns nf_conntrack: per-netns expectations") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index db28801b1688..24d0576d84b7 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -652,11 +652,15 @@ static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; + struct net *net = seq_file_net(s); struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); + if (!net_eq(nf_ct_exp_net(expect), net)) + return 0; + if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? 
(long)(expect->timeout.expires - jiffies)/HZ : 0); -- cgit v1.2.3 From 6a2b724460cb67caed500c508c2ae5cf012e4db4 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 25 Mar 2026 14:11:07 +0100 Subject: netfilter: nf_conntrack_sip: fix use of uninitialized rtp_addr in process_sdp process_sdp() declares union nf_inet_addr rtp_addr on the stack and passes it to the nf_nat_sip sdp_session hook after walking the SDP media descriptions. However rtp_addr is only initialized inside the media loop when a recognized media type with a non-zero port is found. If the SDP body contains no m= lines, only inactive media sections (m=audio 0 ...) or only unrecognized media types, rtp_addr is never assigned. Despite that, the function still calls hooks->sdp_session() with &rtp_addr, causing nf_nat_sdp_session() to format the stale stack value as an IP address and rewrite the SDP session owner and connection lines with it. With CONFIG_INIT_STACK_ALL_ZERO (default on most distributions) this results in the session-level o= and c= addresses being rewritten to 0.0.0.0 for inactive SDP sessions. Without stack auto-init the rewritten address is whatever happened to be on the stack. Fix this by pre-initializing rtp_addr from the session-level connection address (caddr) when available, and tracking via a have_rtp_addr flag whether any valid address was established. Skip the sdp_session hook entirely when no valid address exists. 
Fixes: 4ab9e64e5e3c ("[NETFILTER]: nf_nat_sip: split up SDP mangling") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 20e57cf5c83a..939502ff7c87 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1040,6 +1040,7 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, unsigned int port; const struct sdp_media_type *t; int ret = NF_ACCEPT; + bool have_rtp_addr = false; hooks = rcu_dereference(nf_nat_sip_hooks); @@ -1056,8 +1057,11 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, caddr_len = 0; if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, SDP_HDR_CONNECTION, SDP_HDR_MEDIA, - &matchoff, &matchlen, &caddr) > 0) + &matchoff, &matchlen, &caddr) > 0) { caddr_len = matchlen; + memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); + have_rtp_addr = true; + } mediaoff = sdpoff; for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) { @@ -1091,9 +1095,11 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, &matchoff, &matchlen, &maddr) > 0) { maddr_len = matchlen; memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); - } else if (caddr_len) + have_rtp_addr = true; + } else if (caddr_len) { memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); - else { + have_rtp_addr = true; + } else { nf_ct_helper_log(skb, ct, "cannot parse SDP message"); return NF_DROP; } @@ -1125,7 +1131,7 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, /* Update session connection and owner addresses */ hooks = rcu_dereference(nf_nat_sip_hooks); - if (hooks && ct->status & IPS_NAT_MASK) + if (hooks && ct->status & IPS_NAT_MASK && have_rtp_addr) ret = hooks->sdp_session(skb, protoff, dataoff, dptr, datalen, sdpoff, &rtp_addr); -- cgit v1.2.3 From 
8f15b5071b4548b0aafc03b366eb45c9c6566704 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 25 Mar 2026 14:11:08 +0100 Subject: netfilter: ctnetlink: use netlink policy range checks Replace manual range and mask validations with netlink policy annotations in ctnetlink code paths, so that the netlink core rejects invalid values early and can generate extack errors. - CTA_PROTOINFO_TCP_STATE: reject values > TCP_CONNTRACK_SYN_SENT2 at policy level, removing the manual >= TCP_CONNTRACK_MAX check. - CTA_PROTOINFO_TCP_WSCALE_ORIGINAL/REPLY: reject values > TCP_MAX_WSCALE (14). The normal TCP option parsing path already clamps to this value, but the ctnetlink path accepted 0-255, causing undefined behavior when used as a u32 shift count. - CTA_FILTER_ORIG_FLAGS/REPLY_FLAGS: use NLA_POLICY_MASK with CTA_FILTER_F_ALL, removing the manual mask checks. - CTA_EXPECT_FLAGS: use NLA_POLICY_MASK with NF_CT_EXPECT_MASK, adding a new mask define grouping all valid expect flags. Extracted from a broader nf-next patch by Florian Westphal, scoped to ctnetlink for the fixes tree. 
Fixes: c8e2078cfe41 ("[NETFILTER]: ctnetlink: add support for internal tcp connection tracking flags handling") Signed-off-by: David Carlier Co-developed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 4 ++++ net/netfilter/nf_conntrack_netlink.c | 16 +++++----------- net/netfilter/nf_conntrack_proto_tcp.c | 10 +++------- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 26071021e986..56b6b60a814f 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -159,5 +159,9 @@ enum ip_conntrack_expect_events { #define NF_CT_EXPECT_INACTIVE 0x2 #define NF_CT_EXPECT_USERSPACE 0x4 +#ifdef __KERNEL__ +#define NF_CT_EXPECT_MASK (NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE | \ + NF_CT_EXPECT_USERSPACE) +#endif #endif /* _UAPI_NF_CONNTRACK_COMMON_H */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6e6aeb0ab0a1..3f408f3713bb 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -910,8 +910,8 @@ struct ctnetlink_filter { }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { - [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, - [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_ORIG_FLAGS] = NLA_POLICY_MASK(NLA_U32, CTA_FILTER_F_ALL), + [CTA_FILTER_REPLY_FLAGS] = NLA_POLICY_MASK(NLA_U32, CTA_FILTER_F_ALL), }; static int ctnetlink_parse_filter(const struct nlattr *attr, @@ -925,17 +925,11 @@ static int ctnetlink_parse_filter(const struct nlattr *attr, if (ret) return ret; - if (tb[CTA_FILTER_ORIG_FLAGS]) { + if (tb[CTA_FILTER_ORIG_FLAGS]) filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); - if (filter->orig_flags & ~CTA_FILTER_F_ALL) - return -EOPNOTSUPP; - } - if 
(tb[CTA_FILTER_REPLY_FLAGS]) { + if (tb[CTA_FILTER_REPLY_FLAGS]) filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); - if (filter->reply_flags & ~CTA_FILTER_F_ALL) - return -EOPNOTSUPP; - } return 0; } @@ -2634,7 +2628,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, - [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_CT_EXPECT_MASK), [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 0c1d086e96cb..b67426c2189b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1385,9 +1385,9 @@ nla_put_failure: } static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { - [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, - [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_STATE] = NLA_POLICY_MAX(NLA_U8, TCP_CONNTRACK_SYN_SENT2), + [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = NLA_POLICY_MAX(NLA_U8, TCP_MAX_WSCALE), + [CTA_PROTOINFO_TCP_WSCALE_REPLY] = NLA_POLICY_MAX(NLA_U8, TCP_MAX_WSCALE), [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; @@ -1414,10 +1414,6 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) if (err < 0) return err; - if (tb[CTA_PROTOINFO_TCP_STATE] && - nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) - return -EINVAL; - spin_lock_bh(&ct->lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); -- cgit v1.2.3 From 629ec78ef8608d955ce217880cdc3e1873af3a15 Mon 
Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 24 Mar 2026 00:25:57 +0100 Subject: mpls: add seqcount to protect the platform_label{,s} pair The RCU-protected codepaths (mpls_forward, mpls_dump_routes) can have an inconsistent view of platform_labels vs platform_label in case of a concurrent resize (resize_platform_label_table, under platform_mutex). This can lead to OOB accesses. This patch adds a seqcount, so that we get a consistent snapshot. Note that mpls_label_ok is also susceptible to this, so the check against RTA_DST in rtm_to_route_config, done outside platform_mutex, is not sufficient. This value gets passed to mpls_label_ok once more in both mpls_route_add and mpls_route_del, so there is no issue, but that additional check must not be removed. Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Fixes: 7720c01f3f590 ("mpls: Add a sysctl to control the size of the mpls label table") Fixes: dde1b38e873c ("mpls: Convert mpls_dump_routes() to RCU.") Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/cd8fca15e3eb7e212b094064cd83652e20fd9d31.1774284088.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- include/net/netns/mpls.h | 1 + net/mpls/af_mpls.c | 29 +++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index 6682e51513ef..2073cbac2afb 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -17,6 +17,7 @@ struct netns_mpls { size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; struct mutex platform_mutex; + seqcount_mutex_t platform_label_seq; struct ctl_table_header *ctl; }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d5417688f69e..18d3da8ab384 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -83,14 +83,30 @@ static struct mpls_route *mpls_route_input(struct net *net, unsigned int index) return mpls_dereference(net, 
platform_label[index]); } +static struct mpls_route __rcu **mpls_platform_label_rcu(struct net *net, size_t *platform_labels) +{ + struct mpls_route __rcu **platform_label; + unsigned int sequence; + + do { + sequence = read_seqcount_begin(&net->mpls.platform_label_seq); + platform_label = rcu_dereference(net->mpls.platform_label); + *platform_labels = net->mpls.platform_labels; + } while (read_seqcount_retry(&net->mpls.platform_label_seq, sequence)); + + return platform_label; +} + static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index) { struct mpls_route __rcu **platform_label; + size_t platform_labels; + + platform_label = mpls_platform_label_rcu(net, &platform_labels); - if (index >= net->mpls.platform_labels) + if (index >= platform_labels) return NULL; - platform_label = rcu_dereference(net->mpls.platform_label); return rcu_dereference(platform_label[index]); } @@ -2240,8 +2256,7 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; - platform_label = rcu_dereference(net->mpls.platform_label); - platform_labels = net->mpls.platform_labels; + platform_label = mpls_platform_label_rcu(net, &platform_labels); if (filter.filter_set) flags |= NLM_F_DUMP_FILTERED; @@ -2645,8 +2660,12 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* Update the global pointers */ + local_bh_disable(); + write_seqcount_begin(&net->mpls.platform_label_seq); net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); + write_seqcount_end(&net->mpls.platform_label_seq); + local_bh_enable(); mutex_unlock(&net->mpls.platform_mutex); @@ -2728,6 +2747,8 @@ static __net_init int mpls_net_init(struct net *net) int i; mutex_init(&net->mpls.platform_mutex); + seqcount_mutex_init(&net->mpls.platform_label_seq, &net->mpls.platform_mutex); + net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; 
net->mpls.ip_ttl_propagate = 1; -- cgit v1.2.3 From 2428083101f6883f979cceffa76cd8440751ffe6 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 24 Mar 2026 16:06:44 +0800 Subject: net: qrtr: replace qrtr_tx_flow radix_tree with xarray to fix memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __radix_tree_create() allocates and links intermediate nodes into the tree one by one. If a subsequent allocation fails, the already-linked nodes remain in the tree with no corresponding leaf entry. These orphaned internal nodes are never reclaimed because radix_tree_for_each_slot() only visits slots containing leaf values. The radix_tree API is deprecated in favor of xarray. As suggested by Matthew Wilcox, migrate qrtr_tx_flow from radix_tree to xarray instead of fixing the radix_tree itself [1]. xarray properly handles cleanup of internal nodes — xa_destroy() frees all internal xarray nodes when the qrtr_node is released, preventing the leak. [1] https://lore.kernel.org/all/20260225071623.41275-1-jiayuan.chen@linux.dev/T/ Reported-by: syzbot+006987d1be3586e13555@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000bfba3a060bf4ffcf@google.com/T/ Fixes: 5fdeb0d372ab ("net: qrtr: Implement outgoing flow control") Signed-off-by: Jiayuan Chen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260324080645.290197-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/qrtr/af_qrtr.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 55fd2dd37588..d77e9c8212da 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -118,7 +118,7 @@ static DEFINE_XARRAY_ALLOC(qrtr_ports); * @ep: endpoint * @ref: reference count for node * @nid: node id - * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port + * @qrtr_tx_flow: xarray of qrtr_tx_flow, keyed by node << 32 | port * 
@qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @item: list item for broadcast list @@ -129,7 +129,7 @@ struct qrtr_node { struct kref ref; unsigned int nid; - struct radix_tree_root qrtr_tx_flow; + struct xarray qrtr_tx_flow; struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; @@ -172,6 +172,7 @@ static void __qrtr_node_release(struct kref *kref) struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; + unsigned long index; spin_lock_irqsave(&qrtr_nodes_lock, flags); /* If the node is a bridge for other nodes, there are possibly @@ -189,11 +190,9 @@ static void __qrtr_node_release(struct kref *kref) skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ - radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { - flow = *slot; - radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); + xa_for_each(&node->qrtr_tx_flow, index, flow) kfree(flow); - } + xa_destroy(&node->qrtr_tx_flow); kfree(node); } @@ -228,9 +227,7 @@ static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) key = remote_node << 32 | remote_port; - rcu_read_lock(); - flow = radix_tree_lookup(&node->qrtr_tx_flow, key); - rcu_read_unlock(); + flow = xa_load(&node->qrtr_tx_flow, key); if (flow) { spin_lock(&flow->resume_tx.lock); flow->pending = 0; @@ -269,12 +266,13 @@ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, return 0; mutex_lock(&node->qrtr_tx_lock); - flow = radix_tree_lookup(&node->qrtr_tx_flow, key); + flow = xa_load(&node->qrtr_tx_flow, key); if (!flow) { flow = kzalloc_obj(*flow); if (flow) { init_waitqueue_head(&flow->resume_tx); - if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { + if (xa_err(xa_store(&node->qrtr_tx_flow, key, flow, + GFP_KERNEL))) { kfree(flow); flow = NULL; } @@ -326,9 +324,7 @@ static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; - 
rcu_read_lock(); - flow = radix_tree_lookup(&node->qrtr_tx_flow, key); - rcu_read_unlock(); + flow = xa_load(&node->qrtr_tx_flow, key); if (flow) { spin_lock_irq(&flow->resume_tx.lock); flow->tx_failed = 1; @@ -599,7 +595,7 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) node->nid = QRTR_EP_NID_AUTO; node->ep = ep; - INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); + xa_init(&node->qrtr_tx_flow); mutex_init(&node->qrtr_tx_lock); qrtr_node_assign(node, nid); @@ -627,6 +623,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) struct qrtr_tx_flow *flow; struct sk_buff *skb; unsigned long flags; + unsigned long index; void __rcu **slot; mutex_lock(&node->ep_lock); @@ -649,10 +646,8 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); - radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { - flow = *slot; + xa_for_each(&node->qrtr_tx_flow, index, flow) wake_up_interruptible_all(&flow->resume_tx); - } mutex_unlock(&node->qrtr_tx_lock); qrtr_node_release(node); -- cgit v1.2.3 From ae05340ccaa9d347fe85415609e075545bec589f Mon Sep 17 00:00:00 2001 From: Yochai Eisenrich Date: Wed, 25 Mar 2026 00:49:25 +0200 Subject: net: ipv6: ndisc: fix ndisc_ra_useropt to initialize nduseropt_padX fields to zero to prevent an info-leak When processing Router Advertisements with user options the kernel builds an RTM_NEWNDUSEROPT netlink message. The nduseroptmsg struct has three padding fields that are never zeroed and can leak kernel data. The fix is simple: just zero the padding fields. 
Fixes: 31910575a9de ("[IPv6]: Export userland ND options through netlink (RDNSS support)") Signed-off-by: Yochai Eisenrich Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260324224925.2437775-1-echelonh@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f6a5d8c73af9..186e60c79214 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1209,6 +1209,9 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; + ndmsg->nduseropt_pad1 = 0; + ndmsg->nduseropt_pad2 = 0; + ndmsg->nduseropt_pad3 = 0; memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); -- cgit v1.2.3 From 5e67ba9bb531e1ec6599a82a065dea9040b9ce50 Mon Sep 17 00:00:00 2001 From: Pengpeng Hou Date: Wed, 25 Mar 2026 15:41:52 +0800 Subject: net/ipv6: ioam6: prevent schema length wraparound in trace fill ioam6_fill_trace_data() stores the schema contribution to the trace length in a u8. With bit 22 enabled and the largest schema payload, sclen becomes 1 + 1020 / 4, wraps from 256 to 0, and bypasses the remaining-space check. __ioam6_fill_trace_data() then positions the write cursor without reserving the schema area but still copies the 4-byte schema header and the full schema payload, overrunning the trace buffer. Keep sclen in an unsigned int so the remaining-space check and the write cursor calculation both see the full schema length. Fixes: 8c6f6fa67726 ("ipv6: ioam: IOAM Generic Netlink API") Signed-off-by: Pengpeng Hou Reviewed-by: Justin Iurman Signed-off-by: David S. 
Miller --- net/ipv6/ioam6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index b76f89d92e7b..3978773bec42 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -708,7 +708,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, struct ioam6_schema *sc, - u8 sclen, bool is_input) + unsigned int sclen, bool is_input) { struct net_device *dev = skb_dst_dev(skb); struct timespec64 ts; @@ -939,7 +939,7 @@ void ioam6_fill_trace_data(struct sk_buff *skb, bool is_input) { struct ioam6_schema *sc; - u8 sclen = 0; + unsigned int sclen = 0; /* Skip if Overflow flag is set */ -- cgit v1.2.3 From 86ab3e55673a7a49a841838776f1ab18d23a67b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Mar 2026 20:26:08 +0000 Subject: ipv6: icmp: clear skb2->cb[] in ip6_err_gen_icmpv6_unreach() Sashiko AI-review observed: In ip6_err_gen_icmpv6_unreach(), the skb is an outer IPv4 ICMP error packet where its cb contains an IPv4 inet_skb_parm. When skb is cloned into skb2 and passed to icmp6_send(), it uses IP6CB(skb2). IP6CB interprets the IPv4 inet_skb_parm as an inet6_skb_parm. The cipso offset in inet_skb_parm.opt directly overlaps with dsthao in inet6_skb_parm at offset 18. If an attacker sends a forged ICMPv4 error with a CIPSO IP option, dsthao would be a non-zero offset. Inside icmp6_send(), mip6_addr_swap() is called and uses ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO). This would scan the inner, attacker-controlled IPv6 packet starting at that offset, potentially returning a fake TLV without checking if the remaining packet length can hold the full 18-byte struct ipv6_destopt_hao. Could mip6_addr_swap() then perform a 16-byte swap that extends past the end of the packet data into skb_shared_info? Should the cb array also be cleared in ip6_err_gen_icmpv6_unreach() and ip6ip6_err() to prevent this? 
This patch implements the first suggestion. I am not sure if ip6ip6_err() needs to be changed. A separate patch would be better anyway. Fixes: ca15a078bd90 ("sit: generate icmpv6 error when receiving icmpv4 error") Reported-by: Ido Schimmel Closes: https://sashiko.dev/#/patchset/20260326155138.2429480-1-edumazet%40google.com Signed-off-by: Eric Dumazet Cc: Oskar Kjos Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260326202608.2976021-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 813d2e9edb8b..d5d23a9296ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -875,6 +875,9 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, if (!skb2) return 1; + /* Remove debris left by IPv4 stack. */ + memset(IP6CB(skb2), 0, sizeof(*IP6CB(skb2))); + skb_dst_drop(skb2); skb_pull(skb2, nhs); skb_reset_network_header(skb2); -- cgit v1.2.3 From 2edfa31769a4add828a7e604b21cb82aaaa05925 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Mar 2026 15:51:38 +0000 Subject: ip6_tunnel: clear skb2->cb[] in ip4ip6_err() Oskar Kjos reported the following problem. ip4ip6_err() calls icmp_send() on a cloned skb whose cb[] was written by the IPv6 receive path as struct inet6_skb_parm. icmp_send() passes IPCB(skb2) to __ip_options_echo(), which interprets that cb[] region as struct inet_skb_parm (IPv4). The layouts differ: inet6_skb_parm.nhoff at offset 14 overlaps inet_skb_parm.opt.rr, producing a non-zero rr value. __ip_options_echo() then reads optlen from attacker-controlled packet data at sptr[rr+1] and copies that many bytes into dopt->__data, a fixed 40-byte stack buffer (IP_OPTIONS_DATA_FIXED_SIZE). To fix this we clear skb2->cb[], as suggested by Oskar Kjos. Also add minimal IPv4 header validation (version == 4, ihl >= 5). 
Fixes: c4d3efafcc93 ("[IPV6] IP6TUNNEL: Add support to IPv4 over IPv6 tunnel.") Reported-by: Oskar Kjos Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260326155138.2429480-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_tunnel.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4c29aa94e86e..0b53488a9229 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -601,11 +601,16 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!skb2) return 0; + /* Remove debris left by IPv6 stack. */ + memset(IPCB(skb2), 0, sizeof(*IPCB(skb2))); + skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); + if (eiph->version != 4 || eiph->ihl < 5) + goto out; /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, -- cgit v1.2.3 From a01aee7cafc575bb82f5529e8734e7052f9b16ea Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 26 Mar 2026 03:44:39 +0000 Subject: bridge: br_nd_send: linearize skb before parsing ND options br_nd_send() parses neighbour discovery options from ns->opt[] and assumes that these options are in the linear part of request. Its callers only guarantee that the ICMPv6 header and target address are available, so the option area can still be non-linear. Parsing ns->opt[] in that case can access data past the linear buffer. Linearize request before option parsing and derive ns from the linear network header. 
Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Reported-by: Yifan Wu Reported-by: Juefei Pu Tested-by: Ao Zhou Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Yang Yang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260326034441.2037420-2-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/bridge/br_arp_nd_proxy.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 1e2b51769eec..af3d1e33f50b 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -251,12 +251,12 @@ struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg) static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, struct sk_buff *request, struct neighbour *n, - __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns) + __be16 vlan_proto, u16 vlan_tci) { struct net_device *dev = request->dev; struct net_bridge_vlan_group *vg; + struct nd_msg *na, *ns; struct sk_buff *reply; - struct nd_msg *na; struct ipv6hdr *pip6; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ int ns_olen; @@ -264,7 +264,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, u8 *daddr; u16 pvid; - if (!dev) + if (!dev || skb_linearize(request)) return; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + @@ -281,6 +281,8 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, skb_set_mac_header(reply, 0); daddr = eth_hdr(request)->h_source; + ns = (struct nd_msg *)(skb_network_header(request) + + sizeof(struct ipv6hdr)); /* Do we need option processing ? 
*/ ns_olen = request->len - (skb_network_offset(request) + @@ -472,9 +474,9 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (vid != 0) br_nd_send(br, p, skb, n, skb->vlan_proto, - skb_vlan_tag_get(skb), msg); + skb_vlan_tag_get(skb)); else - br_nd_send(br, p, skb, n, 0, 0, msg); + br_nd_send(br, p, skb, n, 0, 0); replied = true; } -- cgit v1.2.3 From 850837965af15707fd3142c1cf3c5bfaf022299b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 26 Mar 2026 03:44:40 +0000 Subject: bridge: br_nd_send: validate ND option lengths br_nd_send() walks ND options according to option-provided lengths. A malformed option can make the parser advance beyond the computed option span or use a too-short source LLADDR option payload. Validate option lengths against the remaining NS option area before advancing, and only read source LLADDR when the option is large enough for an Ethernet address. Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Cc: stable@vger.kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Tested-by: Ao Zhou Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Yang Yang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260326034441.2037420-3-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/bridge/br_arp_nd_proxy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index af3d1e33f50b..6b5595868a39 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -288,12 +288,14 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { - if (!ns->opt[i + 1]) { + if (!ns->opt[i + 1] || i + (ns->opt[i + 1] << 3) > ns_olen) { kfree_skb(reply); return; } if 
(ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { - daddr = ns->opt + i + sizeof(struct nd_opt_hdr); + if ((ns->opt[i + 1] << 3) >= + sizeof(struct nd_opt_hdr) + ETH_ALEN) + daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } -- cgit v1.2.3 From 4576100b8cd03118267513cafacde164b498b322 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Thu, 26 Mar 2026 13:43:09 -0700 Subject: net/sched: sch_hfsc: fix divide-by-zero in rtsc_min() m2sm() converts a u32 slope to a u64 scaled value. For large inputs (e.g. m1=4000000000), the result can reach 2^32. rtsc_min() stores the difference of two such u64 values in a u32 variable `dsm` and uses it as a divisor. When the difference is exactly 2^32 the truncation yields zero, causing a divide-by-zero oops in the concave-curve intersection path: Oops: divide error: 0000 RIP: 0010:rtsc_min (net/sched/sch_hfsc.c:601) Call Trace: init_ed (net/sched/sch_hfsc.c:629) hfsc_enqueue (net/sched/sch_hfsc.c:1569) [...] Widen `dsm` to u64 and replace do_div() with div64_u64() so the full difference is preserved. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260326204310.1549327-1-xmei5@asu.edu Signed-off-by: Jakub Kicinski --- net/sched/sch_hfsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b5657ffbbf84..83b2ca2e37fc 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -555,7 +555,7 @@ static void rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) { u64 y1, y2, dx, dy; - u32 dsm; + u64 dsm; if (isc->sm1 <= isc->sm2) { /* service curve is convex */ @@ -598,7 +598,7 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) */ dx = (y1 - y) << SM_SHIFT; dsm = isc->sm1 - isc->sm2; - do_div(dx, dsm); + dx = div64_u64(dx, dsm); /* * check if (x, y1) belongs to the 1st segment of rtsc. * if so, add the offset. 
-- cgit v1.2.3 From fd63f185979b047fb22a0dfc6bd94d0cab6a6a70 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Mar 2026 10:52:57 +0100 Subject: ipv6: prevent possible UaF in addrconf_permanent_addr() The mentioned helper tries to warn the user about an exceptional condition, but the message is delivered too late, accessing the ipv6 address after its possible deletion. Reorder the statements to avoid the possible UaF; while at it, place the warning outside the idev->lock as it needs no protection. Reported-by: Jakub Kicinski Closes: https://sashiko.dev/#/patchset/8c8bfe2e1a324e501f0e15fef404a77443fd8caf.1774365668.git.pabeni%40redhat.com Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ef973c3a8cb4f8f1787ed469f3e5391b9fe95aa0.1774601542.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f4e23b543585..dd0b4d80e0f8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3625,12 +3625,12 @@ static void addrconf_permanent_addr(struct net *net, struct net_device *dev) if ((ifp->flags & IFA_F_PERMANENT) && fixup_permanent_addr(net, idev, ifp) < 0) { write_unlock_bh(&idev->lock); - in6_ifa_hold(ifp); - ipv6_del_addr(ifp); - write_lock_bh(&idev->lock); net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", idev->dev->name, &ifp->addr); + in6_ifa_hold(ifp); + ipv6_del_addr(ifp); + write_lock_bh(&idev->lock); } } -- cgit v1.2.3 From ddc748a391dd8642ba6b2e4fe22e7f2ddf84b7f0 Mon Sep 17 00:00:00 2001 From: Guoyu Su Date: Fri, 27 Mar 2026 23:35:07 +0800 Subject: net: use skb_header_pointer() for TCPv4 GSO frag_off check Syzbot reported a KMSAN uninit-value warning in gso_features_check() called from netif_skb_features() [1]. 
gso_features_check() reads iph->frag_off to decide whether to clear mangleid_features. Accessing the IPv4 header via ip_hdr()/inner_ip_hdr() can rely on skb header offsets that are not always safe for direct dereference on packets injected from PF_PACKET paths. Use skb_header_pointer() for the TCPv4 frag_off check so the header read is robust whether data is already linear or needs copying. [1] https://syzkaller.appspot.com/bug?extid=1543a7d954d9c6d00407 Link: https://lore.kernel.org/netdev/willemdebruijn.kernel.1a9f35039caab@gmail.com/ Fixes: cbc53e08a793 ("GSO: Add GSO type for fixed IPv4 ID") Reported-by: syzbot+1543a7d954d9c6d00407@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1543a7d954d9c6d00407 Tested-by: syzbot+1543a7d954d9c6d00407@syzkaller.appspotmail.com Signed-off-by: Guoyu Su Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260327153507.39742-1-yss2813483011xxl@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fc5557062414..831129f2a69b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3821,10 +3821,15 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { - struct iphdr *iph = skb->encapsulation ? - inner_ip_hdr(skb) : ip_hdr(skb); + const struct iphdr *iph; + struct iphdr _iph; + int nhoff = skb->encapsulation ? 
+ skb_inner_network_offset(skb) : + skb_network_offset(skb); - if (!(iph->frag_off & htons(IP_DF))) + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + + if (!iph || !(iph->frag_off & htons(IP_DF))) features &= ~dev->mangleid_features; } -- cgit v1.2.3 From e6e3eb5ee89ac4c163d46429391c889a1bb5e404 Mon Sep 17 00:00:00 2001 From: Yochai Eisenrich Date: Sun, 29 Mar 2026 00:14:36 +0300 Subject: net: sched: cls_api: fix tc_chain_fill_node to initialize tcm_info to zero to prevent an info-leak When building netlink messages, tc_chain_fill_node() never initializes the tcm_info field of struct tcmsg. Since the allocation is not zeroed, kernel heap memory is leaked to userspace through this 4-byte field. The fix simply zeroes tcm_info alongside the other fields that are already initialized. Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Signed-off-by: Yochai Eisenrich Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260328211436.1010152-1-echelonh@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4829c27446e3..20f7f9ee0b35 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2969,6 +2969,7 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_handle = 0; + tcm->tcm_info = 0; if (block->q) { tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; tcm->tcm_parent = block->q->handle; -- cgit v1.2.3 From fa6e24963342de4370e3a3c9af41e38277b74cf3 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Fri, 27 Mar 2026 23:30:00 -0700 Subject: bridge: mrp: reject zero test interval to avoid OOM panic br_mrp_start_test() and br_mrp_start_in_test() accept the user-supplied interval value from netlink without validation. 
When interval is 0, usecs_to_jiffies(0) yields 0, causing the delayed work (br_mrp_test_work_expired / br_mrp_in_test_work_expired) to reschedule itself with zero delay. This creates a tight loop on system_percpu_wq that allocates and transmits MRP test frames at maximum rate, exhausting all system memory and causing a kernel panic via OOM deadlock. The same zero-interval issue applies to br_mrp_start_in_test_parse() for interconnect test frames. Use NLA_POLICY_MIN(NLA_U32, 1) in the nla_policy tables for both IFLA_BRIDGE_MRP_START_TEST_INTERVAL and IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL, so zero is rejected at the netlink attribute parsing layer before the value ever reaches the workqueue scheduling code. This is consistent with how other bridge subsystems (br_fdb, br_mst) enforce range constraints on netlink attributes. Fixes: 20f6a05ef635 ("bridge: mrp: Rework the MRP netlink interface") Fixes: 7ab1748e4ce6 ("bridge: mrp: Extend MRP netlink interface for configuring MRP interconnect") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260328063000.1845376-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/bridge/br_mrp_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c index ce6f63c77cc0..86f0e75d6e34 100644 --- a/net/bridge/br_mrp_netlink.c +++ b/net/bridge/br_mrp_netlink.c @@ -196,7 +196,7 @@ static const struct nla_policy br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = NLA_POLICY_MIN(NLA_U32, 1), [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, 
[IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, @@ -316,7 +316,7 @@ static const struct nla_policy br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = { [IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC] = { .type = NLA_REJECT }, [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = { .type = NLA_U32 }, - [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = NLA_POLICY_MIN(NLA_U32, 1), [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = { .type = NLA_U32 }, [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 9ca562bb8e66978b53028fa32b1a190708e6a091 Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Mon, 30 Mar 2026 16:46:24 +0800 Subject: net: ipv6: flowlabel: defer exclusive option free until RCU teardown `ip6fl_seq_show()` walks the global flowlabel hash under the seq-file RCU read-side lock and prints `fl->opt->opt_nflen` when an option block is present. Exclusive flowlabels currently free `fl->opt` as soon as `fl->users` drops to zero in `fl_release()`. However, the surrounding `struct ip6_flowlabel` remains visible in the global hash table until later garbage collection removes it and `fl_free_rcu()` finally tears it down. A concurrent `/proc/net/ip6_flowlabel` reader can therefore race that early `kfree()` and dereference freed option state, triggering a crash in `ip6fl_seq_show()`. Fix this by keeping `fl->opt` alive until `fl_free_rcu()`. That matches the lifetime already required for the enclosing flowlabel while readers can still reach it under RCU. 
Fixes: d3aedd5ebd4b ("ipv6 flowlabel: Convert hash list to RCU.") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Zhengchuan Liang Signed-off-by: Ren Wei Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/07351f0ec47bcee289576f39f9354f4a64add6e4.1774855883.git.zcliangcn@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_flowlabel.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 7c12bf75beed..c92f98c6f6ec 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -133,11 +133,6 @@ static void fl_release(struct ip6_flowlabel *fl) if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; - if (fl->opt && fl->share == IPV6_FL_S_EXCL) { - struct ipv6_txoptions *opt = fl->opt; - fl->opt = NULL; - kfree(opt); - } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); -- cgit v1.2.3 From 5dd8025a49c268ab6b94d978532af3ad341132a7 Mon Sep 17 00:00:00 2001 From: Li Xiasong Date: Mon, 30 Mar 2026 20:03:35 +0800 Subject: mptcp: fix soft lockup in mptcp_recvmsg() syzbot reported a soft lockup in mptcp_recvmsg() [0]. When receiving data with MSG_PEEK | MSG_WAITALL flags, the skb is not removed from the sk_receive_queue. This causes sk_wait_data() to always find available data and never perform actual waiting, leading to a soft lockup. Fix this by adding a 'last' parameter to track the last peeked skb. This allows sk_wait_data() to make informed waiting decisions and prevent infinite loops when MSG_PEEK is used. [0]: watchdog: BUG: soft lockup - CPU#2 stuck for 156s! 
[server:1963] Modules linked in: CPU: 2 UID: 0 PID: 1963 Comm: server Not tainted 6.19.0-rc8 #61 PREEMPT(none) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:sk_wait_data+0x15/0x190 Code: 80 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 41 56 41 55 41 54 49 89 f4 55 48 89 d5 53 48 89 fb <48> 83 ec 30 65 48 8b 05 17 a4 6b 01 48 89 44 24 28 31 c0 65 48 8b RSP: 0018:ffffc90000603ca0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff888102bf0800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffc90000603d18 RDI: ffff888102bf0800 RBP: 0000000000000000 R08: 0000000000000002 R09: 0000000000000101 R10: 0000000000000000 R11: 0000000000000075 R12: ffffc90000603d18 R13: ffff888102bf0800 R14: ffff888102bf0800 R15: 0000000000000000 FS: 00007f6e38b8c4c0(0000) GS:ffff8881b877e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055aa7bff1680 CR3: 0000000105cbe000 CR4: 00000000000006f0 Call Trace: mptcp_recvmsg+0x547/0x8c0 net/mptcp/protocol.c:2329 inet_recvmsg+0x11f/0x130 net/ipv4/af_inet.c:891 sock_recvmsg+0x94/0xc0 net/socket.c:1100 __sys_recvfrom+0xb2/0x130 net/socket.c:2256 __x64_sys_recvfrom+0x1f/0x30 net/socket.c:2267 do_syscall_64+0x59/0x2d0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x76/0x7e arch/x86/entry/entry_64.S:131 RIP: 0033:0x7f6e386a4a1d Code: 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8d 05 f1 de 2c 00 41 89 ca 8b 00 85 c0 75 20 45 31 c9 45 31 c0 b8 2d 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 6b f3 c3 66 0f 1f 84 00 00 00 00 00 41 56 41 RSP: 002b:00007ffc3c4bb078 EFLAGS: 00000246 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 000000000000861e RCX: 00007f6e386a4a1d RDX: 00000000000003ff RSI: 00007ffc3c4bb150 RDI: 0000000000000004 RBP: 00007ffc3c4bb570 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000103 R11: 0000000000000246 R12: 00005605dbc00be0 R13: 00007ffc3c4bb650 R14: 0000000000000000 R15: 0000000000000000 Fixes: 
8e04ce45a8db ("mptcp: fix MSG_PEEK stream corruption") Signed-off-by: Li Xiasong Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260330120335.659027-1-lixiasong1@huawei.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index cf1852b99963..65c3bb8016f4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2006,7 +2006,7 @@ static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, - int *cmsg_flags) + int *cmsg_flags, struct sk_buff **last) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; @@ -2023,6 +2023,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, /* skip already peeked skbs */ if (total_data_len + data_len <= copied_total) { total_data_len += data_len; + *last = skb; continue; } @@ -2058,6 +2059,8 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, } mptcp_eat_recv_skb(sk, skb); + } else { + *last = skb; } if (copied >= len) @@ -2288,10 +2291,12 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { + struct sk_buff *last = NULL; int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, - copied, &tss, &cmsg_flags); + copied, &tss, &cmsg_flags, + &last); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2343,7 +2348,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, pr_debug("block timeout %ld\n", timeo); mptcp_cleanup_rbuf(msk, copied); - err = sk_wait_data(sk, &timeo, NULL); + err = sk_wait_data(sk, &timeo, last); if (err < 0) { err = copied ? 
: err; goto out_err; -- cgit v1.2.3 From 76522fcdbc3a02b568f5d957f7e66fc194abb893 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 26 Mar 2026 00:17:09 +0100 Subject: netfilter: flowtable: strictly check for maximum number of actions The maximum number of flowtable hardware offload actions in IPv6 is: * ethernet mangling (4 payload actions, 2 for each ethernet address) * SNAT (4 payload actions) * DNAT (4 payload actions) * Double VLAN (4 vlan actions, 2 for popping vlan, and 2 for pushing) for QinQ. * Redirect (1 action) Which makes 17, while the maximum is 16. But act_ct supports tunnel actions too. Note that payload action operates at 32-bit word level, so mangling an IPv6 address takes 4 payload actions. Update flow_action_entry_next() calls to check for the maximum number of supported actions. While at it, raise the maximum number of actions per flow from 16 to 24 so this works fine with IPv6 setups. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Hyunwoo Kim Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 196 ++++++++++++++++++++++------------ 1 file changed, 130 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 9b677e116487..93d0aa7f8fcc 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -14,6 +14,8 @@ #include #include +#define NF_FLOW_RULE_ACTION_MAX 24 + static struct workqueue_struct *nf_flow_offload_add_wq; static struct workqueue_struct *nf_flow_offload_del_wq; static struct workqueue_struct *nf_flow_offload_stats_wq; @@ -216,7 +218,12 @@ static void flow_offload_mangle(struct flow_action_entry *entry, static inline struct flow_action_entry * flow_action_entry_next(struct nf_flow_rule *flow_rule) { - int i = flow_rule->rule->action.num_entries++; + int i; + + if (unlikely(flow_rule->rule->action.num_entries >= 
NF_FLOW_RULE_ACTION_MAX)) + return NULL; + + i = flow_rule->rule->action.num_entries++; return &flow_rule->rule->action.entries[i]; } @@ -234,6 +241,9 @@ static int flow_offload_eth_src(struct net *net, u32 mask, val; u16 val16; + if (!entry0 || !entry1) + return -E2BIG; + this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { @@ -284,6 +294,9 @@ static int flow_offload_eth_dst(struct net *net, u8 nud_state; u16 val16; + if (!entry0 || !entry1) + return -E2BIG; + this_tuple = &flow->tuplehash[dir].tuple; switch (this_tuple->xmit_type) { @@ -325,16 +338,19 @@ static int flow_offload_eth_dst(struct net *net, return 0; } -static void flow_offload_ipv4_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; @@ -345,23 +361,27 @@ static void flow_offload_ipv4_snat(struct net *net, offset = offsetof(struct iphdr, daddr); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); + return 0; } -static void flow_offload_ipv4_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask = ~htonl(0xffffffff); __be32 addr; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case 
FLOW_OFFLOAD_DIR_ORIGINAL: addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; @@ -372,14 +392,15 @@ static void flow_offload_ipv4_dnat(struct net *net, offset = offsetof(struct iphdr, saddr); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, &addr, &mask); + return 0; } -static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, +static int flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, unsigned int offset, const __be32 *addr, const __be32 *mask) { @@ -388,15 +409,20 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, offset + i * sizeof(u32), &addr[i], mask); } + + return 0; } -static void flow_offload_ipv6_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; @@ -412,16 +438,16 @@ static void flow_offload_ipv6_snat(struct net *net, offset = offsetof(struct ipv6hdr, daddr); break; default: - return; + return -EOPNOTSUPP; } - flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); + return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } -static void flow_offload_ipv6_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { u32 mask = ~htonl(0xffffffff); const __be32 *addr; @@ -437,10 +463,10 @@ static void flow_offload_ipv6_dnat(struct net 
*net, offset = offsetof(struct ipv6hdr, saddr); break; default: - return; + return -EOPNOTSUPP; } - flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); + return flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); } static int flow_offload_l4proto(const struct flow_offload *flow) @@ -462,15 +488,18 @@ static int flow_offload_l4proto(const struct flow_offload *flow) return type; } -static void flow_offload_port_snat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); @@ -485,22 +514,26 @@ static void flow_offload_port_snat(struct net *net, mask = ~htonl(0xffff); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); + return 0; } -static void flow_offload_port_dnat(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { struct flow_action_entry *entry = flow_action_entry_next(flow_rule); u32 mask, port; u32 offset; + if (!entry) + return -E2BIG; + switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); @@ -515,20 +548,24 @@ static void flow_offload_port_dnat(struct net *net, mask = ~htonl(0xffff0000); break; default: - return; + return -EOPNOTSUPP; } flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, &port, &mask); + return 0; } 
-static void flow_offload_ipv4_checksum(struct net *net, - const struct flow_offload *flow, - struct nf_flow_rule *flow_rule) +static int flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) { u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; + entry->id = FLOW_ACTION_CSUM; entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; @@ -540,12 +577,14 @@ static void flow_offload_ipv4_checksum(struct net *net, entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; break; } + + return 0; } -static void flow_offload_redirect(struct net *net, - const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_redirect(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple, *other_tuple; struct flow_action_entry *entry; @@ -563,21 +602,28 @@ static void flow_offload_redirect(struct net *net, ifindex = other_tuple->iifidx; break; default: - return; + return -EOPNOTSUPP; } dev = dev_get_by_index(net, ifindex); if (!dev) - return; + return -ENODEV; entry = flow_action_entry_next(flow_rule); + if (!entry) { + dev_put(dev); + return -E2BIG; + } + entry->id = FLOW_ACTION_REDIRECT; entry->dev = dev; + + return 0; } -static void flow_offload_encap_tunnel(const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_encap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *this_tuple; struct flow_action_entry *entry; @@ -585,7 +631,7 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow, this_tuple = &flow->tuplehash[dir].tuple; if (this_tuple->xmit_type == 
FLOW_OFFLOAD_XMIT_DIRECT) - return; + return 0; dst = this_tuple->dst_cache; if (dst && dst->lwtstate) { @@ -594,15 +640,19 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow, tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; entry->id = FLOW_ACTION_TUNNEL_ENCAP; entry->tunnel = tun_info; } } + + return 0; } -static void flow_offload_decap_tunnel(const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int flow_offload_decap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { const struct flow_offload_tuple *other_tuple; struct flow_action_entry *entry; @@ -610,7 +660,7 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, other_tuple = &flow->tuplehash[!dir].tuple; if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) - return; + return 0; dst = other_tuple->dst_cache; if (dst && dst->lwtstate) { @@ -619,9 +669,13 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, tun_info = lwt_tun_info(dst->lwtstate); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -E2BIG; entry->id = FLOW_ACTION_TUNNEL_DECAP; } } + + return 0; } static int @@ -633,8 +687,9 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, const struct flow_offload_tuple *tuple; int i; - flow_offload_decap_tunnel(flow, dir, flow_rule); - flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_decap_tunnel(flow, dir, flow_rule) < 0 || + flow_offload_encap_tunnel(flow, dir, flow_rule) < 0) + return -1; if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) @@ -650,6 +705,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, 
if (tuple->encap[i].proto == htons(ETH_P_8021Q)) { entry = flow_action_entry_next(flow_rule); + if (!entry) + return -1; entry->id = FLOW_ACTION_VLAN_POP; } } @@ -663,6 +720,8 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, continue; entry = flow_action_entry_next(flow_rule); + if (!entry) + return -1; switch (other_tuple->encap[i].proto) { case htons(ETH_P_PPP_SES): @@ -688,18 +747,22 @@ int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { - flow_offload_ipv4_snat(net, flow, dir, flow_rule); - flow_offload_port_snat(net, flow, dir, flow_rule); + if (flow_offload_ipv4_snat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_snat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { - flow_offload_ipv4_dnat(net, flow, dir, flow_rule); - flow_offload_port_dnat(net, flow, dir, flow_rule); + if (flow_offload_ipv4_dnat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_dnat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_SNAT, &flow->flags) || test_bit(NF_FLOW_DNAT, &flow->flags)) - flow_offload_ipv4_checksum(net, flow, flow_rule); + if (flow_offload_ipv4_checksum(net, flow, flow_rule) < 0) + return -1; - flow_offload_redirect(net, flow, dir, flow_rule); + if (flow_offload_redirect(net, flow, dir, flow_rule) < 0) + return -1; return 0; } @@ -713,22 +776,23 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { - flow_offload_ipv6_snat(net, flow, dir, flow_rule); - flow_offload_port_snat(net, flow, dir, flow_rule); + if (flow_offload_ipv6_snat(net, flow, dir, flow_rule) < 0 || + flow_offload_port_snat(net, flow, dir, flow_rule) < 0) + return -1; } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { - flow_offload_ipv6_dnat(net, flow, dir, flow_rule); - flow_offload_port_dnat(net, flow, dir, flow_rule); + if (flow_offload_ipv6_dnat(net, flow, 
dir, flow_rule) < 0 || + flow_offload_port_dnat(net, flow, dir, flow_rule) < 0) + return -1; } - flow_offload_redirect(net, flow, dir, flow_rule); + if (flow_offload_redirect(net, flow, dir, flow_rule) < 0) + return -1; return 0; } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); -#define NF_FLOW_RULE_ACTION_MAX 16 - static struct nf_flow_rule * nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload_work *offload, -- cgit v1.2.3 From 6d52a4a0520a6696bdde51caa11f2d6821cd0c01 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Mar 2026 16:17:24 +0100 Subject: netfilter: nfnetlink_log: account for netlink header size This is a followup to an old bug fix: NLMSG_DONE needs to account for the netlink header size, not just the attribute size. This can result in a WARN splat + drop of the netlink message, but other than this there are no ill effects. Fixes: 9dfa1dfe4d5e ("netfilter: nf_log: account for size of NLMSG_DONE attribute") Reported-by: Yiming Qian Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index fcbe54940b2e..f80978c06fa0 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -726,7 +726,7 @@ nfulnl_log_packet(struct net *net, + nla_total_size(plen) /* prefix */ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)) - + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ + + nlmsg_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */ if (in && skb_mac_header_was_set(skb)) { size += nla_total_size(skb->dev->hard_header_len) -- cgit v1.2.3 From a958a4f90ddd7de0800b33ca9d7b886b7d40f74e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 31 Mar 2026 23:13:36 +0200 Subject: netfilter: x_tables: ensure names are nul-terminated Reject names that lack a 
\0 character before feeding them to functions that expect c-strings. Fixes tag is the most recent commit that needs this change. Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_cgroup.c | 6 ++++++ net/netfilter/xt_rateest.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c437fbd59ec1..43d2ae2be628 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -65,6 +65,9 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) info->priv = NULL; if (info->has_path) { + if (strnlen(info->path, sizeof(info->path)) >= sizeof(info->path)) + return -ENAMETOOLONG; + cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", @@ -102,6 +105,9 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) info->priv = NULL; if (info->has_path) { + if (strnlen(info->path, sizeof(info->path)) >= sizeof(info->path)) + return -ENAMETOOLONG; + cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 72324bd976af..b1d736c15fcb 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -91,6 +91,11 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) goto err1; } + if (strnlen(info->name1, sizeof(info->name1)) >= sizeof(info->name1)) + return -ENAMETOOLONG; + if (strnlen(info->name2, sizeof(info->name2)) >= sizeof(info->name2)) + return -ENAMETOOLONG; + ret = -ENOENT; est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) -- cgit v1.2.3 From b7e8590987aa94c9dc51518fad0e58cb887b1db5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Mar 2026 14:16:34 +0200 Subject: netfilter: ipset: use nla_strcmp for IPSET_ATTR_NAME attr 
IPSET_ATTR_NAME and IPSET_ATTR_NAMEREF are of NLA_STRING type, they cannot be treated like a c-string. They either have to be switched to NLA_NUL_STRING, or the compare operations need to use the nla functions. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_core.c | 4 ++-- net/netfilter/ipset/ip_set_list_set.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index e9f4f845d760..b98331572ad2 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -309,7 +309,7 @@ enum { /* register and unregister set references */ extern ip_set_id_t ip_set_get_byname(struct net *net, - const char *name, struct ip_set **set); + const struct nlattr *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index a2fe711cb5e3..d0c9fe59c67d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -821,7 +821,7 @@ EXPORT_SYMBOL_GPL(ip_set_del); * */ ip_set_id_t -ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) +ip_set_get_byname(struct net *net, const struct nlattr *name, struct ip_set **set) { ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; @@ -830,7 +830,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s && STRNCMP(s->name, name)) { + if (s && nla_strcmp(name, s->name) == 0) { 
__ip_set_get(s); index = i; *set = s; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 98b91b34c314..1cef84f15e8c 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -367,7 +367,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); + e.id = ip_set_get_byname(map->net, tb[IPSET_ATTR_NAME], &s); if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ @@ -389,7 +389,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_NAMEREF]) { e.refid = ip_set_get_byname(map->net, - nla_data(tb[IPSET_ATTR_NAMEREF]), + tb[IPSET_ATTR_NAMEREF], &s); if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; -- cgit v1.2.3 From a242a9ae58aa46ff7dae51ce64150a93957abe65 Mon Sep 17 00:00:00 2001 From: Qi Tang Date: Mon, 30 Mar 2026 00:50:36 +0800 Subject: netfilter: nf_conntrack_helper: pass helper to expect cleanup nf_conntrack_helper_unregister() calls nf_ct_expect_iterate_destroy() to remove expectations belonging to the helper being unregistered. However, it passes NULL instead of the helper pointer as the data argument, so expect_iter_me() never matches any expectation and all of them survive the cleanup. After unregister returns, nfnl_cthelper_del() frees the helper object immediately. Subsequent expectation dumps or packet-driven init_conntrack() calls then dereference the freed exp->helper, causing a use-after-free. Pass the actual helper pointer so expectations referencing it are properly destroyed before the helper object is freed. 
BUG: KASAN: slab-use-after-free in string+0x38f/0x430 Read of size 1 at addr ffff888003b14d20 by task poc/103 Call Trace: string+0x38f/0x430 vsnprintf+0x3cc/0x1170 seq_printf+0x17a/0x240 exp_seq_show+0x2e5/0x560 seq_read_iter+0x419/0x1280 proc_reg_read+0x1ac/0x270 vfs_read+0x179/0x930 ksys_read+0xef/0x1c0 Freed by task 103: The buggy address is located 32 bytes inside of freed 192-byte region [ffff888003b14d00, ffff888003b14dc0) Fixes: ac7b84839003 ("netfilter: expect: add and use nf_ct_expect_iterate helpers") Signed-off-by: Qi Tang Reviewed-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 1b330ba6613b..a715304a53d8 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -415,7 +415,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) */ synchronize_rcu(); - nf_ct_expect_iterate_destroy(expect_iter_me, NULL); + nf_ct_expect_iterate_destroy(expect_iter_me, me); nf_ct_iterate_destroy(unhelp, me); /* nf_ct_iterate_destroy() does an unconditional synchronize_rcu() as -- cgit v1.2.3 From 35177c6877134a21315f37d57a5577846225623e Mon Sep 17 00:00:00 2001 From: Qi Tang Date: Tue, 31 Mar 2026 14:17:12 +0800 Subject: netfilter: ctnetlink: zero expect NAT fields when CTA_EXPECT_NAT absent ctnetlink_alloc_expect() allocates expectations from a non-zeroing slab cache via nf_ct_expect_alloc(). When CTA_EXPECT_NAT is not present in the netlink message, saved_addr and saved_proto are never initialized. Stale data from a previous slab occupant can then be dumped to userspace by ctnetlink_exp_dump_expect(), which checks these fields to decide whether to emit CTA_EXPECT_NAT. The safe sibling nf_ct_expect_init(), used by the packet path, explicitly zeroes these fields. 
Zero saved_addr, saved_proto and dir in the else branch, guarded by IS_ENABLED(CONFIG_NF_NAT) since these fields only exist when NAT is enabled. Confirmed by priming the expect slab with NAT-bearing expectations, freeing them, creating a new expectation without CTA_EXPECT_NAT, and observing that the ctnetlink dump emits a spurious CTA_EXPECT_NAT containing stale data from the prior allocation. Fixes: 076a0ca02644 ("netfilter: ctnetlink: add NAT support for expectations") Reported-by: kernel test robot Signed-off-by: Qi Tang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3f408f3713bb..38bd7124d9f7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3588,6 +3588,12 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp, nf_ct_l3num(ct)); if (err < 0) goto err_out; +#if IS_ENABLED(CONFIG_NF_NAT) + } else { + memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); + memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); + exp->dir = 0; +#endif } return exp; err_out: -- cgit v1.2.3 From 917b61fa2042f11e2af4c428e43f08199586633a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 Mar 2026 11:26:22 +0200 Subject: netfilter: ctnetlink: ignore explicit helper on new expectations Use the existing master conntrack helper, anything else is not really supported and it just makes validation more complicated, so just ignore what helper userspace suggests for this expectation. 
This was uncovered when validating CTA_EXPECT_CLASS via different helper provided by userspace than the existing master conntrack helper: BUG: KASAN: slab-out-of-bounds in nf_ct_expect_related_report+0x2479/0x27c0 Read of size 4 at addr ffff8880043fe408 by task poc/102 Call Trace: nf_ct_expect_related_report+0x2479/0x27c0 ctnetlink_create_expect+0x22b/0x3b0 ctnetlink_new_expect+0x4bd/0x5c0 nfnetlink_rcv_msg+0x67a/0x950 netlink_rcv_skb+0x120/0x350 Allowing to read kernel memory bytes off the expectation boundary. CTA_EXPECT_HELP_NAME is still used to offer the helper name to userspace via netlink dump. Fixes: bd0779370588 ("netfilter: nfnetlink_queue: allow to attach expectations to conntracks") Reported-by: Qi Tang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 54 ++++++------------------------------ 1 file changed, 9 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 38bd7124d9f7..a20cd82446c5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2636,7 +2636,6 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, - struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2865,7 +2864,6 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, { struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; - struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; int err; @@ -2879,17 +2877,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; - if (cda[CTA_EXPECT_HELP_NAME]) { - const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); - - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - 
nf_ct_protonum(ct)); - if (helper == NULL) - return -EOPNOTSUPP; - } - exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - helper, &tuple, &mask); + &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3528,11 +3517,11 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, - struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { struct net *net = read_pnet(&ct->ct_net); + struct nf_conntrack_helper *helper; struct nf_conntrack_expect *exp; struct nf_conn_help *help; u32 class = 0; @@ -3542,7 +3531,11 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, if (!help) return ERR_PTR(-EOPNOTSUPP); - if (cda[CTA_EXPECT_CLASS] && helper) { + helper = rcu_dereference(help->helper); + if (!helper) + return ERR_PTR(-EOPNOTSUPP); + + if (cda[CTA_EXPECT_CLASS]) { class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); if (class > helper->expect_class_max) return ERR_PTR(-EINVAL); @@ -3576,8 +3569,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; #endif - if (!helper) - helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; @@ -3609,7 +3600,6 @@ ctnetlink_create_expect(struct net *net, { struct nf_conntrack_tuple tuple, mask, master_tuple; struct nf_conntrack_tuple_hash *h = NULL; - struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; struct nf_conn *ct; int err; @@ -3635,33 +3625,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - if (cda[CTA_EXPECT_HELP_NAME]) { - const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); - - helper = __nf_conntrack_helper_find(helpname, u3, - nf_ct_protonum(ct)); - if (helper == NULL) { - 
rcu_read_unlock(); -#ifdef CONFIG_MODULES - if (request_module("nfct-helper-%s", helpname) < 0) { - err = -EOPNOTSUPP; - goto err_ct; - } - rcu_read_lock(); - helper = __nf_conntrack_helper_find(helpname, u3, - nf_ct_protonum(ct)); - if (helper) { - err = -EAGAIN; - goto err_rcu; - } - rcu_read_unlock(); -#endif - err = -EOPNOTSUPP; - goto err_ct; - } - } - - exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; @@ -3671,8 +3635,8 @@ ctnetlink_create_expect(struct net *net, nf_ct_expect_put(exp); err_rcu: rcu_read_unlock(); -err_ct: nf_ct_put(ct); + return err; } -- cgit v1.2.3 From 9862ef9ab0a116c6dca98842aab7de13a252ae02 Mon Sep 17 00:00:00 2001 From: Yifan Wu Date: Mon, 30 Mar 2026 14:39:24 -0700 Subject: netfilter: ipset: drop logically empty buckets in mtype_del mtype_del() counts empty slots below n->pos in k, but it only drops the bucket when both n->pos and k are zero. This misses buckets whose live entries have all been removed while n->pos still points past deleted slots. Treat a bucket as empty when all positions below n->pos are unused and release it directly instead of shrinking it further. 
Fixes: 8af1c6fbd923 ("netfilter: ipset: Fix forceadd evaluation path") Cc: stable@vger.kernel.org Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Yifan Wu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Reviewed-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 181daa9c2019..b79e5dd2af03 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1098,7 +1098,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!test_bit(i, n->used)) k++; } - if (n->pos == 0 && k == 0) { + if (k == n->pos) { t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); -- cgit v1.2.3 From 3d5d488f11776738deab9da336038add95d342d1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 31 Mar 2026 16:41:25 +0200 Subject: netfilter: x_tables: restrict xt_check_match/xt_check_target extensions for NFPROTO_ARP Weiming Shi says: xt_match and xt_target structs registered with NFPROTO_UNSPEC can be loaded by any protocol family through nft_compat. When such a match/target sets .hooks to restrict which hooks it may run on, the bitmask uses NF_INET_* constants. This is only correct for families whose hook layout matches NF_INET_*: IPv4, IPv6, INET, and bridge all share the same five hooks (PRE_ROUTING ... POST_ROUTING). ARP only has three hooks (IN=0, OUT=1, FORWARD=2) with different semantics. Because NF_ARP_OUT == 1 == NF_INET_LOCAL_IN, the .hooks validation silently passes for the wrong reasons, allowing matches to run on ARP chains where the hook assumptions (e.g. state->in being set on input hooks) do not hold. 
This leads to NULL pointer dereferences; xt_devgroup is one concrete example: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000044: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000220-0x0000000000000227] RIP: 0010:devgroup_mt+0xff/0x350 Call Trace: nft_match_eval (net/netfilter/nft_compat.c:407) nft_do_chain (net/netfilter/nf_tables_core.c:285) nft_do_chain_arp (net/netfilter/nft_chain_filter.c:61) nf_hook_slow (net/netfilter/core.c:623) arp_xmit (net/ipv4/arp.c:666) Kernel panic - not syncing: Fatal exception in interrupt Fix it by restricting arptables to NFPROTO_ARP extensions only. Note that arptables-legacy only supports: - arpt_CLASSIFY - arpt_mangle - arpt_MARK that provide explicit NFPROTO_ARP match/target declarations. Fixes: 9291747f118d ("netfilter: xtables: add device group match") Reported-by: Xiang Mei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e594b3b7ad82..b39017c80548 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -501,6 +501,17 @@ int xt_check_match(struct xt_mtchk_param *par, par->match->table, par->table); return -EINVAL; } + + /* NFPROTO_UNSPEC implies NF_INET_* hooks which do not overlap with + * NF_ARP_IN,OUT,FORWARD, allow explicit extensions with NFPROTO_ARP + * support. 
+ */ + if (par->family == NFPROTO_ARP && + par->match->family != NFPROTO_ARP) { + pr_info_ratelimited("%s_tables: %s match: not valid for this family\n", + xt_prefix[par->family], par->match->name); + return -EINVAL; + } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; @@ -1016,6 +1027,18 @@ int xt_check_target(struct xt_tgchk_param *par, par->target->table, par->table); return -EINVAL; } + + /* NFPROTO_UNSPEC implies NF_INET_* hooks which do not overlap with + * NF_ARP_IN,OUT,FORWARD, allow explicit extensions with NFPROTO_ARP + * support. + */ + if (par->family == NFPROTO_ARP && + par->target->family != NFPROTO_ARP) { + pr_info_ratelimited("%s_tables: %s target: not valid for this family\n", + xt_prefix[par->family], par->target->name); + return -EINVAL; + } + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { char used[64], allow[64]; -- cgit v1.2.3 From da107398cbd4bbdb6bffecb2ce86d5c9384f4cec Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 31 Mar 2026 23:08:02 +0200 Subject: netfilter: nf_tables: reject immediate NF_QUEUE verdict nft_queue is always used from userspace nftables to deliver the NF_QUEUE verdict. Immediately emitting an NF_QUEUE verdict is never used by the userspace nft tools, so reject immediate NF_QUEUE verdicts. The arp family does not provide queue support, but such an immediate verdict is still reachable. Globally reject NF_QUEUE immediate verdicts to address this issue. 
Fixes: f342de4e2f33 ("netfilter: nf_tables: reject QUEUE/DROP verdict parameters") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3922cff1bb3d..8c42247a176c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -11667,8 +11667,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, switch (data->verdict.code) { case NF_ACCEPT: case NF_DROP: - case NF_QUEUE: - break; case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: @@ -11703,6 +11701,11 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, data->verdict.chain = chain; break; + case NF_QUEUE: + /* The nft_queue expression is used for this purpose, an + * immediate NF_QUEUE verdict should not ever be seen here. + */ + fallthrough; default: return -EINVAL; } -- cgit v1.2.3 From a834a0b66ec6fb743377201a0f4229bb2503f4ce Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Wed, 25 Mar 2026 21:07:46 +0200 Subject: Bluetooth: hci_sync: call destroy in hci_cmd_sync_run if immediate hci_cmd_sync_run() may run the work immediately if called from existing sync work (otherwise it queues a new sync work). In this case it fails to call the destroy() function. On immediate run, make it behave same way as if item was queued successfully: call destroy, and return 0. The only callsite is hci_abort_conn() via hci_cmd_sync_run_once(), and this changes its return value. However, its return value is not used except as the return value for hci_disconnect(), and nothing uses the return value of hci_disconnect(). Hence there should be no behavior change anywhere. 
Fixes: c898f6d7b093b ("Bluetooth: hci_sync: Introduce hci_cmd_sync_run/hci_cmd_sync_run_once") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 45d16639874a..6283a4df78b0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -801,8 +801,15 @@ int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, return -ENETDOWN; /* If on cmd_sync_work then run immediately otherwise queue */ - if (current_work() == &hdev->cmd_sync_work) - return func(hdev, data); + if (current_work() == &hdev->cmd_sync_work) { + int err; + + err = func(hdev, data); + if (destroy) + destroy(hdev, data, err); + + return 0; + } return hci_cmd_sync_submit(hdev, func, data, destroy); } -- cgit v1.2.3 From 8a5b0135d4a5d9683203a3d9a12a711ccec5936b Mon Sep 17 00:00:00 2001 From: Cen Zhang Date: Thu, 26 Mar 2026 23:16:45 +0800 Subject: Bluetooth: SCO: fix race conditions in sco_sock_connect() sco_sock_connect() checks sk_state and sk_type without holding the socket lock. Two concurrent connect() syscalls on the same socket can both pass the check and enter sco_connect(), leading to use-after-free. 
The buggy scenario involves three participants and was confirmed with additional logging instrumentation:

  Thread A (connect):    HCI disconnect:      Thread B (connect):

  sco_sock_connect(sk)                        sco_sock_connect(sk)
  sk_state==BT_OPEN                           sk_state==BT_OPEN
  (pass, no lock)                             (pass, no lock)
  sco_connect(sk):                            sco_connect(sk):
    hci_dev_lock                                hci_dev_lock
    hci_connect_sco                               <- blocked
      -> hcon1
    sco_conn_add->conn1
    lock_sock(sk)
    sco_chan_add:
      conn1->sk = sk
      sk->conn = conn1
    sk_state=BT_CONNECT
    release_sock
    hci_dev_unlock
                         hci_dev_lock
                         sco_conn_del:
                           lock_sock(sk)
                           sco_chan_del:
                             sk->conn=NULL
                             conn1->sk=NULL
                             sk_state=
                               BT_CLOSED
                             SOCK_ZAPPED
                           release_sock
                         hci_dev_unlock
                                                (unblocked)
                                                hci_connect_sco
                                                  -> hcon2
                                                sco_conn_add
                                                  -> conn2
                                                lock_sock(sk)
                                                sco_chan_add:
                                                  sk->conn=conn2
                                                  sk_state=
                                                    BT_CONNECT
                                                  // zombie sk!
                                                release_sock
                                                hci_dev_unlock

Thread B revives a BT_CLOSED + SOCK_ZAPPED socket back to BT_CONNECT. Subsequent cleanup triggers double sock_put() and use-after-free. Meanwhile conn1 is leaked as it was orphaned when sco_conn_del() cleared the association.
Fix this by: - Moving lock_sock() before the sk_state/sk_type checks in sco_sock_connect() to serialize concurrent connect attempts - Fixing the sk_type != SOCK_SEQPACKET check to actually return the error instead of just assigning it - Adding a state re-check in sco_connect() after lock_sock() to catch state changes during the window between the locks - Adding sco_pi(sk)->conn check in sco_chan_add() to prevent double-attach of a socket to multiple connections - Adding hci_conn_drop() on sco_chan_add failure to prevent HCI connection leaks Fixes: 9a8ec9e8ebb5 ("Bluetooth: SCO: Fix possible circular locking dependency on sco_connect_cfm") Signed-off-by: Cen Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 584e059de20a..b84587811ef4 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -298,7 +298,7 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, int err = 0; sco_conn_lock(conn); - if (conn->sk) + if (conn->sk || sco_pi(sk)->conn) err = -EBUSY; else __sco_chan_add(conn, sk, parent); @@ -353,9 +353,20 @@ static int sco_connect(struct sock *sk) lock_sock(sk); + /* Recheck state after reacquiring the socket lock, as another + * thread may have changed it (e.g., closed the socket). 
+ */ + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { + release_sock(sk); + hci_conn_drop(hcon); + err = -EBADFD; + goto unlock; + } + err = sco_chan_add(conn, sk, NULL); if (err) { release_sock(sk); + hci_conn_drop(hcon); goto unlock; } @@ -656,13 +667,18 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, addr->sa_family != AF_BLUETOOTH) return -EINVAL; - if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) + lock_sock(sk); + + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { + release_sock(sk); return -EBADFD; + } - if (sk->sk_type != SOCK_SEQPACKET) - err = -EINVAL; + if (sk->sk_type != SOCK_SEQPACKET) { + release_sock(sk); + return -EINVAL; + } - lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); release_sock(sk); -- cgit v1.2.3 From 2b2bf47cd75518c36fa2d41380e4a40641cc89cd Mon Sep 17 00:00:00 2001 From: Oleh Konko Date: Thu, 26 Mar 2026 17:31:24 +0000 Subject: Bluetooth: hci_event: move wake reason storage into validated event handlers hci_store_wake_reason() is called from hci_event_packet() immediately after stripping the HCI event header but before hci_event_func() enforces the per-event minimum payload length from hci_ev_table. This means a short HCI event frame can reach bacpy() before any bounds check runs. Rather than duplicating skb parsing and per-event length checks inside hci_store_wake_reason(), move wake-address storage into the individual event handlers after their existing event-length validation has succeeded. Convert hci_store_wake_reason() into a small helper that only stores an already-validated bdaddr while the caller holds hci_dev_lock(). Use the same helper after hci_event_func() with a NULL address to preserve the existing unexpected-wake fallback semantics when no validated event handler records a wake address. 
Annotate the helper with __must_hold(&hdev->lock) and add lockdep_assert_held(&hdev->lock) so future call paths keep the lock contract explicit. Call the helper from hci_conn_request_evt(), hci_conn_complete_evt(), hci_sync_conn_complete_evt(), le_conn_complete_evt(), hci_le_adv_report_evt(), hci_le_ext_adv_report_evt(), hci_le_direct_adv_report_evt(), hci_le_pa_sync_established_evt(), and hci_le_past_received_evt(). Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events") Cc: stable@vger.kernel.org Signed-off-by: Oleh Konko Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 94 ++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 286529d2e554..81d2f9a3eec9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -80,6 +80,10 @@ static void *hci_le_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, return data; } +static void hci_store_wake_reason(struct hci_dev *hdev, + const bdaddr_t *bdaddr, u8 addr_type) + __must_hold(&hdev->lock); + static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3111,6 +3115,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); + hci_store_wake_reason(hdev, &ev->bdaddr, BDADDR_BREDR); /* Check for existing connection: * @@ -3274,6 +3279,10 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "bdaddr %pMR type 0x%x", &ev->bdaddr, ev->link_type); + hci_dev_lock(hdev); + hci_store_wake_reason(hdev, &ev->bdaddr, BDADDR_BREDR); + hci_dev_unlock(hdev); + /* Reject incoming connection from device with same BD ADDR against * CVE-2020-26555 */ @@ -5021,6 +5030,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); 
+ hci_store_wake_reason(hdev, &ev->bdaddr, BDADDR_BREDR); conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { @@ -5713,6 +5723,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, int err; hci_dev_lock(hdev); + hci_store_wake_reason(hdev, bdaddr, bdaddr_type); /* All controllers implicitly stop advertising in the event of a * connection, so ensure that the state bit is cleared. @@ -6005,6 +6016,7 @@ static void hci_le_past_received_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); hci_dev_lock(hdev); + hci_store_wake_reason(hdev, &ev->bdaddr, ev->bdaddr_type); hci_dev_clear_flag(hdev, HCI_PA_SYNC); @@ -6403,6 +6415,8 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, info->length + 1)) break; + hci_store_wake_reason(hdev, &info->bdaddr, info->bdaddr_type); + if (info->length <= max_adv_len(hdev)) { rssi = info->data[info->length]; process_adv_report(hdev, info->type, &info->bdaddr, @@ -6491,6 +6505,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, info->length)) break; + hci_store_wake_reason(hdev, &info->bdaddr, info->bdaddr_type); + evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); @@ -6536,6 +6552,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); hci_dev_lock(hdev); + hci_store_wake_reason(hdev, &ev->bdaddr, ev->bdaddr_type); hci_dev_clear_flag(hdev, HCI_PA_SYNC); @@ -6834,6 +6851,8 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data, for (i = 0; i < ev->num; i++) { struct hci_ev_le_direct_adv_info *info = &ev->info[i]; + hci_store_wake_reason(hdev, &info->bdaddr, info->bdaddr_type); + process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, &info->direct_addr, info->direct_addr_type, HCI_ADV_PHY_1M, 0, @@ -7517,73 +7536,29 @@ static bool 
hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, return true; } -static void hci_store_wake_reason(struct hci_dev *hdev, u8 event, - struct sk_buff *skb) +static void hci_store_wake_reason(struct hci_dev *hdev, + const bdaddr_t *bdaddr, u8 addr_type) + __must_hold(&hdev->lock) { - struct hci_ev_le_advertising_info *adv; - struct hci_ev_le_direct_adv_info *direct_adv; - struct hci_ev_le_ext_adv_info *ext_adv; - const struct hci_ev_conn_complete *conn_complete = (void *)skb->data; - const struct hci_ev_conn_request *conn_request = (void *)skb->data; - - hci_dev_lock(hdev); + lockdep_assert_held(&hdev->lock); /* If we are currently suspended and this is the first BT event seen, * save the wake reason associated with the event. */ if (!hdev->suspended || hdev->wake_reason) - goto unlock; + return; + + if (!bdaddr) { + hdev->wake_reason = MGMT_WAKE_REASON_UNEXPECTED; + return; + } /* Default to remote wake. Values for wake_reason are documented in the * Bluez mgmt api docs. */ hdev->wake_reason = MGMT_WAKE_REASON_REMOTE_WAKE; - - /* Once configured for remote wakeup, we should only wake up for - * reconnections. It's useful to see which device is waking us up so - * keep track of the bdaddr of the connection event that woke us up. 
- */ - if (event == HCI_EV_CONN_REQUEST) { - bacpy(&hdev->wake_addr, &conn_request->bdaddr); - hdev->wake_addr_type = BDADDR_BREDR; - } else if (event == HCI_EV_CONN_COMPLETE) { - bacpy(&hdev->wake_addr, &conn_complete->bdaddr); - hdev->wake_addr_type = BDADDR_BREDR; - } else if (event == HCI_EV_LE_META) { - struct hci_ev_le_meta *le_ev = (void *)skb->data; - u8 subevent = le_ev->subevent; - u8 *ptr = &skb->data[sizeof(*le_ev)]; - u8 num_reports = *ptr; - - if ((subevent == HCI_EV_LE_ADVERTISING_REPORT || - subevent == HCI_EV_LE_DIRECT_ADV_REPORT || - subevent == HCI_EV_LE_EXT_ADV_REPORT) && - num_reports) { - adv = (void *)(ptr + 1); - direct_adv = (void *)(ptr + 1); - ext_adv = (void *)(ptr + 1); - - switch (subevent) { - case HCI_EV_LE_ADVERTISING_REPORT: - bacpy(&hdev->wake_addr, &adv->bdaddr); - hdev->wake_addr_type = adv->bdaddr_type; - break; - case HCI_EV_LE_DIRECT_ADV_REPORT: - bacpy(&hdev->wake_addr, &direct_adv->bdaddr); - hdev->wake_addr_type = direct_adv->bdaddr_type; - break; - case HCI_EV_LE_EXT_ADV_REPORT: - bacpy(&hdev->wake_addr, &ext_adv->bdaddr); - hdev->wake_addr_type = ext_adv->bdaddr_type; - break; - } - } - } else { - hdev->wake_reason = MGMT_WAKE_REASON_UNEXPECTED; - } - -unlock: - hci_dev_unlock(hdev); + bacpy(&hdev->wake_addr, bdaddr); + hdev->wake_addr_type = addr_type; } #define HCI_EV_VL(_op, _func, _min_len, _max_len) \ @@ -7830,14 +7805,15 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) skb_pull(skb, HCI_EVENT_HDR_SIZE); - /* Store wake reason if we're suspended */ - hci_store_wake_reason(hdev, event, skb); - bt_dev_dbg(hdev, "event 0x%2.2x", event); hci_event_func(hdev, event, skb, &opcode, &status, &req_complete, &req_complete_skb); + hci_dev_lock(hdev); + hci_store_wake_reason(hdev, NULL, 0); + hci_dev_unlock(hdev); + if (req_complete) { req_complete(hdev, status, opcode); } else if (req_complete_skb) { -- cgit v1.2.3 From 2969554bcfccb5c609f6b6cd4a014933f3a66dd0 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen 
Date: Wed, 25 Mar 2026 21:07:43 +0200 Subject: Bluetooth: hci_sync: hci_cmd_sync_queue_once() return -EEXIST if exists hci_cmd_sync_queue_once() needs to indicate whether a queue item was added, so caller can know if callbacks are called, so it can avoid leaking resources. Change the function to return -EEXIST if queue item already exists. Modify all callsites to handle that. Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 53 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 6283a4df78b0..97745710e3ce 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -780,7 +780,7 @@ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) - return 0; + return -EEXIST; return hci_cmd_sync_queue(hdev, func, data, destroy); } @@ -3262,6 +3262,8 @@ static int update_passive_scan_sync(struct hci_dev *hdev, void *data) int hci_update_passive_scan(struct hci_dev *hdev) { + int err; + /* Only queue if it would have any effect */ if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || @@ -3271,8 +3273,9 @@ int hci_update_passive_scan(struct hci_dev *hdev) hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; - return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, - NULL); + err = hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, + NULL); + return (err == -EEXIST) ? 
0 : err; } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) @@ -6965,8 +6968,11 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, - NULL); + int err; + + err = hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, + NULL); + return (err == -EEXIST) ? 0 : err; } static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) @@ -7002,8 +7008,11 @@ done: int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, - create_le_conn_complete); + int err; + + err = hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, + create_le_conn_complete); + return (err == -EEXIST) ? 0 : err; } int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -7210,8 +7219,11 @@ done: int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn, - create_pa_complete); + int err; + + err = hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn, + create_pa_complete); + return (err == -EEXIST) ? 0 : err; } static void create_big_complete(struct hci_dev *hdev, void *data, int err) @@ -7273,8 +7285,11 @@ static int hci_le_big_create_sync(struct hci_dev *hdev, void *data) int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) { - return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, - create_big_complete); + int err; + + err = hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, + create_big_complete); + return (err == -EEXIST) ? 0 : err; } struct past_data { @@ -7366,7 +7381,7 @@ int hci_past_sync(struct hci_conn *conn, struct hci_conn *le) if (err) kfree(data); - return err; + return (err == -EEXIST) ? 
0 : err; } static void le_read_features_complete(struct hci_dev *hdev, void *data, int err) @@ -7453,7 +7468,7 @@ int hci_le_read_remote_features(struct hci_conn *conn) else err = -EOPNOTSUPP; - return err; + return (err == -EEXIST) ? 0 : err; } static void pkt_type_changed(struct hci_dev *hdev, void *data, int err) @@ -7479,6 +7494,7 @@ int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type) { struct hci_dev *hdev = conn->hdev; struct hci_cp_change_conn_ptype *cp; + int err; cp = kmalloc_obj(*cp); if (!cp) @@ -7487,8 +7503,9 @@ int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type) cp->handle = cpu_to_le16(conn->handle); cp->pkt_type = cpu_to_le16(pkt_type); - return hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp, - pkt_type_changed); + err = hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp, + pkt_type_changed); + return (err == -EEXIST) ? 0 : err; } static void le_phy_update_complete(struct hci_dev *hdev, void *data, int err) @@ -7514,6 +7531,7 @@ int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_set_phy *cp; + int err; cp = kmalloc_obj(*cp); if (!cp) @@ -7524,6 +7542,7 @@ int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys) cp->tx_phys = tx_phys; cp->rx_phys = rx_phys; - return hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp, - le_phy_update_complete); + err = hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp, + le_phy_update_complete); + return (err == -EEXIST) ? 0 : err; } -- cgit v1.2.3 From aca377208e7f7322bf4e107cdec6e7d7e8aa7a88 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Wed, 25 Mar 2026 21:07:44 +0200 Subject: Bluetooth: hci_sync: fix leaks when hci_cmd_sync_queue_once fails When hci_cmd_sync_queue_once() returns with error, the destroy callback will not be called. Fix leaking references / memory on these failures. 
Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 97745710e3ce..8cbbba50e77e 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7460,13 +7460,16 @@ int hci_le_read_remote_features(struct hci_conn *conn) * role is possible. Otherwise just transition into the * connected state without requesting the remote features. */ - if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) + if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { err = hci_cmd_sync_queue_once(hdev, hci_le_read_remote_features_sync, hci_conn_hold(conn), le_read_features_complete); - else + if (err) + hci_conn_drop(conn); + } else { err = -EOPNOTSUPP; + } return (err == -EEXIST) ? 0 : err; } @@ -7505,6 +7508,9 @@ int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type) err = hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp, pkt_type_changed); + if (err) + kfree(cp); + return (err == -EEXIST) ? 0 : err; } @@ -7544,5 +7550,8 @@ int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys) err = hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp, le_phy_update_complete); + if (err) + kfree(cp); + return (err == -EEXIST) ? 
0 : err; } -- cgit v1.2.3 From 035c25007c9e698bef3826070ee34bb6d778020c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 25 Mar 2026 11:11:46 -0400 Subject: Bluetooth: hci_sync: Fix UAF in le_read_features_complete This fixes the following backtrace caused by hci_conn being freed before le_read_features_complete but after hci_le_read_remote_features_sync so hci_conn_del -> hci_cmd_sync_dequeue is not able to prevent it: ================================================================== BUG: KASAN: slab-use-after-free in instrument_atomic_read_write include/linux/instrumented.h:96 [inline] BUG: KASAN: slab-use-after-free in atomic_dec_and_test include/linux/atomic/atomic-instrumented.h:1383 [inline] BUG: KASAN: slab-use-after-free in hci_conn_drop include/net/bluetooth/hci_core.h:1688 [inline] BUG: KASAN: slab-use-after-free in le_read_features_complete+0x5b/0x340 net/bluetooth/hci_sync.c:7344 Write of size 4 at addr ffff8880796b0010 by task kworker/u9:0/52 CPU: 0 UID: 0 PID: 52 Comm: kworker/u9:0 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 Workqueue: hci0 hci_cmd_sync_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xcd/0x630 mm/kasan/report.c:482 kasan_report+0xe0/0x110 mm/kasan/report.c:595 check_region_inline mm/kasan/generic.c:194 [inline] kasan_check_range+0x100/0x1b0 mm/kasan/generic.c:200 instrument_atomic_read_write include/linux/instrumented.h:96 [inline] atomic_dec_and_test include/linux/atomic/atomic-instrumented.h:1383 [inline] hci_conn_drop include/net/bluetooth/hci_core.h:1688 [inline] le_read_features_complete+0x5b/0x340 net/bluetooth/hci_sync.c:7344 hci_cmd_sync_work+0x1ff/0x430 net/bluetooth/hci_sync.c:334 process_one_work+0x9ba/0x1b20 kernel/workqueue.c:3257 process_scheduled_works kernel/workqueue.c:3340 [inline] 
worker_thread+0x6c8/0xf10 kernel/workqueue.c:3421 kthread+0x3c5/0x780 kernel/kthread.c:463 ret_from_fork+0x983/0xb10 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 Allocated by task 5932: kasan_save_stack+0x33/0x60 mm/kasan/common.c:56 kasan_save_track+0x14/0x30 mm/kasan/common.c:77 poison_kmalloc_redzone mm/kasan/common.c:400 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:417 kmalloc_noprof include/linux/slab.h:957 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] __hci_conn_add+0xf8/0x1c70 net/bluetooth/hci_conn.c:963 hci_conn_add_unset+0x76/0x100 net/bluetooth/hci_conn.c:1084 le_conn_complete_evt+0x639/0x1f20 net/bluetooth/hci_event.c:5714 hci_le_enh_conn_complete_evt+0x23d/0x380 net/bluetooth/hci_event.c:5861 hci_le_meta_evt+0x357/0x5e0 net/bluetooth/hci_event.c:7408 hci_event_func net/bluetooth/hci_event.c:7716 [inline] hci_event_packet+0x685/0x11c0 net/bluetooth/hci_event.c:7773 hci_rx_work+0x2c9/0xeb0 net/bluetooth/hci_core.c:4076 process_one_work+0x9ba/0x1b20 kernel/workqueue.c:3257 process_scheduled_works kernel/workqueue.c:3340 [inline] worker_thread+0x6c8/0xf10 kernel/workqueue.c:3421 kthread+0x3c5/0x780 kernel/kthread.c:463 ret_from_fork+0x983/0xb10 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 Freed by task 5932: kasan_save_stack+0x33/0x60 mm/kasan/common.c:56 kasan_save_track+0x14/0x30 mm/kasan/common.c:77 __kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:587 kasan_save_free_info mm/kasan/kasan.h:406 [inline] poison_slab_object mm/kasan/common.c:252 [inline] __kasan_slab_free+0x5f/0x80 mm/kasan/common.c:284 kasan_slab_free include/linux/kasan.h:234 [inline] slab_free_hook mm/slub.c:2540 [inline] slab_free mm/slub.c:6663 [inline] kfree+0x2f8/0x6e0 mm/slub.c:6871 device_release+0xa4/0x240 drivers/base/core.c:2565 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] 
kobject_put+0x1e7/0x590 lib/kobject.c:737 put_device drivers/base/core.c:3797 [inline] device_unregister+0x2f/0xc0 drivers/base/core.c:3920 hci_conn_del_sysfs+0xb4/0x180 net/bluetooth/hci_sysfs.c:79 hci_conn_cleanup net/bluetooth/hci_conn.c:173 [inline] hci_conn_del+0x657/0x1180 net/bluetooth/hci_conn.c:1234 hci_disconn_complete_evt+0x410/0xa00 net/bluetooth/hci_event.c:3451 hci_event_func net/bluetooth/hci_event.c:7719 [inline] hci_event_packet+0xa10/0x11c0 net/bluetooth/hci_event.c:7773 hci_rx_work+0x2c9/0xeb0 net/bluetooth/hci_core.c:4076 process_one_work+0x9ba/0x1b20 kernel/workqueue.c:3257 process_scheduled_works kernel/workqueue.c:3340 [inline] worker_thread+0x6c8/0xf10 kernel/workqueue.c:3421 kthread+0x3c5/0x780 kernel/kthread.c:463 ret_from_fork+0x983/0xb10 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:246 The buggy address belongs to the object at ffff8880796b0000 which belongs to the cache kmalloc-8k of size 8192 The buggy address is located 16 bytes inside of freed 8192-byte region [ffff8880796b0000, ffff8880796b2000) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x796b0 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 anon flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 00fff00000000040 ffff88813ff27280 0000000000000000 0000000000000001 raw: 0000000000000000 0000000000020002 00000000f5000000 0000000000000000 head: 00fff00000000040 ffff88813ff27280 0000000000000000 0000000000000001 head: 0000000000000000 0000000000020002 00000000f5000000 0000000000000000 head: 00fff00000000003 ffffea0001e5ac01 00000000ffffffff 00000000ffffffff head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000008 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 
0xd2040(__GFP_IO|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 5657, tgid 5657 (dhcpcd-run-hook), ts 79819636908, free_ts 79814310558 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1af/0x220 mm/page_alloc.c:1845 prep_new_page mm/page_alloc.c:1853 [inline] get_page_from_freelist+0xd0b/0x31a0 mm/page_alloc.c:3879 __alloc_frozen_pages_noprof+0x25f/0x2440 mm/page_alloc.c:5183 alloc_pages_mpol+0x1fb/0x550 mm/mempolicy.c:2416 alloc_slab_page mm/slub.c:3075 [inline] allocate_slab mm/slub.c:3248 [inline] new_slab+0x2c3/0x430 mm/slub.c:3302 ___slab_alloc+0xe18/0x1c90 mm/slub.c:4651 __slab_alloc.constprop.0+0x63/0x110 mm/slub.c:4774 __slab_alloc_node mm/slub.c:4850 [inline] slab_alloc_node mm/slub.c:5246 [inline] __kmalloc_cache_noprof+0x477/0x800 mm/slub.c:5766 kmalloc_noprof include/linux/slab.h:957 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] tomoyo_print_bprm security/tomoyo/audit.c:26 [inline] tomoyo_init_log+0xc8a/0x2140 security/tomoyo/audit.c:264 tomoyo_supervisor+0x302/0x13b0 security/tomoyo/common.c:2198 tomoyo_audit_env_log security/tomoyo/environ.c:36 [inline] tomoyo_env_perm+0x191/0x200 security/tomoyo/environ.c:63 tomoyo_environ security/tomoyo/domain.c:672 [inline] tomoyo_find_next_domain+0xec1/0x20b0 security/tomoyo/domain.c:888 tomoyo_bprm_check_security security/tomoyo/tomoyo.c:102 [inline] tomoyo_bprm_check_security+0x12d/0x1d0 security/tomoyo/tomoyo.c:92 security_bprm_check+0x1b9/0x1e0 security/security.c:794 search_binary_handler fs/exec.c:1659 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve fs/exec.c:1753 [inline] bprm_execve+0x81e/0x1620 fs/exec.c:1729 do_execveat_common.isra.0+0x4a5/0x610 fs/exec.c:1859 page last free pid 5657 tgid 5657 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1394 [inline] __free_frozen_pages+0x7df/0x1160 mm/page_alloc.c:2901 discard_slab mm/slub.c:3346 [inline] __put_partials+0x130/0x170 mm/slub.c:3886 
qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x4c/0xf0 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x195/0x1e0 mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x69/0x90 mm/kasan/common.c:352 kasan_slab_alloc include/linux/kasan.h:252 [inline] slab_post_alloc_hook mm/slub.c:4948 [inline] slab_alloc_node mm/slub.c:5258 [inline] __kmalloc_cache_noprof+0x274/0x800 mm/slub.c:5766 kmalloc_noprof include/linux/slab.h:957 [inline] tomoyo_print_header security/tomoyo/audit.c:156 [inline] tomoyo_init_log+0x197/0x2140 security/tomoyo/audit.c:255 tomoyo_supervisor+0x302/0x13b0 security/tomoyo/common.c:2198 tomoyo_audit_env_log security/tomoyo/environ.c:36 [inline] tomoyo_env_perm+0x191/0x200 security/tomoyo/environ.c:63 tomoyo_environ security/tomoyo/domain.c:672 [inline] tomoyo_find_next_domain+0xec1/0x20b0 security/tomoyo/domain.c:888 tomoyo_bprm_check_security security/tomoyo/tomoyo.c:102 [inline] tomoyo_bprm_check_security+0x12d/0x1d0 security/tomoyo/tomoyo.c:92 security_bprm_check+0x1b9/0x1e0 security/security.c:794 search_binary_handler fs/exec.c:1659 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve fs/exec.c:1753 [inline] bprm_execve+0x81e/0x1620 fs/exec.c:1729 do_execveat_common.isra.0+0x4a5/0x610 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x8e/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0xf80 arch/x86/entry/syscall_64.c:94 Memory state around the buggy address: ffff8880796aff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8880796aff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8880796b0000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880796b0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880796b0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: a106e50be74b ("Bluetooth: HCI: 
Add support for LL Extended Feature Set") Reported-by: syzbot+87badbb9094e008e0685@syzkaller.appspotmail.com Tested-by: syzbot+87badbb9094e008e0685@syzkaller.appspotmail.com Closes: https://syzbot.org/bug?extid=87badbb9094e008e0685 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Pauli Virtanen --- net/bluetooth/hci_sync.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8cbbba50e77e..ffb0ceda6f7b 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7390,10 +7390,8 @@ static void le_read_features_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED) - return; - hci_conn_drop(conn); + hci_conn_put(conn); } static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev, @@ -7463,10 +7461,12 @@ int hci_le_read_remote_features(struct hci_conn *conn) if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { err = hci_cmd_sync_queue_once(hdev, hci_le_read_remote_features_sync, - hci_conn_hold(conn), + hci_conn_hold(hci_conn_get(conn)), le_read_features_complete); - if (err) + if (err) { hci_conn_drop(conn); + hci_conn_put(conn); + } } else { err = -EOPNOTSUPP; } -- cgit v1.2.3 From b8dbe9648d69059cfe3a28917bfbf7e61efd7f15 Mon Sep 17 00:00:00 2001 From: Keenan Dong Date: Sat, 28 Mar 2026 16:46:47 +0800 Subject: Bluetooth: MGMT: validate LTK enc_size on load Load Long Term Keys stores the user-provided enc_size and later uses it to size fixed-size stack operations when replying to LE LTK requests. An enc_size larger than the 16-byte key buffer can therefore overflow the reply stack buffer. Reject oversized enc_size values while validating the management LTK record so invalid keys never reach the stored key state. 
Fixes: 346af67b8d11 ("Bluetooth: Add MGMT handlers for dealing with SMP LTK's") Reported-by: Keenan Dong Signed-off-by: Keenan Dong Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index e5f9287fb826..adcd86c15b4e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7248,6 +7248,9 @@ static bool ltk_is_valid(struct mgmt_ltk_info *key) if (key->initiator != 0x00 && key->initiator != 0x01) return false; + if (key->enc_size > sizeof(key->val)) + return false; + switch (key->addr.type) { case BDADDR_LE_PUBLIC: return true; -- cgit v1.2.3 From a2639a7f0f5bf7d73f337f8f077c19415c62ed2c Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 29 Mar 2026 16:43:01 +0300 Subject: Bluetooth: hci_conn: fix potential UAF in set_cig_params_sync hci_conn lookup and field access must be covered by hdev lock in set_cig_params_sync, otherwise it's possible it is freed concurrently. Take hdev lock to prevent hci_conn from being deleted or modified concurrently. Just RCU lock is not suitable here, as we also want to avoid "tearing" in the configuration. 
Fixes: a091289218202 ("Bluetooth: hci_conn: Fix hci_le_set_cig_params") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e6393f17576b..11d3ad8d2551 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1843,9 +1843,13 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) u8 aux_num_cis = 0; u8 cis_id; + hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_cig(hdev, cig_id); - if (!conn) + if (!conn) { + hci_dev_unlock(hdev); return 0; + } qos = &conn->iso_qos; pdu->cig_id = cig_id; @@ -1884,6 +1888,8 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) } pdu->num_cis = aux_num_cis; + hci_dev_unlock(hdev); + if (!pdu->num_cis) return 0; -- cgit v1.2.3 From b255531b27da336571411248c2a72a350662bd09 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 29 Mar 2026 16:43:02 +0300 Subject: Bluetooth: hci_event: fix potential UAF in hci_le_remote_conn_param_req_evt hci_conn lookup and field access must be covered by hdev lock in hci_le_remote_conn_param_req_evt, otherwise it's possible it is freed concurrently. Extend the hci_dev_lock critical section to cover all conn usage. 
Fixes: 95118dd4edfec ("Bluetooth: hci_event: Use of a function table to handle LE subevents") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 81d2f9a3eec9..3ebc5e6d45d9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6784,25 +6784,31 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, latency = le16_to_cpu(ev->latency); timeout = le16_to_cpu(ev->timeout); + hci_dev_lock(hdev); + hcon = hci_conn_hash_lookup_handle(hdev, handle); - if (!hcon || hcon->state != BT_CONNECTED) - return send_conn_param_neg_reply(hdev, handle, - HCI_ERROR_UNKNOWN_CONN_ID); + if (!hcon || hcon->state != BT_CONNECTED) { + send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_UNKNOWN_CONN_ID); + goto unlock; + } - if (max > hcon->le_conn_max_interval) - return send_conn_param_neg_reply(hdev, handle, - HCI_ERROR_INVALID_LL_PARAMS); + if (max > hcon->le_conn_max_interval) { + send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + goto unlock; + } - if (hci_check_conn_params(min, max, latency, timeout)) - return send_conn_param_neg_reply(hdev, handle, - HCI_ERROR_INVALID_LL_PARAMS); + if (hci_check_conn_params(min, max, latency, timeout)) { + send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + goto unlock; + } if (hcon->role == HCI_ROLE_MASTER) { struct hci_conn_params *params; u8 store_hint; - hci_dev_lock(hdev); - params = hci_conn_params_lookup(hdev, &hcon->dst, hcon->dst_type); if (params) { @@ -6815,8 +6821,6 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, store_hint = 0x00; } - hci_dev_unlock(hdev); - mgmt_new_conn_param(hdev, &hcon->dst, hcon->dst_type, store_hint, min, max, latency, timeout); } @@ -6830,6 +6834,9 @@ static void 
hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data, cp.max_ce_len = 0; hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_REPLY, sizeof(cp), &cp); + +unlock: + hci_dev_unlock(hdev); } static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data, -- cgit v1.2.3 From bda93eec78cdbfe5cda00785cefebd443e56b88b Mon Sep 17 00:00:00 2001 From: Keenan Dong Date: Wed, 1 Apr 2026 22:25:26 +0800 Subject: Bluetooth: MGMT: validate mesh send advertising payload length mesh_send() currently bounds MGMT_OP_MESH_SEND by total command length, but it never verifies that the bytes supplied for the flexible adv_data[] array actually match the embedded adv_data_len field. MGMT_MESH_SEND_SIZE only covers the fixed header, so a truncated command can still pass the existing 20..50 byte range check and later drive the async mesh send path past the end of the queued command buffer. Keep rejecting zero-length and oversized advertising payloads, but validate adv_data_len explicitly and require the command length to exactly match the flexible array size before queueing the request. 
Fixes: b338d91703fa ("Bluetooth: Implement support for Mesh") Reported-by: Keenan Dong Signed-off-by: Keenan Dong Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index adcd86c15b4e..b05bb380e5f8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2478,6 +2478,7 @@ static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct mgmt_mesh_tx *mesh_tx; struct mgmt_cp_mesh_send *send = data; struct mgmt_rp_mesh_read_features rp; + u16 expected_len; bool sending; int err = 0; @@ -2485,12 +2486,19 @@ static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_NOT_SUPPORTED); - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || - len <= MGMT_MESH_SEND_SIZE || - len > (MGMT_MESH_SEND_SIZE + 31)) + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_REJECTED); + + if (!send->adv_data_len || send->adv_data_len > 31) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_REJECTED); + expected_len = struct_size(send, adv_data, send->adv_data_len); + if (expected_len != len) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); memset(&rp, 0, sizeof(rp)); -- cgit v1.2.3 From d05111bfe37bfd8bd4d2dfe6675d6bdeef43f7c7 Mon Sep 17 00:00:00 2001 From: Oleh Konko Date: Tue, 31 Mar 2026 11:52:12 +0000 Subject: Bluetooth: SMP: force responder MITM requirements before building the pairing response smp_cmd_pairing_req() currently builds the pairing response from the initiator auth_req before enforcing the local BT_SECURITY_HIGH requirement. 
If the initiator omits SMP_AUTH_MITM, the response can also omit it even though the local side still requires MITM. tk_request() then sees an auth value without SMP_AUTH_MITM and may select JUST_CFM, making method selection inconsistent with the pairing policy the responder already enforces. When the local side requires HIGH security, first verify that MITM can be achieved from the IO capabilities and then force SMP_AUTH_MITM in the response in both rsp.auth_req and auth. This keeps the responder auth bits and later method selection aligned. Fixes: 2b64d153a0cc ("Bluetooth: Add MITM mechanism to LE-SMP") Cc: stable@vger.kernel.org Suggested-by: Luiz Augusto von Dentz Signed-off-by: Oleh Konko Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/smp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 485e3468bd26..deb8dd244b77 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1826,7 +1826,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; - /* If we need MITM check that it can be achieved */ + /* If we need MITM check that it can be achieved. */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; @@ -1834,6 +1834,10 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) req->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; + + /* Force MITM bit if it isn't set by the initiator. 
*/ + auth |= SMP_AUTH_MITM; + rsp.auth_req |= SMP_AUTH_MITM; } key_size = min(req->max_key_size, rsp.max_key_size); -- cgit v1.2.3 From 20756fec2f0108cb88e815941f1ffff88dc286fe Mon Sep 17 00:00:00 2001 From: Oleh Konko Date: Tue, 31 Mar 2026 11:52:13 +0000 Subject: Bluetooth: SMP: derive legacy responder STK authentication from MITM state The legacy responder path in smp_random() currently labels the stored STK as authenticated whenever pending_sec_level is BT_SECURITY_HIGH. That reflects what the local service requested, not what the pairing flow actually achieved. For Just Works/Confirm legacy pairing, SMP_FLAG_MITM_AUTH stays clear and the resulting STK should remain unauthenticated even if the local side requested HIGH security. Use the established MITM state when storing the responder STK so the key metadata matches the pairing result. This also keeps the legacy path aligned with the Secure Connections code, which already treats JUST_WORKS/JUST_CFM as unauthenticated. Fixes: fff3490f4781 ("Bluetooth: Fix setting correct authentication information for SMP STK") Cc: stable@vger.kernel.org Signed-off-by: Oleh Konko Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/smp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index deb8dd244b77..98f1da4f5f55 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1018,10 +1018,7 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tk, smp->prnd, smp->rrnd, stk); - if (hcon->pending_sec_level == BT_SECURITY_HIGH) - auth = 1; - else - auth = 0; + auth = test_bit(SMP_FLAG_MITM_AUTH, &smp->flags) ? 
1 : 0; /* Even though there's no _RESPONDER suffix this is the * responder STK we're adding for later lookup (the initiator -- cgit v1.2.3 From bc39a094730ce062fa034a529c93147c096cb488 Mon Sep 17 00:00:00 2001 From: hkbinbin Date: Tue, 31 Mar 2026 05:39:16 +0000 Subject: Bluetooth: hci_sync: fix stack buffer overflow in hci_le_big_create_sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hci_le_big_create_sync() uses DEFINE_FLEX to allocate a struct hci_cp_le_big_create_sync on the stack with room for 0x11 (17) BIS entries. However, conn->num_bis can hold up to HCI_MAX_ISO_BIS (31) entries — validated against ISO_MAX_NUM_BIS (0x1f) in the caller hci_conn_big_create_sync(). When conn->num_bis is between 18 and 31, the memcpy that copies conn->bis into cp->bis writes up to 14 bytes past the stack buffer, corrupting adjacent stack memory. This is trivially reproducible: binding an ISO socket with bc_num_bis = ISO_MAX_NUM_BIS (31) and calling listen() will eventually trigger hci_le_big_create_sync() from the HCI command sync worker, causing a KASAN-detectable stack-out-of-bounds write: BUG: KASAN: stack-out-of-bounds in hci_le_big_create_sync+0x256/0x3b0 Write of size 31 at addr ffffc90000487b48 by task kworker/u9:0/71 Fix this by changing the DEFINE_FLEX count from the incorrect 0x11 to HCI_MAX_ISO_BIS, which matches the maximum number of BIS entries that conn->bis can actually carry. 
Fixes: 42ecf1947135 ("Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending") Cc: stable@vger.kernel.org Signed-off-by: hkbinbin Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index ffb0ceda6f7b..919ec275dd23 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7241,7 +7241,8 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) static int hci_le_big_create_sync(struct hci_dev *hdev, void *data) { - DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x11); + DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, + HCI_MAX_ISO_BIS); struct hci_conn *conn = data; struct bt_iso_qos *qos = &conn->iso_qos; int err; -- cgit v1.2.3 From ffb5a4843c5bde702ed17cbcdbda98b37f7a6dad Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 31 Mar 2026 12:17:18 +0800 Subject: ipv6: fix data race in fib6_metric_set() using cmpxchg fib6_metric_set() may be called concurrently from softirq context without holding the FIB table lock. A typical path is: ndisc_router_discovery() spin_unlock_bh(&table->tb6_lock) <- lock released fib6_metric_set(rt, RTAX_HOPLIMIT, ...) <- lockless call When two CPUs process Router Advertisement packets for the same router simultaneously, they can both arrive at fib6_metric_set() with the same fib6_info pointer whose fib6_metrics still points to dst_default_metrics. 
if (f6i->fib6_metrics == &dst_default_metrics) { /* both CPUs: true */ struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC); refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; /* CPU1 overwrites CPU0's p -> p0 leaked */ } The dst_metrics allocated by the losing CPU has refcnt=1 but no pointer to it anywhere in memory, producing a kmemleak report: unreferenced object 0xff1100025aca1400 (size 96): comm "softirq", pid 0, jiffies 4299271239 backtrace: kmalloc_trace+0x28a/0x380 fib6_metric_set+0xcd/0x180 ndisc_router_discovery+0x12dc/0x24b0 icmpv6_rcv+0xc16/0x1360 Fix this by: - Set val in p->metrics before the pointer is published via cmpxchg() so the metrics value is ready before the pointer becomes visible to other CPUs. - Replace the plain pointer store with cmpxchg() and free the allocation safely when the cmpxchg() race is lost. - Add READ_ONCE()/WRITE_ONCE() for metrics[] setting in the non-default metrics path to prevent compiler-based data races. Fixes: d4ead6b34b67 ("net/ipv6: move metrics from dst to rt6_info") Reported-by: Fei Liu Reviewed-by: Jiayuan Chen Signed-off-by: Hangbin Liu Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260331-b4-fib6_metric_set-kmemleak-v3-1-88d27f4d8825@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index dd26657b6a4a..45ef4d65dcbc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -727,20 +727,28 @@ unlock: void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { + struct dst_metrics *m; + if (!f6i) return; - if (f6i->fib6_metrics == &dst_default_metrics) { + if (READ_ONCE(f6i->fib6_metrics) == &dst_default_metrics) { + struct dst_metrics *dflt = (struct dst_metrics *)&dst_default_metrics; struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC); if (!p) return; + p->metrics[metric - 1] = val; refcount_set(&p->refcnt, 1); - f6i->fib6_metrics = p; + if
(cmpxchg(&f6i->fib6_metrics, dflt, p) != dflt) + kfree(p); + else + return; } - f6i->fib6_metrics->metrics[metric - 1] = val; + m = READ_ONCE(f6i->fib6_metrics); + WRITE_ONCE(m->metrics[metric - 1], val); } /* -- cgit v1.2.3 From a54ecccfae62c5c85259ae5ea5d9c20009519049 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Tue, 31 Mar 2026 00:32:38 +0800 Subject: rds: ib: reject FRMR registration before IB connection is established rds_ib_get_mr() extracts the rds_ib_connection from conn->c_transport_data and passes it to rds_ib_reg_frmr() for FRWR memory registration. On a fresh outgoing connection, ic is allocated in rds_ib_conn_alloc() with i_cm_id = NULL because the connection worker has not yet called rds_ib_conn_path_connect() to create the rdma_cm_id. When sendmsg() with RDS_CMSG_RDMA_MAP is called on such a connection, the sendmsg path parses the control message before any connection establishment, allowing rds_ib_post_reg_frmr() to dereference ic->i_cm_id->qp and crash the kernel. The existing guard in rds_ib_reg_frmr() only checks for !ic (added in commit 9e630bcb7701), which does not catch this case since ic is allocated early and is always non-NULL once the connection object exists. KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:rds_ib_post_reg_frmr+0x50e/0x920 Call Trace: rds_ib_post_reg_frmr (net/rds/ib_frmr.c:167) rds_ib_map_frmr (net/rds/ib_frmr.c:252) rds_ib_reg_frmr (net/rds/ib_frmr.c:430) rds_ib_get_mr (net/rds/ib_rdma.c:615) __rds_rdma_map (net/rds/rdma.c:295) rds_cmsg_rdma_map (net/rds/rdma.c:860) rds_sendmsg (net/rds/send.c:1363) ____sys_sendmsg do_syscall_64 Add a check in rds_ib_get_mr() that verifies ic, i_cm_id, and qp are all non-NULL before proceeding with FRMR registration, mirroring the guard already present in rds_ib_post_inv(). 
Return -ENODEV when the connection is not ready, which the existing error handling in rds_cmsg_send() converts to -EAGAIN for userspace retry and triggers rds_conn_connect_if_down() to start the connection worker. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20260330163237.2752440-2-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/ib_rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 077f7041df15..2cfec252eeac 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -604,8 +604,13 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, return ibmr; } - if (conn) + if (conn) { ic = conn->c_transport_data; + if (!ic || !ic->i_cm_id || !ic->i_cm_id->qp) { + ret = -ENODEV; + goto out; + } + } if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; -- cgit v1.2.3 From ad8391d37f334ee73ba91926f8b4e4cf6d31ea04 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 1 Apr 2026 00:54:15 +0000 Subject: bpf: sockmap: Fix use-after-free of sk->sk_socket in sk_psock_verdict_data_ready(). syzbot reported use-after-free of AF_UNIX socket's sk->sk_socket in sk_psock_verdict_data_ready(). [0] In unix_stream_sendmsg(), the peer socket's ->sk_data_ready() is called after dropping its unix_state_lock(). Although the sender socket holds the peer's refcount, it does not prevent the peer's sock_orphan(), and the peer's sk_socket might be freed after one RCU grace period. Let's fetch the peer's sk->sk_socket and sk->sk_socket->ops under RCU in sk_psock_verdict_data_ready(). 
[0]: BUG: KASAN: slab-use-after-free in sk_psock_verdict_data_ready+0xec/0x590 net/core/skmsg.c:1278 Read of size 8 at addr ffff8880594da860 by task syz.4.1842/11013 CPU: 1 UID: 0 PID: 11013 Comm: syz.4.1842 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xba/0x230 mm/kasan/report.c:482 kasan_report+0x117/0x150 mm/kasan/report.c:595 sk_psock_verdict_data_ready+0xec/0x590 net/core/skmsg.c:1278 unix_stream_sendmsg+0x8a3/0xe80 net/unix/af_unix.c:2482 sock_sendmsg_nosec net/socket.c:721 [inline] __sock_sendmsg net/socket.c:736 [inline] ____sys_sendmsg+0x972/0x9f0 net/socket.c:2585 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2639 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2674 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7facf899c819 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007facf9827028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007facf8c15fa0 RCX: 00007facf899c819 RDX: 0000000000000000 RSI: 0000200000000500 RDI: 0000000000000004 RBP: 00007facf8a32c91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007facf8c16038 R14: 00007facf8c15fa0 R15: 00007ffd41b01c78 Allocated by task 11013: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 unpoison_slab_object mm/kasan/common.c:340 [inline] __kasan_slab_alloc+0x6c/0x80 
mm/kasan/common.c:366 kasan_slab_alloc include/linux/kasan.h:253 [inline] slab_post_alloc_hook mm/slub.c:4538 [inline] slab_alloc_node mm/slub.c:4866 [inline] kmem_cache_alloc_lru_noprof+0x2b8/0x640 mm/slub.c:4885 sock_alloc_inode+0x28/0xc0 net/socket.c:316 alloc_inode+0x6a/0x1b0 fs/inode.c:347 new_inode_pseudo include/linux/fs.h:3003 [inline] sock_alloc net/socket.c:631 [inline] __sock_create+0x12d/0x9d0 net/socket.c:1562 sock_create net/socket.c:1656 [inline] __sys_socketpair+0x1c4/0x560 net/socket.c:1803 __do_sys_socketpair net/socket.c:1856 [inline] __se_sys_socketpair net/socket.c:1853 [inline] __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1853 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 15: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 poison_slab_object mm/kasan/common.c:253 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:2685 [inline] slab_free mm/slub.c:6165 [inline] kmem_cache_free+0x187/0x630 mm/slub.c:6295 rcu_do_batch kernel/rcu/tree.c:2617 [inline] rcu_core+0x7cd/0x1070 kernel/rcu/tree.c:2869 handle_softirqs+0x22a/0x870 kernel/softirq.c:622 run_ksoftirqd+0x36/0x60 kernel/softirq.c:1063 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x51e/0xb90 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Fixes: c63829182c37 ("af_unix: Implement ->psock_update_sk_prot()") Closes: https://lore.kernel.org/bpf/69cc6b9f.a70a0220.128fd0.004b.GAE@google.com/ Reported-by: syzbot+2184232f07e3677fbaef@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: 
https://patch.msgid.link/20260401005418.2452999-1-kuniyu@google.com --- net/core/skmsg.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3261793abe83..6187a83bd741 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1267,17 +1267,20 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { - struct socket *sock = sk->sk_socket; - const struct proto_ops *ops; + const struct proto_ops *ops = NULL; + struct socket *sock; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock)) - return; - ops = READ_ONCE(sock->ops); + rcu_read_lock(); + sock = READ_ONCE(sk->sk_socket); + if (likely(sock)) + ops = READ_ONCE(sock->ops); + rcu_read_unlock(); if (!ops || !ops->read_skb) return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; -- cgit v1.2.3 From d64cb81dcbd54927515a7f65e5e24affdc73c14b Mon Sep 17 00:00:00 2001 From: Yucheng Lu Date: Tue, 31 Mar 2026 16:00:21 +0800 Subject: net/sched: sch_netem: fix out-of-bounds access in packet corruption In netem_enqueue(), the packet corruption logic uses get_random_u32_below(skb_headlen(skb)) to select an index for modifying skb->data. When an AF_PACKET TX_RING sends fully non-linear packets over an IPIP tunnel, skb_headlen(skb) evaluates to 0. Passing 0 to get_random_u32_below() takes the variable-ceil slow path which returns an unconstrained 32-bit random integer. Using this unconstrained value as an offset into skb->data results in an out-of-bounds memory access. Fix this by verifying skb_headlen(skb) is non-zero before attempting to corrupt the linear data area. Fully non-linear packets will silently bypass the corruption logic. 
Fixes: c865e5d99e25 ("[PKT_SCHED] netem: packet corruption option") Reported-by: Yifan Wu Reported-by: Juefei Pu Signed-off-by: Yuan Tan Signed-off-by: Xin Liu Signed-off-by: Yuhang Zheng Signed-off-by: Yucheng Lu Reviewed-by: Stephen Hemminger Link: https://patch.msgid.link/45435c0935df877853a81e6d06205ac738ec65fa.1774941614.git.kanolyc@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5de1c932944a..20df1c08b1e9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -519,8 +519,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto finish_segs; } - skb->data[get_random_u32_below(skb_headlen(skb))] ^= - 1<<get_random_u32_below(8); + if (skb_headlen(skb)) + skb->data[get_random_u32_below(skb_headlen(skb))] ^= + 1 << get_random_u32_below(8); } if (unlikely(q->t_len >= sch->limit)) { -- cgit v1.2.3 From d10a26aa4d072320530e6968ef945c8c575edf61 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 31 Mar 2026 09:43:17 +0200 Subject: net/x25: Fix potential double free of skb When alloc_skb fails in x25_queue_rx_frame it calls kfree_skb(skb) at line 48 and returns 1 (error). This error propagates back through the call chain: x25_queue_rx_frame returns 1 | v x25_state3_machine receives the return value 1 and takes the else branch at line 278, setting queued=0 and returning 0 | v x25_process_rx_frame returns queued=0 | v x25_backlog_rcv at line 452 sees queued=0 and calls kfree_skb(skb) again This would free the same skb twice. Looking at x25_backlog_rcv: net/x25/x25_in.c:x25_backlog_rcv() { ... queued = x25_process_rx_frame(sk, skb); ... 
if (!queued) kfree_skb(skb); } Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Martin Schiller Link: https://patch.msgid.link/20260331-x25_fraglen-v4-1-3e69f18464b4@dev.tdt.de Signed-off-by: Paolo Abeni --- net/x25/x25_in.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index b981a4828d08..0dbc73efab1c 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -44,10 +44,9 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) if (x25->fraglen > 0) { /* End of fragment */ int len = x25->fraglen + skb->len; - if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL){ - kfree_skb(skb); + skbn = alloc_skb(len, GFP_ATOMIC); + if (!skbn) return 1; - } skb_queue_tail(&x25->fragment_queue, skb); -- cgit v1.2.3 From a1822cb524e89b4cd2cf0b82e484a2335496a6d9 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Tue, 31 Mar 2026 09:43:18 +0200 Subject: net/x25: Fix overflow when accumulating packets Add a check to ensure that `x25_sock.fraglen` does not overflow. The `fraglen` also needs to be reset when purging `fragment_queue` in `x25_clear_queues()`. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Yiming Qian Signed-off-by: Martin Schiller Link: https://patch.msgid.link/20260331-x25_fraglen-v4-2-3e69f18464b4@dev.tdt.de Signed-off-by: Paolo Abeni --- net/x25/x25_in.c | 4 ++++ net/x25/x25_subr.c | 1 + 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 0dbc73efab1c..e47ebd8acd21 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -34,6 +34,10 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) struct sk_buff *skbo, *skbn = skb; struct x25_sock *x25 = x25_sk(sk); + /* make sure we don't overflow */ + if (x25->fraglen + skb->len > USHRT_MAX) + return 1; + if (more) { x25->fraglen += skb->len; skb_queue_tail(&x25->fragment_queue, skb); diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 0285aaa1e93c..159708d9ad20 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -40,6 +40,7 @@ void x25_clear_queues(struct sock *sk) skb_queue_purge(&x25->interrupt_in_queue); skb_queue_purge(&x25->interrupt_out_queue); skb_queue_purge(&x25->fragment_queue); + x25->fraglen = 0; } -- cgit v1.2.3 From faeea8bbf6e958bf3c00cb08263109661975987c Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Mon, 30 Mar 2026 22:02:15 -0700 Subject: net/sched: cls_fw: fix NULL pointer dereference on shared blocks The old-method path in fw_classify() calls tcf_block_q() and dereferences q->handle. Shared blocks leave block->q NULL, causing a NULL deref when an empty cls_fw filter is attached to a shared block and a packet with a nonzero major skb mark is classified. Reject the configuration in fw_change() when the old method (no TCA_OPTIONS) is used on a shared block, since fw_classify()'s old-method path needs block->q which is NULL for shared blocks. 
The fixed null-ptr-deref calling stack: KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f] RIP: 0010:fw_classify (net/sched/cls_fw.c:81) Call Trace: tcf_classify (./include/net/tc_wrapper.h:197 net/sched/cls_api.c:1764 net/sched/cls_api.c:1860) tc_run (net/core/dev.c:4401) __dev_queue_xmit (net/core/dev.c:4535 net/core/dev.c:4790) Fixes: 1abf272022cf ("net: sched: tcindex, fw, flow: use tcf_block_q helper to get struct Qdisc") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260331050217.504278-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/sched/cls_fw.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index be81c108179d..23884ef8b80c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -247,8 +247,18 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_FW_MAX + 1]; int err; - if (!opt) - return handle ? -EINVAL : 0; /* Succeed if it is old method. */ + if (!opt) { + if (handle) + return -EINVAL; + + if (tcf_block_shared(tp->chain->block)) { + NL_SET_ERR_MSG(extack, + "Must specify mark when attaching fw filter to block"); + return -EINVAL; + } + + return 0; /* Succeed if it is old method. */ + } err = nla_parse_nested_deprecated(tb, TCA_FW_MAX, opt, fw_policy, NULL); -- cgit v1.2.3 From 1a280dd4bd1d616a01d6ffe0de284c907b555504 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Mon, 30 Mar 2026 22:02:16 -0700 Subject: net/sched: cls_flow: fix NULL pointer dereference on shared blocks flow_change() calls tcf_block_q() and dereferences q->handle to derive a default baseclass. Shared blocks leave block->q NULL, causing a NULL deref when a flow filter without a fully qualified baseclass is created on a shared block. Check tcf_block_shared() before accessing block->q and return -EINVAL for shared blocks. 
This avoids the null-deref shown below: ======================================================================= KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f] RIP: 0010:flow_change (net/sched/cls_flow.c:508) Call Trace: tc_new_tfilter (net/sched/cls_api.c:2432) rtnetlink_rcv_msg (net/core/rtnetlink.c:6980) [...] ======================================================================= Fixes: 1abf272022cf ("net: sched: tcindex, fw, flow: use tcf_block_q helper to get struct Qdisc") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260331050217.504278-2-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/sched/cls_flow.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 339c664beff6..ab364e4e4686 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -503,8 +503,16 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, } if (TC_H_MAJ(baseclass) == 0) { - struct Qdisc *q = tcf_block_q(tp->chain->block); + struct tcf_block *block = tp->chain->block; + struct Qdisc *q; + if (tcf_block_shared(block)) { + NL_SET_ERR_MSG(extack, + "Must specify baseclass when attaching flow filter to block"); + goto err2; + } + + q = tcf_block_q(block); baseclass = TC_H_MAKE(q->handle, baseclass); } if (TC_H_MIN(baseclass) == 0) -- cgit v1.2.3 From b18c833888742ca9de80c250f9d40d0e97caa9f6 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 1 Apr 2026 11:21:53 +0200 Subject: vsock: initialize child_ns_mode_locked in vsock_net_init() The `child_ns_mode_locked` field lives in `struct net`, which persists across vsock module reloads. When the module is unloaded and reloaded, `vsock_net_init()` resets `mode` and `child_ns_mode` back to their default values, but does not reset `child_ns_mode_locked`. 
The stale lock from the previous module load causes subsequent writes to `child_ns_mode` to silently fail: `vsock_net_set_child_mode()` sees the old lock, skips updating the actual value, and returns success when the requested mode matches the stale lock. The sysctl handler reports no error, but `child_ns_mode` remains unchanged. Steps to reproduce: $ modprobe vsock $ echo local > /proc/sys/net/vsock/child_ns_mode $ cat /proc/sys/net/vsock/child_ns_mode local $ modprobe -r vsock $ modprobe vsock $ echo local > /proc/sys/net/vsock/child_ns_mode $ cat /proc/sys/net/vsock/child_ns_mode global <--- expected "local" Fix this by initializing `child_ns_mode_locked` to 0 (unlocked) in `vsock_net_init()`, so the write-once mechanism works correctly after module reload. Fixes: 102eab95f025 ("vsock: lock down child_ns_mode as write-once") Reported-by: Jin Liu Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Link: https://patch.msgid.link/20260401092153.28462-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2f7d94d682cb..d912ed2f012a 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -2928,6 +2928,7 @@ static void vsock_net_init(struct net *net) net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); net->vsock.child_ns_mode = net->vsock.mode; + net->vsock.child_ns_mode_locked = 0; } static __net_init int vsock_sysctl_init_net(struct net *net) -- cgit v1.2.3 From f5df2990c364d1ac596d24b3118dbc56503f7cd4 Mon Sep 17 00:00:00 2001 From: Luka Gejak Date: Wed, 1 Apr 2026 11:22:42 +0200 Subject: net: hsr: serialize seq_blocks merge across nodes During node merging, hsr_handle_sup_frame() walks node_curr->seq_blocks to update node_real without holding node_curr->seq_out_lock. 
This allows concurrent mutations from duplicate registration paths, risking inconsistent state or XArray/bitmap corruption. Fix this by locking both nodes' seq_out_lock during the merge. To prevent ABBA deadlocks, locks are acquired in order of memory address. Reviewed-by: Felix Maurer Fixes: 415e6367512b ("hsr: Implement more robust duplicate discard for PRP") Signed-off-by: Luka Gejak Link: https://patch.msgid.link/20260401092243.52121-2-luka.gejak@linux.dev Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 50996f4de7f9..d41863593674 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -123,6 +123,40 @@ static void hsr_free_node_rcu(struct rcu_head *rn) hsr_free_node(node); } +static void hsr_lock_seq_out_pair(struct hsr_node *node_a, + struct hsr_node *node_b) +{ + if (node_a == node_b) { + spin_lock_bh(&node_a->seq_out_lock); + return; + } + + if (node_a < node_b) { + spin_lock_bh(&node_a->seq_out_lock); + spin_lock_nested(&node_b->seq_out_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_bh(&node_b->seq_out_lock); + spin_lock_nested(&node_a->seq_out_lock, SINGLE_DEPTH_NESTING); + } +} + +static void hsr_unlock_seq_out_pair(struct hsr_node *node_a, + struct hsr_node *node_b) +{ + if (node_a == node_b) { + spin_unlock_bh(&node_a->seq_out_lock); + return; + } + + if (node_a < node_b) { + spin_unlock(&node_b->seq_out_lock); + spin_unlock_bh(&node_a->seq_out_lock); + } else { + spin_unlock(&node_a->seq_out_lock); + spin_unlock_bh(&node_b->seq_out_lock); + } +} + void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; @@ -432,7 +466,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); - spin_lock_bh(&node_real->seq_out_lock); + hsr_lock_seq_out_pair(node_real, node_curr); for (i = 0; 
i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { @@ -455,7 +489,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) src_blk->seq_nrs[i], HSR_SEQ_BLOCK_SIZE); } } - spin_unlock_bh(&node_real->seq_out_lock); + hsr_unlock_seq_out_pair(node_real, node_curr); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); -- cgit v1.2.3 From 2e3514e63bfb0e972b1f19668547a455d0129e88 Mon Sep 17 00:00:00 2001 From: Luka Gejak Date: Wed, 1 Apr 2026 11:22:43 +0200 Subject: net: hsr: fix VLAN add unwind on slave errors When vlan_vid_add() fails for a secondary slave, the error path calls vlan_vid_del() on the failing port instead of the peer slave that had already succeeded. This results in asymmetric VLAN state across the HSR pair. Fix this by switching to a centralized unwind path that removes the VID from any slave device that was already programmed. Fixes: 1a8a63a5305e ("net: hsr: Add VLAN CTAG filter support") Signed-off-by: Luka Gejak Link: https://patch.msgid.link/20260401092243.52121-3-luka.gejak@linux.dev Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index d1bfc49b5f01..fd2fea25eff0 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -532,8 +532,8 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { - bool is_slave_a_added = false; - bool is_slave_b_added = false; + struct net_device *slave_a_dev = NULL; + struct net_device *slave_b_dev = NULL; struct hsr_port *port; struct hsr_priv *hsr; int ret = 0; @@ -549,33 +549,35 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, switch (port->type) { case HSR_PT_SLAVE_A: if (ret) { - /* clean up Slave-B */ netdev_err(dev, "add vid failed for 
Slave-A\n"); - if (is_slave_b_added) - vlan_vid_del(port->dev, proto, vid); - return ret; + goto unwind; } - - is_slave_a_added = true; + slave_a_dev = port->dev; break; - case HSR_PT_SLAVE_B: if (ret) { - /* clean up Slave-A */ netdev_err(dev, "add vid failed for Slave-B\n"); - if (is_slave_a_added) - vlan_vid_del(port->dev, proto, vid); - return ret; + goto unwind; } - - is_slave_b_added = true; + slave_b_dev = port->dev; break; default: + if (ret) + goto unwind; break; } } return 0; + +unwind: + if (slave_a_dev) + vlan_vid_del(slave_a_dev, proto, vid); + + if (slave_b_dev) + vlan_vid_del(slave_b_dev, proto, vid); + + return ret; } static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, -- cgit v1.2.3 From 4e453375561fc60820e6b9d8ebeb6b3ee177d42e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Apr 2026 15:47:21 +0000 Subject: ipv6: avoid overflows in ip6_datagram_send_ctl() Yiming Qian reported : I believe I found a locally triggerable kernel bug in the IPv6 sendmsg ancillary-data path that can panic the kernel via `skb_under_panic()` (local DoS). The core issue is a mismatch between: - a 16-bit length accumulator (`struct ipv6_txoptions::opt_flen`, type `__u16`) and - a pointer to the *last* provided destination-options header (`opt->dst1opt`) when multiple `IPV6_DSTOPTS` control messages (cmsgs) are provided. - `include/net/ipv6.h`: - `struct ipv6_txoptions::opt_flen` is `__u16` (wrap possible). (lines 291-307, especially 298) - `net/ipv6/datagram.c:ip6_datagram_send_ctl()`: - Accepts repeated `IPV6_DSTOPTS` and accumulates into `opt_flen` without rejecting duplicates. (lines 909-933) - `net/ipv6/ip6_output.c:__ip6_append_data()`: - Uses `opt->opt_flen + opt->opt_nflen` to compute header sizes/headroom decisions. (lines 1448-1466, especially 1463-1465) - `net/ipv6/ip6_output.c:__ip6_make_skb()`: - Calls `ipv6_push_frag_opts()` if `opt->opt_flen` is non-zero. 
(lines 1930-1934) - `net/ipv6/exthdrs.c:ipv6_push_frag_opts()` / `ipv6_push_exthdr()`: - Push size comes from `ipv6_optlen(opt->dst1opt)` (based on the pointed-to header). (lines 1179-1185 and 1206-1211) 1. `opt_flen` is a 16-bit accumulator: - `include/net/ipv6.h:298` defines `__u16 opt_flen; /* after fragment hdr */`. 2. `ip6_datagram_send_ctl()` accepts *repeated* `IPV6_DSTOPTS` cmsgs and increments `opt_flen` each time: - In `net/ipv6/datagram.c:909-933`, for `IPV6_DSTOPTS`: - It computes `len = ((hdr->hdrlen + 1) << 3);` - It checks `CAP_NET_RAW` using `ns_capable(net->user_ns, CAP_NET_RAW)`. (line 922) - Then it does: - `opt->opt_flen += len;` (line 927) - `opt->dst1opt = hdr;` (line 928) There is no duplicate rejection here (unlike the legacy `IPV6_2292DSTOPTS` path which rejects duplicates at `net/ipv6/datagram.c:901-904`). If enough large `IPV6_DSTOPTS` cmsgs are provided, `opt_flen` wraps while `dst1opt` still points to a large (2048-byte) destination-options header. In the attached PoC (`poc.c`): - 32 cmsgs with `hdrlen=255` => `len = (255+1)*8 = 2048` - 1 cmsg with `hdrlen=0` => `len = 8` - Total increment: `32*2048 + 8 = 65544`, so `(__u16)opt_flen == 8` - The last cmsg is 2048 bytes, so `dst1opt` points to a 2048-byte header. 3. The transmit path sizes headers using the wrapped `opt_flen`: - In `net/ipv6/ip6_output.c:1463-1465`: - `headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + ...;` With wrapped `opt_flen`, `headersize`/headroom decisions underestimate what will be pushed later. 4. When building the final skb, the actual push length comes from `dst1opt` and is not limited by wrapped `opt_flen`: - In `net/ipv6/ip6_output.c:1930-1934`: - `if (opt->opt_flen) proto = ipv6_push_frag_opts(skb, opt, proto);` - In `net/ipv6/exthdrs.c:1206-1211`, `ipv6_push_frag_opts()` pushes `dst1opt` via `ipv6_push_exthdr()`. 
- In `net/ipv6/exthdrs.c:1179-1184`, `ipv6_push_exthdr()` does: - `skb_push(skb, ipv6_optlen(opt));` - `memcpy(h, opt, ipv6_optlen(opt));` With insufficient headroom, `skb_push()` underflows and triggers `skb_under_panic()` -> `BUG()`: - `net/core/skbuff.c:2669-2675` (`skb_push()` calls `skb_under_panic()`) - `net/core/skbuff.c:207-214` (`skb_panic()` ends in `BUG()`) - The `IPV6_DSTOPTS` cmsg path requires `CAP_NET_RAW` in the target netns user namespace (`ns_capable(net->user_ns, CAP_NET_RAW)`). - Root (or any task with `CAP_NET_RAW`) can trigger this without user namespaces. - An unprivileged `uid=1000` user can trigger this if unprivileged user namespaces are enabled and it can create a userns+netns to obtain namespaced `CAP_NET_RAW` (the attached PoC does this). - Local denial of service: kernel BUG/panic (system crash). - Reproducible with a small userspace PoC. This patch does not reject duplicated options, as this might break some user applications. Instead, it makes sure to adjust opt_flen and opt_nflen to correctly reflect the size of the current option headers, preventing the overflows and the potential for panics. This applies to IPV6_DSTOPTS, IPV6_HOPOPTS, and IPV6_RTHDR. Specifically: When a new IPV6_DSTOPTS is processed, the length of the old opt->dst1opt is subtracted from opt->opt_flen before adding the new length. When a new IPV6_HOPOPTS is processed, the length of the old opt->dst0opt is subtracted from opt->opt_nflen. When a new Routing Header (IPV6_RTHDR or IPV6_2292RTHDR) is processed, the length of the old opt->srcrt is subtracted from opt->opt_nflen. In the special case within IPV6_2292RTHDR handling where dst1opt is moved to dst0opt, the length of the old opt->dst0opt is subtracted from opt->opt_nflen before the new one is added. 
Fixes: 333fad5364d6 ("[IPV6]: Support several new sockopt / ancillary data in Advanced API (RFC3542).") Reported-by: Yiming Qian Closes: https://lore.kernel.org/netdev/CAL_bE8JNzawgr5OX5m+3jnQDHry2XxhQT5=jThW1zDPtUikRYA@mail.gmail.com/ Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260401154721.3740056-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/datagram.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c564b68a0562..993e2d76fc1f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -763,6 +763,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; + struct ipv6_rt_hdr *orthdr; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; @@ -924,9 +925,13 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { + if (opt->dst1opt) + opt->opt_flen -= ipv6_optlen(opt->dst1opt); opt->opt_flen += len; opt->dst1opt = hdr; } else { + if (opt->dst0opt) + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->opt_nflen += len; opt->dst0opt = hdr; } @@ -969,12 +974,17 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } + orthdr = opt->srcrt; + if (orthdr) + opt->opt_nflen -= ((orthdr->hdrlen + 1) << 3); opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + if (opt->dst0opt) + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; -- cgit v1.2.3