From 77facb35227c421467cdb49268de433168c2dcef Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Thu, 2 Apr 2026 17:23:16 -0500 Subject: net: increase IP_TUNNEL_RECURSION_LIMIT to 5 In configurations with multiple tunnel layers and MPLS lwtunnel routing, a single tunnel hop can increment the counter beyond this limit. This causes packets to be dropped with the "Dead loop on virtual device" message even when a routing loop doesn't exist. Increase IP_TUNNEL_RECURSION_LIMIT from 4 to 5 to handle this use-case. Fixes: 6f1a9140ecda ("net: add xmit recursion limit to tunnel xmit functions") Link: https://lore.kernel.org/netdev/88deb91b-ef1b-403c-8eeb-0f971f27e34f@redhat.com/ Signed-off-by: Chris J Arges Link: https://patch.msgid.link/20260402222401.3408368-1-carges@cloudflare.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 1f577a4f8ce9..d708b66e55cd 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -32,7 +32,7 @@ * recursion involves route lookups and full IP output, consuming much * more stack per level, so a lower limit is needed. */ -#define IP_TUNNEL_RECURSION_LIMIT 4 +#define IP_TUNNEL_RECURSION_LIMIT 5 /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) -- cgit v1.2.3 From 1ee1605138fc94cc8f8f273321dd2471c64977f9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:52 +0200 Subject: xsk: respect tailroom for ZC setups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-buffer XDP stores information about frags in skb_shared_info that sits at the tailroom of a packet. The storage space is reserved via xdp_data_hard_end(): ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) and then we refer to it via macro below: static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } Currently we do not respect this tailroom space in multi-buffer AF_XDP ZC scenario. To address this, introduce xsk_pool_get_tailroom() and use it within xsk_pool_get_rx_frame_size() which is used in ZC drivers to configure length of HW Rx buffer. Typically drivers on Rx Hw buffers side work on 128 byte alignment so let us align the value returned by xsk_pool_get_rx_frame_size() in order to avoid addressing this on driver's side. This addresses the fact that idpf uses mentioned function *before* pool->dev being set so we were at risk that after subtracting tailroom we would not provide 128-byte aligned value to HW. Since xsk_pool_get_rx_frame_size() is actively used in xsk_rcv_check() and __xsk_rcv(), add a variant of this routine that will not include 128 byte alignment and therefore old behavior is preserved. Reviewed-by: Björn Töpel Acked-by: Stanislav Fomichev Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-3-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 23 ++++++++++++++++++++++- net/xdp/xsk.c | 4 ++-- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 6b9ebae2dc95..46797645a0c2 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -41,16 +41,37 @@ static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) return XDP_PACKET_HEADROOM + pool->headroom; } +static inline u32 xsk_pool_get_tailroom(bool mbuf) +{ + return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0; +} + static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } -static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +{ + u32 frame_size = __xsk_pool_get_rx_frame_size(pool); + struct xdp_umem *umem = pool->umem; + bool mbuf; + + /* Reserve tailroom only for zero-copy pools that opted into + * multi-buffer. The reserved area is used for skb_shared_info, + * matching the XDP core's xdp_data_hard_end() layout. + */ + mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG); + frame_size -= xsk_pool_get_tailroom(mbuf); + + return ALIGN_DOWN(frame_size, 128); +} + static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) { return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6149f6a79897..c8ef9e427c9c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -239,7 +239,7 @@ static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; @@ -338,7 +338,7 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } -- cgit v1.2.3 From 93e84fe45b752d17a5a46b306ed78f0133bbc719 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:53 +0200 Subject: xsk: fix XDP_UMEM_SG_FLAG issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently xp_assign_dev_shared() is missing XDP_USE_SG being propagated to flags so set it in order to preserve mtu check that is supposed to be done only when no multi-buffer setup is in picture. Also, this flag has the same value as XDP_UMEM_TX_SW_CSUM so we could get unexpected SG setups for software Tx checksums. Since csum flag is UAPI, modify value of XDP_UMEM_SG_FLAG. Fixes: d609f3d228a8 ("xsk: add multi-buffer support for sockets sharing umem") Reviewed-by: Björn Töpel Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-4-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock.h | 2 +- net/xdp/xsk_buff_pool.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 23e8861e8b25..ebac60a3d8a1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -14,7 +14,7 @@ #include #include -#define XDP_UMEM_SG_FLAG (1 << 1) +#define XDP_UMEM_SG_FLAG BIT(3) struct net_device; struct xsk_queue; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 37b7a68b89b3..729602a3cec0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -247,6 +247,10 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct xdp_umem *umem = umem_xs->umem; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + + if (umem->flags & XDP_UMEM_SG_FLAG) + flags |= XDP_USE_SG; + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; -- cgit v1.2.3 From f8dca15a1b190787bbd03285304b569631160eda Mon Sep 17 00:00:00 2001 From: Tuan Do Date: Fri, 3 Apr 2026 00:33:17 -0700 Subject: netfilter: nft_ct: fix use-after-free in timeout object destroy nft_ct_timeout_obj_destroy() frees the timeout object with kfree() immediately after nf_ct_untimeout(), without waiting for an RCU grace period. Concurrent packet processing on other CPUs may still hold RCU-protected references to the timeout object obtained via rcu_dereference() in nf_ct_timeout_data(). Add an rcu_head to struct nf_ct_timeout and use kfree_rcu() to defer freeing until after an RCU grace period, matching the approach already used in nfnetlink_cttimeout.c. KASAN report: BUG: KASAN: slab-use-after-free in nf_conntrack_tcp_packet+0x1381/0x29d0 Read of size 4 at addr ffff8881035fe19c by task exploit/80 Call Trace: nf_conntrack_tcp_packet+0x1381/0x29d0 nf_conntrack_in+0x612/0x8b0 nf_hook_slow+0x70/0x100 __ip_local_out+0x1b2/0x210 tcp_sendmsg_locked+0x722/0x1580 __sys_sendto+0x2d8/0x320 Allocated by task 75: nft_ct_timeout_obj_init+0xf6/0x290 nft_obj_init+0x107/0x1b0 nf_tables_newobj+0x680/0x9c0 nfnetlink_rcv_batch+0xc29/0xe00 Freed by task 26: nft_obj_destroy+0x3f/0xa0 nf_tables_trans_destroy_work+0x51c/0x5c0 process_one_work+0x2c4/0x5a0 Fixes: 7e0b2b57f01d ("netfilter: nft_ct: add ct timeout support") Cc: stable@vger.kernel.org Signed-off-by: Tuan Do Signed-off-by: Florian Westphal --- include/net/netfilter/nf_conntrack_timeout.h | 1 + net/netfilter/nft_ct.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9fdaba911de6..3a66d4abb6d6 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -14,6 +14,7 @@ struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; + struct rcu_head rcu; char data[]; }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 128ff8155b5d..04c74ccf9b84 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1020,7 +1020,7 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, nf_queue_nf_hook_drop(ctx->net); nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); - kfree(priv->timeout); + kfree_rcu(priv->timeout, rcu); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, -- cgit v1.2.3 From 936206e3f6ff411581e615e930263d6f8b78df9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Apr 2026 17:00:01 +0200 Subject: netfilter: nfnetlink_queue: make hash table per queue Sharing a global hash table among all queues is tempting, but it can cause crash: BUG: KASAN: slab-use-after-free in nfqnl_recv_verdict+0x11ac/0x15e0 [nfnetlink_queue] [..] nfqnl_recv_verdict+0x11ac/0x15e0 [nfnetlink_queue] nfnetlink_rcv_msg+0x46a/0x930 kmem_cache_alloc_node_noprof+0x11e/0x450 struct nf_queue_entry is freed via kfree, but parallel cpu can still encounter such an nf_queue_entry when walking the list. Alternative fix is to free the nf_queue_entry via kfree_rcu() instead, but as we have to alloc/free for each skb this will cause more mem pressure. Cc: Scott Mitchell Fixes: e19079adcd26 ("netfilter: nfnetlink_queue: optimize verdict lookup with hash table") Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 1 - net/netfilter/nfnetlink_queue.c | 139 ++++++++++++++------------------------- 2 files changed, 49 insertions(+), 91 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 45eb26b2e95b..d17035d14d96 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -23,7 +23,6 @@ struct nf_queue_entry { struct nf_hook_state state; bool nf_ct_is_unconfirmed; u16 size; /* sizeof(entry) + saved route keys */ - u16 queue_num; /* extra space to store route keys */ }; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 47f7f62906e2..8e02f84784da 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -49,8 +49,8 @@ #endif #define NFQNL_QMAX_DEFAULT 1024 -#define NFQNL_HASH_MIN 1024 -#define NFQNL_HASH_MAX 1048576 +#define NFQNL_HASH_MIN 8 +#define NFQNL_HASH_MAX 32768 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we @@ -60,29 +60,10 @@ */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) -/* Composite key for packet lookup: (net, queue_num, packet_id) */ -struct nfqnl_packet_key { - possible_net_t net; - u32 packet_id; - u16 queue_num; -} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */ - -/* Global rhashtable - one for entire system, all netns */ -static struct rhashtable nfqnl_packet_map __read_mostly; - -/* Helper to initialize composite key */ -static inline void nfqnl_init_key(struct nfqnl_packet_key *key, - struct net *net, u32 packet_id, u16 queue_num) -{ - memset(key, 0, sizeof(*key)); - write_pnet(&key->net, net); - key->packet_id = packet_id; - key->queue_num = queue_num; -} - struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ - struct rcu_head rcu; + struct rhashtable nfqnl_packet_map; + struct rcu_work rwork; u32 peer_portid; unsigned int queue_maxlen; @@ -106,6 +87,7 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); +static struct workqueue_struct *nfq_cleanup_wq __read_mostly; static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 @@ -124,34 +106,10 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } -/* Extract composite key from nf_queue_entry for hashing */ -static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nf_queue_entry *entry = data; - struct nfqnl_packet_key key; - - nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num); - - return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed); -} - -/* Compare stack-allocated key against entry */ -static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nfqnl_packet_key *key = arg->key; - const struct nf_queue_entry *entry = obj; - - return !net_eq(entry->state.net, read_pnet(&key->net)) || - entry->queue_num != key->queue_num || - entry->id != key->packet_id; -} - static const struct rhashtable_params nfqnl_rhashtable_params = { .head_offset = offsetof(struct nf_queue_entry, hash_node), - .key_len = sizeof(struct nfqnl_packet_key), - .obj_hashfn = nfqnl_packet_obj_hashfn, - .obj_cmpfn = nfqnl_packet_obj_cmpfn, + .key_offset = offsetof(struct nf_queue_entry, id), + .key_len = sizeof(u32), .automatic_shrinking = true, .min_size = NFQNL_HASH_MIN, .max_size = NFQNL_HASH_MAX, @@ -190,6 +148,10 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); + err = rhashtable_init(&inst->nfqnl_packet_map, &nfqnl_rhashtable_params); + if (err < 0) + goto out_free; + spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; @@ -210,6 +172,8 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) out_unlock: spin_unlock(&q->instances_lock); + rhashtable_destroy(&inst->nfqnl_packet_map); +out_free: kfree(inst); return ERR_PTR(err); } @@ -217,15 +181,18 @@ out_unlock: static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); -static void -instance_destroy_rcu(struct rcu_head *head) +static void instance_destroy_work(struct work_struct *work) { - struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, - rcu); + struct nfqnl_instance *inst; + inst = container_of(to_rcu_work(work), struct nfqnl_instance, + rwork); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); + + rhashtable_destroy(&inst->nfqnl_packet_map); + kfree(inst); module_put(THIS_MODULE); } @@ -234,7 +201,9 @@ static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); - call_rcu(&inst->rcu, instance_destroy_rcu); + + INIT_RCU_WORK(&inst->rwork, instance_destroy_work); + queue_rcu_work(nfq_cleanup_wq, &inst->rwork); } static void @@ -250,9 +219,7 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { int err; - entry->queue_num = queue->queue_num; - - err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node, + err = rhashtable_insert_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); if (unlikely(err)) return err; @@ -266,23 +233,19 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node, + rhashtable_remove_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id, - struct net *net) +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { - struct nfqnl_packet_key key; struct nf_queue_entry *entry; - nfqnl_init_key(&key, net, id, queue->queue_num); - spin_lock_bh(&queue->lock); - entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key, + entry = rhashtable_lookup_fast(&queue->nfqnl_packet_map, &id, nfqnl_rhashtable_params); if (entry) @@ -1531,7 +1494,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, verdict = ntohl(vhdr->verdict); - entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net); + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; @@ -1880,40 +1843,38 @@ static int __init nfnetlink_queue_init(void) { int status; - status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params); - if (status < 0) - return status; + nfq_cleanup_wq = alloc_ordered_workqueue("nfq_workqueue", 0); + if (!nfq_cleanup_wq) + return -ENOMEM; status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_rhashtable; - } + if (status < 0) + goto cleanup_pernet_subsys; - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - pr_err("failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } + status = netlink_register_notifier(&nfqnl_rtnl_notifier); + if (status < 0) + goto cleanup_rtnl_notifier; status = register_netdevice_notifier(&nfqnl_dev_notifier); - if (status < 0) { - pr_err("failed to register netdevice notifier\n"); - goto cleanup_netlink_subsys; - } + if (status < 0) + goto cleanup_dev_notifier; + + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) + goto cleanup_nfqnl_subsys; nf_register_queue_handler(&nfqh); return status; -cleanup_netlink_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); -cleanup_netlink_notifier: +cleanup_nfqnl_subsys: + unregister_netdevice_notifier(&nfqnl_dev_notifier); +cleanup_dev_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +cleanup_rtnl_notifier: unregister_pernet_subsys(&nfnl_queue_net_ops); -cleanup_rhashtable: - rhashtable_destroy(&nfqnl_packet_map); +cleanup_pernet_subsys: + destroy_workqueue(nfq_cleanup_wq); return status; } @@ -1924,9 +1885,7 @@ static void __exit nfnetlink_queue_fini(void) nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); - - rhashtable_destroy(&nfqnl_packet_map); - + destroy_workqueue(nfq_cleanup_wq); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } -- cgit v1.2.3