From 207b3ebacb6113acaaec0d171d5307032c690004 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Nov 2025 17:17:06 +0100 Subject: netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation Ulrich reports a regression with nfqueue: If an application did not set the 'F_GSO' capability flag and a gso packet with an unconfirmed nf_conn entry is received all packets are now dropped instead of queued, because the check happens after skb_gso_segment(). In that case, we did have exclusive ownership of the skb and its associated conntrack entry. The elevated use count is due to skb_clone happening via skb_gso_segment(). Move the check so that its peformed vs. the aggregated packet. Then, annotate the individual segments except the first one so we can do a 2nd check at reinject time. For the normal case, where userspace does in-order reinjects, this avoids packet drops: first reinjected segment continues traversal and confirms entry, remaining segments observe the confirmed entry. While at it, simplify nf_ct_drop_unconfirmed(): We only care about unconfirmed entries with a refcnt > 1, there is no need to special-case dying entries. This only happens with UDP. With TCP, the only unconfirmed packet will be the TCP SYN, those aren't aggregated by GRO. Next patch adds a udpgro test case to cover this scenario. Reported-by: Ulrich Weber Fixes: 7d8dc1c7be8d ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks") Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index e6803831d6af..45eb26b2e95b 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -21,6 +21,7 @@ struct nf_queue_entry { struct net_device *physout; #endif struct nf_hook_state state; + bool nf_ct_is_unconfirmed; u16 size; /* sizeof(entry) + saved route keys */ u16 queue_num; -- cgit v1.2.3 From 1e13f27e0675552161ab1778be9a23a636dde8a7 Mon Sep 17 00:00:00 2001 From: Anders Grahn Date: Tue, 3 Feb 2026 14:48:30 +0100 Subject: netfilter: nft_counter: fix reset of counters on 32bit archs nft_counter_reset() calls u64_stats_add() with a negative value to reset the counter. This will work on 64bit archs, hence the negative value added will wrap as a 64bit value which then can wrap the stat counter as well. On 32bit archs, the added negative value will wrap as a 32bit value and _not_ wrapping the stat counter properly. In most cases, this would just lead to a very large 32bit value being added to the stat counter. Fix by introducing u64_stats_sub(). Fixes: 4a1d3acd6ea8 ("netfilter: nft_counter: Use u64_stats_t for statistic.") Signed-off-by: Anders Grahn Signed-off-by: Florian Westphal --- include/linux/u64_stats_sync.h | 10 ++++++++++ net/netfilter/nft_counter.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 849ff6e159c6..f47cf4d78ad7 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -97,6 +97,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val) local64_add(val, &p->v); } +static inline void u64_stats_sub(u64_stats_t *p, s64 val) +{ + local64_sub(val, &p->v); +} + static inline void u64_stats_inc(u64_stats_t *p) { local64_inc(&p->v); @@ -145,6 +150,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val) p->v += val; } +static inline void u64_stats_sub(u64_stats_t *p, s64 val) +{ + p->v -= val; +} + static inline void u64_stats_inc(u64_stats_t *p) { p->v++; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index cc7325329496..0d70325280cc 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -117,8 +117,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv, nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); - u64_stats_add(&this_cpu->packets, -total->packets); - u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_sub(&this_cpu->packets, total->packets); + u64_stats_sub(&this_cpu->bytes, total->bytes); u64_stats_update_end(nft_sync); local_bh_enable(); -- cgit v1.2.3 From 648946966a08e4cb1a71619e3d1b12bd7642de7b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Feb 2026 13:33:46 +0100 Subject: netfilter: nft_set_rbtree: validate open interval overlap Open intervals do not have an end element, in particular an open interval at the end of the set is hard to validate because of it is lacking the end element, and interval validation relies on such end element to perform the checks. This patch adds a new flag field to struct nft_set_elem, this is not an issue because this is a temporary object that is allocated in the stack from the insert/deactivate path. This flag field is used to specify that this is the last element in this add/delete command. The last flag is used, in combination with the start element cookie, to check if there is a partial overlap, eg. Already exists: 255.255.255.0-255.255.255.254 Add interval: 255.255.255.0-255.255.255.255 ~~~~~~~~~~~~~ start element overlap Basically, the idea is to check for an existing end element in the set if there is an overlap with an existing start element. However, the last open interval can come in any position in the add command, the corner case can get a bit more complicated: Already exists: 255.255.255.0-255.255.255.254 Add intervals: 255.255.255.0-255.255.255.255,255.255.255.0-255.255.255.254 ~~~~~~~~~~~~~ start element overlap To catch this overlap, annotate that the new start element is a possible overlap, then report the overlap if the next element is another start element that confirms that previous element in an open interval at the end of the set. For deletions, do not update the start cookie when deleting an open interval, otherwise this can trigger spurious EEXIST when adding new elements. Unfortunately, there is no NFT_SET_ELEM_INTERVAL_OPEN flag which would make easier to detect open interval overlaps. Fixes: 7c84d41416d8 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 4 +++ net/netfilter/nf_tables_api.c | 21 +++++++++--- net/netfilter/nft_set_rbtree.c | 71 +++++++++++++++++++++++++++++++++------ 3 files changed, 82 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 31906f90706e..426534a711b0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -277,6 +277,8 @@ struct nft_userdata { unsigned char data[]; }; +#define NFT_SET_ELEM_INTERNAL_LAST 0x1 + /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; @@ -286,6 +288,7 @@ struct nft_elem_priv { }; * @key: element key * @key_end: closing element key * @data: element data + * @flags: flags * @priv: element private data and extensions */ struct nft_set_elem { @@ -301,6 +304,7 @@ struct nft_set_elem { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; + u32 flags; struct nft_elem_priv *priv; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d7773c9bbcff..1ed034a47bd0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7270,7 +7270,8 @@ static u32 nft_set_maxsize(const struct nft_set *set) } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr, u32 nlmsg_flags) + const struct nlattr *attr, u32 nlmsg_flags, + bool last) { struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -7556,6 +7557,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (last) + elem.flags = NFT_SET_ELEM_INTERNAL_LAST; + else + elem.flags = 0; + if (obj) *nft_set_ext_obj(ext) = obj; @@ -7719,7 +7725,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); + err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags, + nla_is_last(attr, rem)); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; @@ -7843,7 +7850,7 @@ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool last) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; @@ -7911,6 +7918,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (last) + elem.flags = NFT_SET_ELEM_INTERNAL_LAST; + else + elem.flags = 0; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) goto fail_trans; @@ -8058,7 +8070,8 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return nft_set_flush(&ctx, set, genmask); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_del_setelem(&ctx, set, attr); + err = nft_del_setelem(&ctx, set, attr, + nla_is_last(attr, rem)); if (err == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) continue; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index a4fb5b517d9d..644d4b916705 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -304,10 +304,19 @@ static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv, priv->start_rbe_cookie = (unsigned long)rbe; } +static void nft_rbtree_set_start_cookie_open(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe, + unsigned long open_interval) +{ + priv->start_rbe_cookie = (unsigned long)rbe | open_interval; +} + +#define NFT_RBTREE_OPEN_INTERVAL 1UL + static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv, const struct nft_rbtree_elem *rbe) { - return priv->start_rbe_cookie == (unsigned long)rbe; + return (priv->start_rbe_cookie & ~NFT_RBTREE_OPEN_INTERVAL) == (unsigned long)rbe; } static bool nft_rbtree_insert_same_interval(const struct net *net, @@ -337,13 +346,14 @@ static bool nft_rbtree_insert_same_interval(const struct net *net, static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_elem_priv **elem_priv, u64 tstamp) + struct nft_elem_priv **elem_priv, u64 tstamp, bool last) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); + unsigned long open_interval = 0; int d; /* Descend the tree to search for an existing element greater than the @@ -449,10 +459,18 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } - if (nft_rbtree_interval_null(set, new)) - priv->start_rbe_cookie = 0; - else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) + if (nft_rbtree_interval_null(set, new)) { priv->start_rbe_cookie = 0; + } else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) { + if (nft_set_is_anonymous(set)) { + priv->start_rbe_cookie = 0; + } else if (priv->start_rbe_cookie & NFT_RBTREE_OPEN_INTERVAL) { + /* Previous element is an open interval that partially + * overlaps with an existing non-open interval. + */ + return -ENOTEMPTY; + } + } /* - new start element matching existing start element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. @@ -460,7 +478,27 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { *elem_priv = &rbe_ge->priv; - nft_rbtree_set_start_cookie(priv, rbe_ge); + + /* - Corner case: new start element of open interval (which + * comes as last element in the batch) overlaps the start of + * an existing interval with an end element: partial overlap. + */ + node = rb_first(&priv->root); + rbe = __nft_rbtree_next_active(node, genmask); + if (rbe && nft_rbtree_interval_end(rbe)) { + rbe = nft_rbtree_next_active(rbe, genmask); + if (rbe && + nft_rbtree_interval_start(rbe) && + !nft_rbtree_cmp(set, new, rbe)) { + if (last) + return -ENOTEMPTY; + + /* Maybe open interval? */ + open_interval = NFT_RBTREE_OPEN_INTERVAL; + } + } + nft_rbtree_set_start_cookie_open(priv, rbe_ge, open_interval); + return -EEXIST; } @@ -515,6 +553,12 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - start element overlaps an open interval but end element is new: + * partial overlap, reported as -ENOEMPTY. + */ + if (!rbe_ge && priv->start_rbe_cookie && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + /* Accepted element: pick insertion point depending on key value */ parent = NULL; p = &priv->root.rb_node; @@ -624,6 +668,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); + bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST); struct nft_rbtree *priv = nft_set_priv(set); u64 tstamp = nft_net_tstamp(net); int err; @@ -640,8 +685,12 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, cond_resched(); write_lock_bh(&priv->lock); - err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp); + err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp, last); write_unlock_bh(&priv->lock); + + if (nft_rbtree_interval_end(rbe)) + priv->start_rbe_cookie = 0; + } while (err == -EAGAIN); return err; @@ -729,6 +778,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); + bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST); struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); @@ -769,9 +819,10 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, continue; } - if (nft_rbtree_interval_start(rbe)) - nft_rbtree_set_start_cookie(priv, rbe); - else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe)) + if (nft_rbtree_interval_start(rbe)) { + if (!last) + nft_rbtree_set_start_cookie(priv, rbe); + } else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe)) return NULL; nft_rbtree_flush(net, set, &rbe->priv); -- cgit v1.2.3