From f902877b635551513729bdf9a8d1422c4aab7741 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:02 +0200 Subject: rculist: add list_splice_rcu() for private lists This patch adds a helper function, list_splice_rcu(), to safely splice a private (non-RCU-protected) list into an RCU-protected list. The function ensures that only the pointer visible to RCU readers (prev->next) is updated using rcu_assign_pointer(), while the rest of the list manipulations are performed with regular assignments, as the source list is private and not visible to concurrent RCU readers. This is useful for moving elements from a private list into a global RCU-protected list, ensuring safe publication for RCU readers. Subsystems with some sort of batching mechanism from userspace can benefit from this new function. The function __list_splice_rcu() has been added for clarity and to follow the same pattern as in the existing list_splice*() interfaces, where there is a check to ensure that the list to splice is not empty. Note that __list_splice_rcu() has no documentation for this reason. Reviewed-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2abba7552605..e3bc44225692 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old, old->prev = LIST_POISON2; } +static inline void __list_splice_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + last->next = next; + first->prev = prev; + next->prev = last; + rcu_assign_pointer(list_next_rcu(prev), first); +} + +/** + * list_splice_rcu - splice a non-RCU list into an RCU-protected list, + * designed for stacks. + * @list: the non RCU-protected list to splice + * @head: the place in the existing RCU-protected list to splice + * + * The list pointed to by @head can be RCU-read traversed concurrently with + * this function. + */ +static inline void list_splice_rcu(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_rcu(list, head, head->next); +} + /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice -- cgit v1.2.3 From 10f79dbd7719d1da9f5884d13060322d8729f091 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 22:58:23 +0200 Subject: netfilter: nf_tables: add hook transactions for device deletions Restore the flag that indicates that the hook is going away, ie. NFT_HOOK_REMOVE, but add a new transaction object to track deletion of hooks without altering the basechain/flowtable hook_list during the preparation phase. The existing approach that moves the hook from the basechain/flowtable hook_list to transaction hook_list breaks netlink dump path readers of this RCU-protected list. It should be possible use an array for nft_trans_hook to store the deleted hooks to compact the representation but I am not expecting many hook object, specially now that wildcard support for devices is in place. Note that the nft_trans_chain_hooks() list contains a list of struct nft_trans_hook objects for DELCHAIN and DELFLOWTABLE commands, while this list stores struct nft_hook objects for NEWCHAIN and NEWFLOWTABLE. Note that new commands can be updated to use nft_trans_hook for consistency. This patch also adapts the event notification path to deal with the list of hook transactions. Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Fixes: b6d9014a3335 ("netfilter: nf_tables: delete flowtable hooks via transaction list") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++ net/netfilter/nf_tables_api.c | 264 +++++++++++++++++++++++++++++--------- 2 files changed, 217 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2c0173d9309c..cff7b773e972 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1204,12 +1204,15 @@ struct nft_stats { struct u64_stats_sync syncp; }; +#define NFT_HOOK_REMOVE (1 << 0) + struct nft_hook { struct list_head list; struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; + u8 flags; }; struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, @@ -1664,6 +1667,16 @@ struct nft_trans { u8 put_net:1; }; +/** + * struct nft_trans_hook - nf_tables hook update in transaction + * @list: used internally + * @hook: struct nft_hook with the device hook + */ +struct nft_trans_hook { + struct list_head list; + struct nft_hook *hook; +}; + /** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ae10116af923..d20ce5c36d31 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -380,6 +380,32 @@ static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook) nft_netdev_hook_free_rcu(hook); } +static void nft_trans_hook_destroy(struct nft_trans_hook *trans_hook) +{ + list_del(&trans_hook->list); + kfree(trans_hook); +} + +static void nft_netdev_unregister_trans_hook(struct net *net, + const struct nft_table *table, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) @@ -1946,15 +1972,69 @@ static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) return nla_put_string(skb, attr, hook->ifname); } +struct nft_hook_dump_ctx { + struct nft_hook *first; + int n; +}; + +static int nft_dump_basechain_hook_one(struct sk_buff *skb, + struct nft_hook *hook, + struct nft_hook_dump_ctx *dump_ctx) +{ + if (!dump_ctx->first) + dump_ctx->first = hook; + + if (nft_nla_put_hook_dev(skb, hook)) + return -1; + + dump_ctx->n++; + + return 0; +} + +static int nft_dump_basechain_hook_list(struct sk_buff *skb, + const struct net *net, + const struct list_head *hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_hook *hook; + int err; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + err = nft_dump_basechain_hook_one(skb, hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + +static int nft_dump_basechain_trans_hook_list(struct sk_buff *skb, + const struct list_head *trans_hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_trans_hook *trans_hook; + int err; + + list_for_each_entry(trans_hook, trans_hook_list, list) { + err = nft_dump_basechain_hook_one(skb, trans_hook->hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { const struct nf_hook_ops *ops = &basechain->ops; - struct nft_hook *hook, *first = NULL; + struct nft_hook_dump_ctx dump_hook_ctx = {}; struct nlattr *nest, *nest_devs; - int n = 0; nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) @@ -1969,23 +2049,23 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &basechain->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (!first) - first = hook; - - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; - n++; + if (hook_list && + nft_dump_basechain_hook_list(skb, net, hook_list, &dump_hook_ctx)) { + goto nla_put_failure; + } else if (trans_hook_list && + nft_dump_basechain_trans_hook_list(skb, trans_hook_list, + &dump_hook_ctx)) { + goto nla_put_failure; } + nla_nest_end(skb, nest_devs); - if (n == 1 && - !hook_is_prefix(first) && - nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) + if (dump_hook_ctx.n == 1 && + !hook_is_prefix(dump_hook_ctx.first) && + nla_put_string(skb, NFTA_HOOK_DEV, dump_hook_ctx.first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1999,7 +2079,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nlmsghdr *nlh; @@ -2015,7 +2096,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, NFTA_CHAIN_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELCHAIN || event == NFT_MSG_DESTROYCHAIN)) { nlmsg_end(skb, nlh); @@ -2026,7 +2107,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) + if (nft_dump_basechain_hook(skb, net, family, basechain, + hook_list, trans_hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -2062,7 +2144,8 @@ nla_put_failure: } static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; @@ -2082,7 +2165,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, hook_list); + ctx->chain, hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -2128,7 +2211,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, - chain, NULL) < 0) + chain, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -2182,7 +2265,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, - 0, family, table, chain, NULL); + 0, family, table, chain, NULL, NULL); if (err < 0) goto err_fill_chain_info; @@ -2345,8 +2428,12 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, list_for_each_entry(hook, hook_list, list) { if (!strncmp(hook->ifname, this->ifname, - min(hook->ifnamelen, this->ifnamelen))) + min(hook->ifnamelen, this->ifnamelen))) { + if (hook->flags & NFT_HOOK_REMOVE) + continue; + return hook; + } } return NULL; @@ -3105,6 +3192,32 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, policy, flags, extack); } +static int nft_trans_delhook(struct nft_hook *hook, + struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook; + + trans_hook = kmalloc_obj(*trans_hook, GFP_KERNEL); + if (!trans_hook) + return -ENOMEM; + + trans_hook->hook = hook; + list_add_tail(&trans_hook->list, del_list); + hook->flags |= NFT_HOOK_REMOVE; + + return 0; +} + +static void nft_trans_delhook_abort(struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook, *next; + + list_for_each_entry_safe(trans_hook, next, del_list, list) { + trans_hook->hook->flags &= ~NFT_HOOK_REMOVE; + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) @@ -3131,7 +3244,10 @@ static int nft_delchain_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_chain_del_hook; } - list_move(&hook->list, &chain_del_list); + if (nft_trans_delhook(hook, &chain_del_list) < 0) { + err = -ENOMEM; + goto err_chain_del_hook; + } } trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); @@ -3151,7 +3267,7 @@ static int nft_delchain_hook(struct nft_ctx *ctx, return 0; err_chain_del_hook: - list_splice(&chain_del_list, &basechain->hook_list); + nft_trans_delhook_abort(&chain_del_list); nft_chain_release_hook(&chain_hook); return err; @@ -8941,6 +9057,24 @@ static void nft_hooks_destroy(struct list_head *hook_list) nft_netdev_hook_unlink_free_rcu(hook); } +static void nft_flowtable_unregister_trans_hook(struct net *net, + struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); + + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) @@ -9199,7 +9333,10 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_flowtable_del_hook; } - list_move(&hook->list, &flowtable_del_list); + if (nft_trans_delhook(hook, &flowtable_del_list) < 0) { + err = -ENOMEM; + goto err_flowtable_del_hook; + } } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, @@ -9220,7 +9357,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, return 0; err_flowtable_del_hook: - list_splice(&flowtable_del_list, &flowtable->hook_list); + nft_trans_delhook_abort(&flowtable_del_list); nft_flowtable_hook_release(&flowtable_hook); return err; @@ -9285,8 +9422,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, struct nft_flowtable *flowtable, - struct list_head *hook_list) + struct list_head *hook_list, + struct list_head *trans_hook_list) { + struct nft_trans_hook *trans_hook; struct nlattr *nest, *nest_devs; struct nft_hook *hook; struct nlmsghdr *nlh; @@ -9303,7 +9442,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, NFTA_FLOWTABLE_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELFLOWTABLE || event == NFT_MSG_DESTROYFLOWTABLE)) { nlmsg_end(skb, nlh); @@ -9325,13 +9464,20 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; + if (hook_list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + if (nft_nla_put_hook_dev(skb, hook)) + goto nla_put_failure; + } + } else if (trans_hook_list) { + list_for_each_entry(trans_hook, trans_hook_list, list) { + if (nft_nla_put_hook_dev(skb, trans_hook->hook)) + goto nla_put_failure; + } } nla_nest_end(skb, nest_devs); nla_nest_end(skb, nest); @@ -9385,7 +9531,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, - flowtable, NULL) < 0) + flowtable, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -9485,7 +9631,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable, NULL); + flowtable, NULL, NULL); if (err < 0) goto err_fill_flowtable_info; @@ -9498,7 +9644,9 @@ err_fill_flowtable_info: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, - struct list_head *hook_list, int event) + struct list_head *hook_list, + struct list_head *trans_hook_list, + int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; @@ -9518,7 +9666,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, - ctx->family, flowtable, hook_list); + ctx->family, flowtable, + hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -10052,9 +10201,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: - if (nft_trans_chain_update(trans)) - nft_hooks_destroy(&nft_trans_chain_hooks(trans)); - else + if (!nft_trans_chain_update(trans)) nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: @@ -10075,9 +10222,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: - if (nft_trans_flowtable_update(trans)) - nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); - else + if (!nft_trans_flowtable_update(trans)) nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -10837,31 +10982,28 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nft_chain_commit_update(nft_trans_container_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, - &nft_trans_chain_hooks(trans)); + &nft_trans_chain_hooks(trans), NULL); list_splice_rcu(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); nft_clear(net, nft_trans_chain(trans)); - nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL, &nft_trans_chain_hooks(trans)); - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_trans_hook(net, table, + &nft_trans_chain_hooks(trans)); } else { nft_chain_del(nft_trans_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, - NULL); + NULL, NULL); nf_tables_unregister_hook(ctx.net, ctx.table, nft_trans_chain(trans)); } @@ -10967,6 +11109,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), + NULL, NFT_MSG_NEWFLOWTABLE); list_splice_rcu(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); @@ -10975,6 +11118,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, NFT_MSG_NEWFLOWTABLE); } nft_trans_destroy(trans); @@ -10984,16 +11128,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), + NULL, &nft_trans_flowtable_hooks(trans), trans->msg_type); - nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans), - &nft_trans_flowtable_hooks(trans)); + nft_flowtable_unregister_trans_hook(net, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), @@ -11157,8 +11303,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - list_splice(&nft_trans_chain_hooks(trans), - &nft_trans_basechain(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_chain_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_chain(trans)); @@ -11272,8 +11417,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - list_splice(&nft_trans_flowtable_hooks(trans), - &nft_trans_flowtable(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_flowtable_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_flowtable(trans)); -- cgit v1.2.3 From b3b6babf47517fde6b6de2493dea28e8831b9347 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Apr 2026 05:34:54 +0000 Subject: ipmr: Free mr_table after RCU grace period. With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() does not check if net->ipv4.mrt is NULL. Since default_device_exit_batch() is called after ->exit_rtnl(), a device could receive IGMP packets and access net->ipv4.mrt during/after ipmr_rules_exit_rtnl(). If ipmr_rules_exit_rtnl() had already cleared it and freed the memory, the access would trigger null-ptr-deref or use-after-free. Let's fix it by using RCU helper and free mrt after RCU grace period. In addition, check_net(net) is added to mroute_clean_tables() and ipmr_cache_unresolved() to synchronise via mfc_unres_lock. This prevents ipmr_cache_unresolved() from putting skb into c->_c.mfc_un.unres.unresolved after mroute_clean_tables() purges it. For the same reason, timer_shutdown_sync() is moved after mroute_clean_tables(). Since rhltable_destroy() holds mutex internally, rcu_work is used, and it is placed as the first member because rcu_head must be placed within <4K offset. mr_table is alraedy 3864 bytes without rcu_work. Note that IP6MR is not yet converted to ->exit_rtnl(), so this change is not needed for now but will be. Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260423053456.4097409-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 3 ++ net/ipv4/ipmr.c | 108 ++++++++++++++++++++++++-------------------- net/ipv4/ipmr_base.c | 16 +++++++ 3 files changed, 77 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..5d75cc5b057e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -226,6 +226,7 @@ struct mr_table_ops { /** * struct mr_table - a multicast routing table + * @work: used for table destruction * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations @@ -243,6 +244,7 @@ struct mr_table_ops { * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { + struct rcu_work work; struct list_head list; possible_net_t net; struct mr_table_ops ops; @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask); +void mr_table_free(struct mr_table *mrt); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8a08d09b4c30..2058ca860294 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id) return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) -{ - struct mr_table *mrt; - - rcu_read_lock(); - mrt = __ipmr_get_table(net, id); - rcu_read_unlock(); - return mrt; -} - static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { @@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct mr_table *mrt, *next; list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { - list_del(&mrt->list); + list_del_rcu(&mrt->list); ipmr_free_table(mrt, dev_kill_list); } } @@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL(ipmr_rule_default); #else -#define ipmr_for_each_table(mrt, net) \ - for (mrt = net->ipv4.mrt; mrt; mrt = NULL) - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) - return net->ipv4.mrt; + return rcu_dereference(net->ipv4.mrt); return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) +static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { - return net->ipv4.mrt; + return rcu_dereference_check(net->ipv4.mrt, + lockdep_rtnl_is_held() || + !rcu_access_pointer(net->ipv4.mrt)); } -#define __ipmr_get_table ipmr_get_table +#define ipmr_for_each_table(mrt, net) \ + for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - *mrt = net->ipv4.mrt; + *mrt = rcu_dereference(net->ipv4.mrt); + if (!*mrt) + return -EAGAIN; return 0; } @@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net) mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); - net->ipv4.mrt = mrt; + + rcu_assign_pointer(net->ipv4.mrt, mrt); return 0; } @@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { - ipmr_free_table(net->ipv4.mrt, dev_kill_list); + struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1); - net->ipv4.mrt = NULL; + RCU_INIT_POINTER(net->ipv4.mrt, NULL); + ipmr_free_table(mrt, dev_kill_list); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule) EXPORT_SYMBOL(ipmr_rule_default); #endif +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ipmr_get_table(net, id); + rcu_read_unlock(); + + return mrt; +} + static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { @@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis WARN_ON_ONCE(!mr_can_free_table(net)); - timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, &ipmr_dev_kill_list); - rhltable_destroy(&mrt->mfc_hash); - kfree(mrt); + timer_shutdown_sync(&mrt->ipmr_expire_timer); + mr_table_free(mrt); WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); list_splice(&ipmr_dev_kill_list, dev_kill_list); @@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt, static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { + struct net *net = read_pnet(&mrt->net); const struct iphdr *iph = ip_hdr(skb); - struct mfc_cache *c; + struct mfc_cache *c = NULL; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); + + if (!check_net(net)) { + err = -EINVAL; + goto err; + } + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { @@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { - spin_unlock_bh(&mfc_unres_lock); - - kfree_skb(skb); - return -ENOBUFS; + err = -ENOBUFS; + goto err; } /* Fill in the new cache entry */ @@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); - - if (err < 0) { - /* If the report failed throw the cache entry - out - Brad Parker - */ - spin_unlock_bh(&mfc_unres_lock); - - ipmr_cache_free(c); - kfree_skb(skb); - return err; - } + if (err < 0) + goto err; atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); @@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { - kfree_skb(skb); + c = NULL; err = -ENOBUFS; - } else { - if (dev) { - skb->dev = dev; - skb->skb_iif = dev->ifindex; - } - skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); - err = 0; + goto err; + } + + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; } + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); + spin_unlock_bh(&mfc_unres_lock); + return 0; + +err: + spin_unlock_bh(&mfc_unres_lock); + if (c) + ipmr_cache_free(c); + kfree_skb(skb); return err; } @@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, } if (flags & MRT_FLUSH_MFC) { - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 37a3c144276c..3930d612c3de 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v, v->link = dev->ifindex; } +static void __mr_free_table(struct work_struct *work) +{ + struct mr_table *mrt = container_of(to_rcu_work(work), + struct mr_table, work); + + rhltable_destroy(&mrt->mfc_hash); + kfree(mrt); +} + +void mr_table_free(struct mr_table *mrt) +{ + queue_rcu_work(system_unbound_wq, &mrt->work); +} + struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, @@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id, kfree(mrt); return ERR_PTR(err); } + + INIT_RCU_WORK(&mrt->work, __mr_free_table); INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); -- cgit v1.2.3 From 735a309b4bfb9e1e26636ff4a3e8a146f53c54f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2026 19:53:20 -0700 Subject: net: add net_iov_init() and use it to initialize ->page_type Commit db359fccf212 ("mm: introduce a new page type for page pool in page type") added a page_type field to struct net_iov at the same offset as struct page::page_type, so that page_pool_set_pp_info() can call __SetPageNetpp() uniformly on both pages and net_iovs. The page-type API requires the field to hold the UINT_MAX "no type" sentinel before a type can be set; for real struct page that invariant is established by the page allocator on free. struct net_iov is not allocated through the page allocator, so the field is left as zero (io_uring zcrx, which uses __GFP_ZERO) or as slab garbage (devmem, which uses kvmalloc_objs() without zeroing). When the page pool then calls page_pool_set_pp_info() on a freshly-bound niov, __SetPageNetpp()'s VM_BUG_ON_PAGE(page->page_type != UINT_MAX) fires and the kernel BUGs. Triggered in selftests by io_uring zcrx setup through the fbnic queue restart path: kernel BUG at ./include/linux/page-flags.h:1062! RIP: 0010:page_pool_set_pp_info (./include/linux/page-flags.h:1062 net/core/page_pool.c:716) Call Trace: net_mp_niov_set_page_pool (net/core/page_pool.c:1360) io_pp_zc_alloc_netmems (io_uring/zcrx.c:1089 io_uring/zcrx.c:1110) fbnic_fill_bdq (./include/net/page_pool/helpers.h:160 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:906) __fbnic_nv_restart (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2470 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2874) fbnic_queue_start (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2903) netdev_rx_queue_reconfig (net/core/netdev_rx_queue.c:137) __netif_mp_open_rxq (net/core/netdev_rx_queue.c:234) io_register_zcrx (io_uring/zcrx.c:818 io_uring/zcrx.c:903) __io_uring_register (io_uring/register.c:931) __do_sys_io_uring_register (io_uring/register.c:1029) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) The same path is reachable through devmem dmabuf binding via netdev_nl_bind_rx_doit() -> net_devmem_bind_dmabuf_to_queue(). Add a net_iov_init() helper that stamps ->owner, ->type and the ->page_type sentinel, and use it from both the devmem and io_uring zcrx niov init loops. Fixes: db359fccf212 ("mm: introduce a new page type for page pool in page type") Acked-by: Vlastimil Babka (SUSE) Acked-by: Byungchul Park Reviewed-by: Jens Axboe Acked-by: Pavel Begunkov Link: https://patch.msgid.link/20260428025320.853452-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 15 +++++++++++++++ io_uring/zcrx.c | 3 +-- net/core/devmem.c | 3 +-- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index 507b74c9f52d..78fe51e5756b 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -127,6 +127,21 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) return niov - net_iov_owner(niov)->niovs; } +/* Initialize a niov: stamp the owning area, the memory provider type, + * and the page_type "no type" sentinel expected by the page-type API + * (see PAGE_TYPE_OPS in ) so that + * page_pool_set_pp_info() can later call __SetPageNetpp() on a niov + * cast to struct page. + */ +static inline void net_iov_init(struct net_iov *niov, + struct net_iov_area *owner, + enum net_iov_type type) +{ + niov->owner = owner; + niov->type = type; + niov->page_type = UINT_MAX; +} + /* netmem */ /** diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 7b93c87b8371..19837e0b5e91 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -495,10 +495,9 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, for (i = 0; i < nr_iovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; - niov->owner = &area->nia; + net_iov_init(niov, &area->nia, NET_IOV_IOURING); area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); - niov->type = NET_IOV_IOURING; } if (ifq->dev) { diff --git a/net/core/devmem.c b/net/core/devmem.c index cde4c89bc146..468344739db2 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -297,8 +297,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; - niov->type = NET_IOV_DMABUF; - niov->owner = &owner->area; + net_iov_init(niov, &owner->area, NET_IOV_DMABUF); page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE) -- cgit v1.2.3 From c4f050ce06c56cfb5993268af4a5cb66ed1cd04e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2026 12:32:07 +0000 Subject: bonding: 3ad: implement proper RCU rules for port->aggregator syzbot found a data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler [1] which hints at lack of proper RCU implementation. Add __rcu qualifier to port->aggregator, and add proper RCU API. [1] BUG: KCSAN: data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler write to 0xffff88813cf5c4b0 of 8 bytes by task 36 on cpu 0: ad_port_selection_logic drivers/net/bonding/bond_3ad.c:1659 [inline] bond_3ad_state_machine_handler+0x9d5/0x2d60 drivers/net/bonding/bond_3ad.c:2569 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0x4f0/0x9c0 kernel/workqueue.c:3385 worker_thread+0x58a/0x780 kernel/workqueue.c:3466 kthread+0x22a/0x280 kernel/kthread.c:436 ret_from_fork+0x146/0x330 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 read to 0xffff88813cf5c4b0 of 8 bytes by task 22063 on cpu 1: __bond_3ad_get_active_agg_info drivers/net/bonding/bond_3ad.c:2858 [inline] bond_3ad_get_active_agg_info+0x8c/0x230 drivers/net/bonding/bond_3ad.c:2881 bond_fill_info+0xe0f/0x10f0 drivers/net/bonding/bond_netlink.c:853 rtnl_link_info_fill net/core/rtnetlink.c:906 [inline] rtnl_link_fill+0x1d7/0x4e0 net/core/rtnetlink.c:927 rtnl_fill_ifinfo+0xf8e/0x1380 net/core/rtnetlink.c:2168 rtmsg_ifinfo_build_skb+0x11c/0x1b0 net/core/rtnetlink.c:4453 rtmsg_ifinfo_event net/core/rtnetlink.c:4486 [inline] rtmsg_ifinfo+0x6d/0x110 net/core/rtnetlink.c:4495 __dev_notify_flags+0x76/0x390 net/core/dev.c:9790 netif_change_flags+0xac/0xd0 net/core/dev.c:9823 do_setlink+0x905/0x2950 net/core/rtnetlink.c:3180 rtnl_group_changelink net/core/rtnetlink.c:3813 [inline] __rtnl_newlink net/core/rtnetlink.c:3981 [inline] rtnl_newlink+0xf55/0x1400 net/core/rtnetlink.c:4109 rtnetlink_rcv_msg+0x64b/0x720 net/core/rtnetlink.c:6995 netlink_rcv_skb+0x123/0x220 net/netlink/af_netlink.c:2550 rtnetlink_rcv+0x1c/0x30 net/core/rtnetlink.c:7022 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x5a8/0x680 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x5c8/0x6f0 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:787 [inline] __sock_sendmsg net/socket.c:802 [inline] ____sys_sendmsg+0x563/0x5b0 net/socket.c:2698 ___sys_sendmsg+0x195/0x1e0 net/socket.c:2752 __sys_sendmsg net/socket.c:2784 [inline] __do_sys_sendmsg net/socket.c:2789 [inline] __se_sys_sendmsg net/socket.c:2787 [inline] __x64_sys_sendmsg+0xd4/0x160 net/socket.c:2787 x64_sys_call+0x194c/0x3020 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x12c/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x0000000000000000 -> 0xffff88813cf5c400 Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 22063 Comm: syz.0.31122 Tainted: G W syzkaller #0 PREEMPT(full) Tainted: [W]=WARN Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Fixes: 47e91f56008b ("bonding: use RCU protection for 3ad xmit path") Reported-by: syzbot+9bb2ff2a4ab9e17307e1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f0a82f.050a0220.3aadc4.0000.GAE@google.com/ Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Andrew Lunn Link: https://patch.msgid.link/20260428123207.3809211-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_3ad.c | 109 ++++++++++++++++++--------------- drivers/net/bonding/bond_main.c | 8 ++- drivers/net/bonding/bond_netlink.c | 16 +++-- drivers/net/bonding/bond_procfs.c | 3 +- drivers/net/bonding/bond_sysfs_slave.c | 17 +++-- include/net/bond_3ad.h | 2 +- 6 files changed, 89 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index af7f74cfdc08..f0aa7d2f2171 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1029,6 +1029,7 @@ static void ad_cond_set_peer_notif(struct port *port) static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); + struct aggregator *aggregator; mux_states_t last_state; /* keep current State Machine state to compare later if it was @@ -1036,6 +1037,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ last_state = port->sm_mux_state; + aggregator = rcu_dereference(port->aggregator); if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { @@ -1055,7 +1057,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } @@ -1070,7 +1072,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * update ready variable, we check READY_N and update * READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state @@ -1086,7 +1088,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { - if (port->aggregator->is_active) { + if (aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) @@ -1102,9 +1104,9 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; - } else if (port->aggregator->is_active) { + } else if (aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } @@ -1115,7 +1117,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator->is_active && + if (aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1134,7 +1136,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ struct slave *slave = port->slave; - if (port->aggregator->is_active && + if (aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; @@ -1154,8 +1156,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator && - port->aggregator->is_active && + if (aggregator && + aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1187,7 +1189,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: - if (port->aggregator->is_active) + if (aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else @@ -1561,9 +1563,9 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ - if (port->aggregator) { + temp_aggregator = rcu_dereference(port->aggregator); + if (temp_aggregator) { /* detach the port from its former aggregator */ - temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { @@ -1586,7 +1588,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) /* clear the port's relations to this * aggregator */ - port->aggregator = NULL; + RCU_INIT_POINTER(port->aggregator, NULL); port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; @@ -1609,7 +1611,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, - port->aggregator->aggregator_identifier); + temp_aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ @@ -1633,15 +1635,15 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) ) ) { /* attach to the founded aggregator */ - port->aggregator = aggregator; + rcu_assign_pointer(port->aggregator, aggregator); port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; - port->aggregator->num_of_ports++; + aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; @@ -1656,39 +1658,40 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) if (!found) { if (free_aggregator) { /* assign port a new aggregator */ - port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + free_aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ - port->aggregator->is_individual = false; + free_aggregator->is_individual = false; else - port->aggregator->is_individual = true; + free_aggregator->is_individual = true; - port->aggregator->actor_admin_aggregator_key = + free_aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; - port->aggregator->actor_oper_aggregator_key = + free_aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; - port->aggregator->partner_system = + free_aggregator->partner_system = port->partner_oper.system; - port->aggregator->partner_system_priority = + free_aggregator->partner_system_priority = port->partner_oper.system_priority; - port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; - port->aggregator->receive_state = 1; - port->aggregator->transmit_state = 1; - port->aggregator->lag_ports = port; - port->aggregator->num_of_ports++; + free_aggregator->partner_oper_aggregator_key = port->partner_oper.key; + free_aggregator->receive_state = 1; + free_aggregator->transmit_state = 1; + free_aggregator->lag_ports = port; + free_aggregator->num_of_ports++; + + rcu_assign_pointer(port->aggregator, free_aggregator); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + free_aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", @@ -1700,13 +1703,12 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ - __set_agg_ports_ready(port->aggregator, - __agg_ports_are_ready(port->aggregator)); + aggregator = rcu_dereference(port->aggregator); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); - aggregator = __get_first_agg(port); - ad_agg_selection_logic(aggregator, update_slave_arr); + ad_agg_selection_logic(__get_first_agg(port), update_slave_arr); - if (!port->aggregator->is_active) + if (!aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } @@ -2075,13 +2077,15 @@ static void ad_initialize_port(struct port *port, const struct bond_params *bond */ static void ad_enable_collecting(struct port *port) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_collecting_port(port); } } @@ -2093,11 +2097,13 @@ static void ad_enable_collecting(struct port *port) */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2114,11 +2120,13 @@ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; @@ -2135,11 +2143,13 @@ static void ad_enable_collecting_distributing(struct port *port, static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2379,7 +2389,7 @@ void bond_3ad_unbind_slave(struct slave *slave) */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { - temp_port->aggregator = new_aggregator; + rcu_assign_pointer(temp_port->aggregator, new_aggregator); temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } @@ -2848,15 +2858,16 @@ out: int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { - struct aggregator *aggregator = NULL; + struct aggregator *aggregator = NULL, *tmp; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); - if (port->aggregator && port->aggregator->is_active) { - aggregator = port->aggregator; + tmp = rcu_dereference(port->aggregator); + if (tmp && tmp->is_active) { + aggregator = tmp; break; } } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c7baa5c4bf40..af82a3df2c5d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1433,7 +1433,7 @@ static void bond_poll_controller(struct net_device *bond_dev) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = - SLAVE_AD_INFO(slave)->port.aggregator; + rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg && agg->aggregator_identifier != ad_info.aggregator_id) @@ -5179,15 +5179,16 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } + rcu_read_lock(); bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { - struct aggregator *agg; + const struct aggregator *agg; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (!agg || agg->aggregator_identifier != agg_id) continue; } @@ -5199,6 +5200,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) usable_slaves->arr[usable_slaves->count++] = slave; } + rcu_read_unlock(); bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index ea1a80e658ae..c7d3e0602c83 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -66,27 +66,29 @@ static int bond_fill_slave_info(struct sk_buff *skb, const struct port *ad_port; ad_port = &SLAVE_AD_INFO(slave)->port; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg) { if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, agg->aggregator_identifier)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, ad_port->actor_oper_port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE, ad_port->sm_churn_actor_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE, ad_port->sm_churn_partner_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; } + rcu_read_unlock(); if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, SLAVE_AD_INFO(slave)->port_priority)) @@ -95,6 +97,8 @@ static int bond_fill_slave_info(struct sk_buff *skb, return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: return -EMSGSIZE; } diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index e34f80305191..3714aab1a3d9 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -188,6 +188,7 @@ static void bond_info_show_master(struct seq_file *seq) } } +/* Note: runs under rcu_read_lock() */ static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { @@ -214,7 +215,7 @@ static void bond_info_show_slave(struct seq_file *seq, if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; - const struct aggregator *agg = port->aggregator; + const struct aggregator *agg = rcu_dereference(port->aggregator); if (agg) { seq_printf(seq, "Aggregator ID: %d\n", diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index 36d0e8440b5b..fc6fe7181789 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -62,10 +62,15 @@ static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { - agg = SLAVE_AD_INFO(slave)->port.aggregator; - if (agg) - return sysfs_emit(buf, "%d\n", - agg->aggregator_identifier); + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); + if (agg) { + ssize_t res = sysfs_emit(buf, "%d\n", + agg->aggregator_identifier); + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); } return sysfs_emit(buf, "N/A\n"); @@ -78,7 +83,7 @@ static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } @@ -93,7 +98,7 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c92d4a976246..05572c19e14b 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -243,7 +243,7 @@ typedef struct port { churn_state_t sm_churn_actor_state; churn_state_t sm_churn_partner_state; struct slave *slave; /* pointer to the bond slave that this port belongs to */ - struct aggregator *aggregator; /* pointer to an aggregator that this port related to */ + struct aggregator __rcu *aggregator; /* pointer to an aggregator that this port related to */ struct port *next_port_in_aggregator; /* Next port on the linked list of the parent aggregator */ u32 transaction_id; /* continuous number for identification of Marker PDU's; */ struct lacpdu lacpdu; /* the lacpdu that will be sent for this port */ -- cgit v1.2.3 From 620055cb1036a6125fd912e7a14b47a6572b809b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 27 Apr 2026 22:22:21 -0700 Subject: dpll: export __dpll_pin_change_ntf() for use under dpll_lock Export __dpll_pin_change_ntf() so that drivers can send pin change notifications from within pin callbacks, which are already called under dpll_lock. Using dpll_pin_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Acked-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-9-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/dpll/dpll_netlink.c | 10 ++++++++++ drivers/dpll/dpll_netlink.h | 2 -- include/linux/dpll.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index af7ce62ec55c..0ff1658c2dc1 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -900,11 +900,21 @@ int dpll_pin_delete_ntf(struct dpll_pin *pin) return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin); } +/** + * __dpll_pin_change_ntf - notify that the pin has been changed + * @pin: registered pin pointer + * + * Context: caller must hold dpll_lock. Suitable for use inside pin + * callbacks which are already invoked under dpll_lock. + * Return: 0 if succeeds, error code otherwise. + */ int __dpll_pin_change_ntf(struct dpll_pin *pin) { + lockdep_assert_held(&dpll_lock); dpll_pin_notify(pin, DPLL_PIN_CHANGED); return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin); } +EXPORT_SYMBOL_GPL(__dpll_pin_change_ntf); /** * dpll_pin_change_ntf - notify that the pin has been changed diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h index dd28b56d27c5..a9cfd55f57fc 100644 --- a/drivers/dpll/dpll_netlink.h +++ b/drivers/dpll/dpll_netlink.h @@ -11,5 +11,3 @@ int dpll_device_delete_ntf(struct dpll_device *dpll); int dpll_pin_create_ntf(struct dpll_pin *pin); int dpll_pin_delete_ntf(struct dpll_pin *pin); - -int __dpll_pin_change_ntf(struct dpll_pin *pin); diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b7277a8b484d..f8037f1ab20b 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -286,6 +286,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, int dpll_device_change_ntf(struct dpll_device *dpll); +int __dpll_pin_change_ntf(struct dpll_pin *pin); int dpll_pin_change_ntf(struct dpll_pin *pin); int register_dpll_notifier(struct notifier_block *nb); -- cgit v1.2.3