author:    Jakub Kicinski <kuba@kernel.org>  2023-02-21 03:46:11 +0300
committer: Jakub Kicinski <kuba@kernel.org>  2023-02-21 03:46:12 +0300
commit:    981f40458e7a6ffbdff1a09ece6099b3b49d08a5 (patch)
tree:      35d3a3a7992301b0ef4bbe9baaf1f9e1f203db60 /net/sched/cls_api.c
parent:    871489dd01b67483248edc8873c389a66e469f30 (diff)
parent:    6702782845a5bf381a19b204c369e63420041665 (diff)
Merge branch 'net-sched-cls_api-support-hardware-miss-to-tc-action'
Paul Blakey says:
====================
net/sched: cls_api: Support hardware miss to tc action
This series adds support for hardware miss, to instruct tc to continue
execution at a specific tc action instance on a filter's action list. The
mlx5 driver patch (besides the refactors) shows how this is used instead of
plain chain restore.
Currently a filter's action list must be executed either in full or not at
all, as drivers are only able to tell tc to continue executing from a
specific tc chain, not from a specific filter/action.
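For reference, chain restore today looks roughly like this on the driver
side. This is a minimal sketch, not the mlx5 code; the helper name and the
chain_index argument (recovered from hardware metadata) are assumptions:

  static int example_report_chain_miss(struct sk_buff *skb, u32 chain_index)
  {
          struct tc_skb_ext *ext;

          ext = tc_skb_ext_alloc(skb);    /* adds and zeroes the TC skb ext */
          if (!ext)
                  return -ENOMEM;
          ext->chain = chain_index;       /* tcf_classify() restarts at this
                                           * chain, re-running whole filters
                                           */
          return 0;
  }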
This is troublesome for action CT, where new connections should be sent to
software (via tc chain restore) while established connections can be handled
in hardware.
Checking for new connections is done when executing the ct action in hardware
(by checking the packet's tuple against the known established tuples).
But if there is a packet modification (pedit) action before action CT and the
checked tuple is a new connection, hardware will need to revert the previous
packet modifications before sending the packet back to software, so that it
can re-match the same tc filter in software and re-execute its CT action.
The following is an example configuration of stateless NAT on the mlx5
driver that isn't supported before this patchset:
#Setup corresponding mlx5 VFs in namespaces
$ ip netns add ns0
$ ip netns add ns1
$ ip link set dev enp8s0f0v0 netns ns0
$ ip netns exec ns0 ifconfig enp8s0f0v0 1.1.1.1/24 up
$ ip link set dev enp8s0f0v1 netns ns1
$ ip netns exec ns1 ifconfig enp8s0f0v1 1.1.1.2/24 up
#Setup tc arp and ct rules on mlx5 VF representors
$ tc qdisc add dev enp8s0f0_0 ingress
$ tc qdisc add dev enp8s0f0_1 ingress
$ ifconfig enp8s0f0_0 up
$ ifconfig enp8s0f0_1 up
#Original side
$ tc filter add dev enp8s0f0_0 ingress chain 0 proto ip flower \
ct_state -trk ip_proto tcp dst_port 8888 \
action pedit ex munge tcp dport set 5001 pipe \
action csum ip tcp pipe \
action ct pipe \
action goto chain 1
$ tc filter add dev enp8s0f0_0 ingress chain 1 proto ip flower \
ct_state +trk+est \
action mirred egress redirect dev enp8s0f0_1
$ tc filter add dev enp8s0f0_0 ingress chain 1 proto ip flower \
ct_state +trk+new \
action ct commit pipe \
action mirred egress redirect dev enp8s0f0_1
$ tc filter add dev enp8s0f0_0 ingress chain 0 proto arp flower \
action mirred egress redirect dev enp8s0f0_1
#Reply side
$ tc filter add dev enp8s0f0_1 ingress chain 0 proto arp flower \
action mirred egress redirect dev enp8s0f0_0
$ tc filter add dev enp8s0f0_1 ingress chain 0 proto ip flower \
ct_state -trk ip_proto tcp \
action ct pipe \
action pedit ex munge tcp sport set 8888 pipe \
action csum ip tcp pipe \
action mirred egress redirect dev enp8s0f0_0
#Run traffic
$ ip netns exec ns1 iperf -s -p 5001&
$ sleep 2 #wait for iperf to fully open
$ ip netns exec ns0 iperf -c 1.1.1.2 -p 8888
#dump tc filter stats on enp8s0f0_0 chain 0 rule and see hardware packets:
$ tc -s filter show dev enp8s0f0_0 ingress chain 0 proto ip | grep "hardware.*pkt"
Sent hardware 9310116832 bytes 6149672 pkt
Sent hardware 9310116832 bytes 6149672 pkt
Sent hardware 9310116832 bytes 6149672 pkt
A new connection executing the first filter in hardware will first have its
dst port rewritten to the new port, and then the ct action is executed;
because this is a new connection, hardware will need to send the packet back
to software, on chain 0, to execute the first filter again in software.
The dst port needs to be reverted first, otherwise the packet won't re-match
the old dst port in the first filter. Because of that, the mlx5 driver
currently rejects offloading the above action ct rule.
This series adds support for hardware partially executing a filter's action
list and letting tc software continue processing at the specific action
instance where hardware left off (in the above case, after the "action pedit
ex munge tcp dport..." of the first rule), allowing support for scenarios
such as the above.
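With this series, a driver can instead hand back the miss cookie it was given
at offload time. A minimal sketch, not the mlx5 implementation; the helper
name and saved_miss_cookie (the value the driver stashed from
flow_action_entry->miss_cookie when the rule was offloaded) are assumptions:

  static int example_report_action_miss(struct sk_buff *skb,
                                        u64 saved_miss_cookie)
  {
          struct tc_skb_ext *ext;

          ext = tc_skb_ext_alloc(skb);
          if (!ext)
                  return -ENOMEM;
          ext->act_miss_cookie = saved_miss_cookie; /* filter + action index */
          ext->act_miss = 1;      /* resume at the exact action instance,
                                   * not just at a chain
                                   */
          return 0;
  }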
====================
Link: https://lore.kernel.org/r/20230217223620.28508-1-paulb@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/sched/cls_api.c')
 net/sched/cls_api.c | 243 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 219 insertions(+), 24 deletions(-)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index bfabc9c95fa9..3569e2c3660c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,6 +22,7 @@
 #include <linux/idr.h>
 #include <linux/jhash.h>
 #include <linux/rculist.h>
+#include <linux/rhashtable.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -50,6 +51,109 @@ static LIST_HEAD(tcf_proto_base);
 /* Protects list of registered TC modules. It is pure SMP lock. */
 static DEFINE_RWLOCK(cls_mod_lock);
 
+static struct xarray tcf_exts_miss_cookies_xa;
+struct tcf_exts_miss_cookie_node {
+	const struct tcf_chain *chain;
+	const struct tcf_proto *tp;
+	const struct tcf_exts *exts;
+	u32 chain_index;
+	u32 tp_prio;
+	u32 handle;
+	u32 miss_cookie_base;
+	struct rcu_head rcu;
+};
+
+/* Each tc action entry cookie will be comprised of 32bit miss_cookie_base +
+ * action index in the exts tc actions array.
+ */
+union tcf_exts_miss_cookie {
+	struct {
+		u32 miss_cookie_base;
+		u32 act_index;
+	};
+	u64 miss_cookie;
+};
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+				u32 handle)
+{
+	struct tcf_exts_miss_cookie_node *n;
+	static u32 next;
+	int err;
+
+	if (WARN_ON(!handle || !tp->ops->get_exts))
+		return -EINVAL;
+
+	n = kzalloc(sizeof(*n), GFP_KERNEL);
+	if (!n)
+		return -ENOMEM;
+
+	n->chain_index = tp->chain->index;
+	n->chain = tp->chain;
+	n->tp_prio = tp->prio;
+	n->tp = tp;
+	n->exts = exts;
+	n->handle = handle;
+
+	err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base,
+			      n, xa_limit_32b, &next, GFP_KERNEL);
+	if (err)
+		goto err_xa_alloc;
+
+	exts->miss_cookie_node = n;
+	return 0;
+
+err_xa_alloc:
+	kfree(n);
+	return err;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+	struct tcf_exts_miss_cookie_node *n;
+
+	if (!exts->miss_cookie_node)
+		return;
+
+	n = exts->miss_cookie_node;
+	xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base);
+	kfree_rcu(n, rcu);
+}
+
+static struct tcf_exts_miss_cookie_node *
+tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index)
+{
+	union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, };
+
+	*act_index = mc.act_index;
+	return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base);
+}
+#else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+static int
+tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
+				u32 handle)
+{
+	return 0;
+}
+
+static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
+{
+}
+#endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
+
+static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index)
+{
+	union tcf_exts_miss_cookie mc = { .act_index = act_index, };
+
+	if (!miss_cookie_base)
+		return 0;
+
+	mc.miss_cookie_base = miss_cookie_base;
+	return mc.miss_cookie;
+}
+
 #ifdef CONFIG_NET_CLS_ACT
 DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
 EXPORT_SYMBOL(tc_skb_ext_tc);
@@ -1549,6 +1653,8 @@ static inline int __tcf_classify(struct sk_buff *skb,
 				 const struct tcf_proto *orig_tp,
 				 struct tcf_result *res,
 				 bool compat_mode,
+				 struct tcf_exts_miss_cookie_node *n,
+				 int act_index,
 				 u32 *last_executed_chain)
 {
 #ifdef CONFIG_NET_CLS_ACT
@@ -1560,13 +1666,36 @@ reclassify:
 #endif
 	for (; tp; tp = rcu_dereference_bh(tp->next)) {
 		__be16 protocol = skb_protocol(skb, false);
-		int err;
+		int err = 0;
 
-		if (tp->protocol != protocol &&
-		    tp->protocol != htons(ETH_P_ALL))
-			continue;
+		if (n) {
+			struct tcf_exts *exts;
+
+			if (n->tp_prio != tp->prio)
+				continue;
+
+			/* We re-lookup the tp and chain based on index instead
+			 * of having hard refs and locks to them, so do a sanity
+			 * check if any of tp,chain,exts was replaced by the
+			 * time we got here with a cookie from hardware.
+			 */
+			if (unlikely(n->tp != tp || n->tp->chain != n->chain ||
+				     !tp->ops->get_exts))
+				return TC_ACT_SHOT;
+
+			exts = tp->ops->get_exts(tp, n->handle);
+			if (unlikely(!exts || n->exts != exts))
+				return TC_ACT_SHOT;
 
-		err = tc_classify(skb, tp, res);
+			n = NULL;
+			err = tcf_exts_exec_ex(skb, exts, act_index, res);
+		} else {
+			if (tp->protocol != protocol &&
+			    tp->protocol != htons(ETH_P_ALL))
+				continue;
+
+			err = tc_classify(skb, tp, res);
+		}
 #ifdef CONFIG_NET_CLS_ACT
 		if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
 			first_tp = orig_tp;
@@ -1582,6 +1711,9 @@ reclassify:
 		return err;
 	}
 
+	if (unlikely(n))
+		return TC_ACT_SHOT;
+
 	return TC_ACT_UNSPEC; /* signal: continue lookup */
 #ifdef CONFIG_NET_CLS_ACT
 reset:
@@ -1606,21 +1738,35 @@ int tcf_classify(struct sk_buff *skb,
 #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
 	u32 last_executed_chain = 0;
 
-	return __tcf_classify(skb, tp, tp, res, compat_mode,
+	return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0,
 			      &last_executed_chain);
 #else
 	u32 last_executed_chain = tp ? tp->chain->index : 0;
+	struct tcf_exts_miss_cookie_node *n = NULL;
 	const struct tcf_proto *orig_tp = tp;
 	struct tc_skb_ext *ext;
+	int act_index = 0;
 	int ret;
 
 	if (block) {
 		ext = skb_ext_find(skb, TC_SKB_EXT);
 
-		if (ext && ext->chain) {
+		if (ext && (ext->chain || ext->act_miss)) {
 			struct tcf_chain *fchain;
+			u32 chain;
+
+			if (ext->act_miss) {
+				n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
+								&act_index);
+				if (!n)
+					return TC_ACT_SHOT;
 
-			fchain = tcf_chain_lookup_rcu(block, ext->chain);
+				chain = n->chain_index;
+			} else {
+				chain = ext->chain;
+			}
+
+			fchain = tcf_chain_lookup_rcu(block, chain);
 			if (!fchain)
 				return TC_ACT_SHOT;
@@ -1632,7 +1778,7 @@ int tcf_classify(struct sk_buff *skb,
 		}
 	}
 
-	ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
+	ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index,
 			     &last_executed_chain);
 
 	if (tc_skb_ext_tc_enabled()) {
@@ -3056,9 +3202,48 @@ out:
 	return skb->len;
 }
 
+int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
+		     int police, struct tcf_proto *tp, u32 handle,
+		     bool use_action_miss)
+{
+	int err = 0;
+
+#ifdef CONFIG_NET_CLS_ACT
+	exts->type = 0;
+	exts->nr_actions = 0;
+	/* Note: we do not own yet a reference on net.
+	 * This reference might be taken later from tcf_exts_get_net().
+	 */
+	exts->net = net;
+	exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
+				GFP_KERNEL);
+	if (!exts->actions)
+		return -ENOMEM;
+#endif
+
+	exts->action = action;
+	exts->police = police;
+
+	if (!use_action_miss)
+		return 0;
+
+	err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle);
+	if (err)
+		goto err_miss_alloc;
+
+	return 0;
+
+err_miss_alloc:
+	tcf_exts_destroy(exts);
+	return err;
+}
+EXPORT_SYMBOL(tcf_exts_init_ex);
+
 void tcf_exts_destroy(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
+	tcf_exts_miss_cookie_base_destroy(exts);
+
 	if (exts->actions) {
 		tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
 		kfree(exts->actions);
@@ -3490,28 +3675,28 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
 }
 EXPORT_SYMBOL(tc_setup_cb_reoffload);
 
-static int tcf_act_get_cookie(struct flow_action_entry *entry,
-			      const struct tc_action *act)
+static int tcf_act_get_user_cookie(struct flow_action_entry *entry,
+				   const struct tc_action *act)
 {
-	struct tc_cookie *cookie;
+	struct tc_cookie *user_cookie;
 	int err = 0;
 
 	rcu_read_lock();
-	cookie = rcu_dereference(act->act_cookie);
-	if (cookie) {
-		entry->cookie = flow_action_cookie_create(cookie->data,
-							  cookie->len,
-							  GFP_ATOMIC);
-		if (!entry->cookie)
+	user_cookie = rcu_dereference(act->user_cookie);
+	if (user_cookie) {
+		entry->user_cookie = flow_action_cookie_create(user_cookie->data,
+							       user_cookie->len,
+							       GFP_ATOMIC);
+		if (!entry->user_cookie)
 			err = -ENOMEM;
 	}
 	rcu_read_unlock();
 	return err;
 }
 
-static void tcf_act_put_cookie(struct flow_action_entry *entry)
+static void tcf_act_put_user_cookie(struct flow_action_entry *entry)
 {
-	flow_action_cookie_destroy(entry->cookie);
+	flow_action_cookie_destroy(entry->user_cookie);
 }
 
 void tc_cleanup_offload_action(struct flow_action *flow_action)
@@ -3520,7 +3705,7 @@ void tc_cleanup_offload_action(struct flow_action *flow_action)
 	int i;
 
 	flow_action_for_each(i, entry, flow_action) {
-		tcf_act_put_cookie(entry);
+		tcf_act_put_user_cookie(entry);
 		if (entry->destructor)
 			entry->destructor(entry->destructor_priv);
 	}
@@ -3547,6 +3732,7 @@ static int tc_setup_offload_act(struct tc_action *act,
 
 int tc_setup_action(struct flow_action *flow_action,
 		    struct tc_action *actions[],
+		    u32 miss_cookie_base,
 		    struct netlink_ext_ack *extack)
 {
 	int i, j, k, index, err = 0;
@@ -3565,7 +3751,7 @@ int tc_setup_action(struct flow_action *flow_action,
 
 		entry = &flow_action->entries[j];
 		spin_lock_bh(&act->tcfa_lock);
-		err = tcf_act_get_cookie(entry, act);
+		err = tcf_act_get_user_cookie(entry, act);
 		if (err)
 			goto err_out_locked;
@@ -3577,7 +3763,9 @@
 		for (k = 0; k < index ; k++) {
 			entry[k].hw_stats = tc_act_hw_stats(act->hw_stats);
 			entry[k].hw_index = act->tcfa_index;
-			entry[k].act_cookie = (unsigned long)act;
+			entry[k].cookie = (unsigned long)act;
+			entry[k].miss_cookie =
+				tcf_exts_miss_cookie_get(miss_cookie_base, i);
 		}
 
 		j += index;
@@ -3600,10 +3788,15 @@ int tc_setup_offload_action(struct flow_action *flow_action,
 			   struct netlink_ext_ack *extack)
 {
 #ifdef CONFIG_NET_CLS_ACT
+	u32 miss_cookie_base;
+
 	if (!exts)
 		return 0;
 
-	return tc_setup_action(flow_action, exts->actions, extack);
+	miss_cookie_base = exts->miss_cookie_node ?
+			   exts->miss_cookie_node->miss_cookie_base : 0;
+	return tc_setup_action(flow_action, exts->actions, miss_cookie_base,
+			       extack);
 #else
 	return 0;
 #endif
@@ -3771,6 +3964,8 @@ static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
+	xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1);
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
 		      RTNL_FLAG_DOIT_UNLOCKED);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
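On the classifier side, a filter opts in to per-action miss cookies through
the new tcf_exts_init_ex() (the signature is in the diff above). A minimal
sketch of such a call site; the helper name is hypothetical, and
TCA_FLOWER_ACT mirrors how flower initializes its exts in this series:

  static int example_init_exts(struct tcf_exts *exts, struct net *net,
                               struct tcf_proto *tp, u32 handle)
  {
          /* Allocates a miss_cookie_base for this filter's exts, so each
           * offloaded action gets a resumable 64-bit miss cookie.
           */
          return tcf_exts_init_ex(exts, net, TCA_FLOWER_ACT, 0, tp, handle,
                                  true /* use_action_miss */);
  }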