From bc9d3a9f2afca189a6ae40225b6985e3c775375e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:32 +0100 Subject: net: dst: Switch to rcuref_t reference counting Under high contention dst_entry::__refcnt becomes a significant bottleneck. atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into high retry rates on contention. Switch the reference count to rcuref_t which results in a significant performance gain. Rename the reference count member to __rcuref to reflect the change. The gain depends on the micro-architecture and the number of concurrent operations and has been measured in the range of +25% to +130% with a localhost memtier/memcached benchmark which amplifies the problem massively. Running the memtier/memcached benchmark over a real (1Gb) network connection the conversion on top of the false sharing fix for struct dst_entry::__refcnt results in a total gain in the 2%-5% range over the upstream baseline. Reported-by: Wangyang Guo Reported-by: Arjan Van De Ven Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de Signed-off-by: Jakub Kicinski --- net/core/dst.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'net/core/dst.c') diff --git a/net/core/dst.c b/net/core/dst.c index 31c08a3386d3..3247e84045ca 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->tclassid = 0; #endif dst->lwtstate = NULL; - atomic_set(&dst->__refcnt, initial_ref); + rcuref_init(&dst->__rcuref, initial_ref); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; @@ -162,31 +162,15 @@ EXPORT_SYMBOL(dst_dev_put); void dst_release(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); - } + if (dst && rcuref_put(&dst->__rcuref)) + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst) { - int newrefcnt; - - newrefcnt = atomic_dec_return(&dst->__refcnt); - if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", - __func__, dst, newrefcnt); - if (!newrefcnt) - dst_destroy(dst); - } + if (dst && rcuref_put(&dst->__rcuref)) + dst_destroy(dst); } EXPORT_SYMBOL(dst_release_immediate); -- cgit v1.2.3 From 418a73074da9182f571e467eaded03ea501f3281 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Thu, 20 Apr 2023 20:25:08 +0200 Subject: net: dst: fix missing initialization of rt_uncached xfrm_alloc_dst() followed by xfrm4_dst_destroy(), without a xfrm4_fill_dst() call in between, causes the following BUG: BUG: spinlock bad magic on CPU#0, fbxhostapd/732 lock: 0x890b7668, .magic: 890b7668, .owner: /-1, .owner_cpu: 0 CPU: 0 PID: 732 Comm: fbxhostapd Not tainted 6.3.0-rc6-next-20230414-00613-ge8de66369925-dirty #9 Hardware name: Marvell Kirkwood (Flattened Device Tree) unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x28/0x30 dump_stack_lvl from do_raw_spin_lock+0x20/0x80 do_raw_spin_lock from rt_del_uncached_list+0x30/0x64 rt_del_uncached_list from xfrm4_dst_destroy+0x3c/0xbc xfrm4_dst_destroy from dst_destroy+0x5c/0xb0 dst_destroy from rcu_process_callbacks+0xc4/0xec rcu_process_callbacks from __do_softirq+0xb4/0x22c __do_softirq from call_with_stack+0x1c/0x24 call_with_stack from do_softirq+0x60/0x6c do_softirq from __local_bh_enable_ip+0xa0/0xcc Patch "net: dst: Prevent false sharing vs. dst_entry:: __refcnt" moved rt_uncached and rt_uncached_list fields from rtable struct to dst struct, so they are more zeroed by memset_after(xdst, 0, u.dst) in xfrm_alloc_dst(). Note that rt_uncached (list_head) was never properly initialized at alloc time, but xfrm[46]_dst_destroy() is written in such a way that it was not an issue thanks to the memset: if (xdst->u.rt.dst.rt_uncached_list) rt_del_uncached_list(&xdst->u.rt); The route code does it the other way around: rt_uncached_list is assumed to be valid IIF rt_uncached list_head is not empty: void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } This patch adds mandatory rt_uncached list_head initialization in generic dst_init(), and adapt xfrm[46]_dst_destroy logic to match the rest of the code. Fixes: d288a162dd1c ("net: dst: Prevent false sharing vs. dst_entry:: __refcnt") Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202304162125.18b7bcdd-oliver.sang@intel.com Reviewed-by: David Ahern Reviewed-by: Eric Dumazet CC: Leon Romanovsky Signed-off-by: Maxime Bizon Link: https://lore.kernel.org/r/20230420182508.2417582-1-mbizon@freebox.fr Signed-off-by: Jakub Kicinski --- net/core/dst.c | 1 + net/ipv4/route.c | 4 ---- net/ipv4/xfrm4_policy.c | 4 +--- net/ipv6/route.c | 1 - net/ipv6/xfrm6_policy.c | 4 +--- 5 files changed, 3 insertions(+), 11 deletions(-) (limited to 'net/core/dst.c') diff --git a/net/core/dst.c b/net/core/dst.c index 3247e84045ca..79d9306ad1ee 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -67,6 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, #endif dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, initial_ref); + INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2a3d14d95ada..98d7e6ba7493 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1644,7 +1644,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; - INIT_LIST_HEAD(&rt->dst.rt_uncached); rt->dst.output = ip_output; if (flags & RTCF_LOCAL) @@ -1675,7 +1674,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; - INIT_LIST_HEAD(&new_rt->dst.rt_uncached); new_rt->dst.input = rt->dst.input; new_rt->dst.output = rt->dst.output; @@ -2858,8 +2856,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; - - INIT_LIST_HEAD(&rt->dst.rt_uncached); } dst_release(dst_orig); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 47861c8b7340..9403bbaf1b61 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; - INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached); rt_add_uncached_list(&xdst->u.rt); return 0; @@ -121,8 +120,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); - if (xdst->u.rt.dst.rt_uncached_list) - rt_del_uncached_list(&xdst->u.rt); + rt_del_uncached_list(&xdst->u.rt); xfrm_dst_destroy(xdst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 35085fc0cf15..e3aec46bd466 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -334,7 +334,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = { static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); - INIT_LIST_HEAD(&rt->dst.rt_uncached); } /* allocate dst with ip6_dst_ops */ diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 2b493f8d0091..eecc5e59da17 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -89,7 +89,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; - INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached); rt6_uncached_list_add(&xdst->u.rt6); return 0; @@ -121,8 +120,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); - if (xdst->u.rt6.dst.rt_uncached_list) - rt6_uncached_list_del(&xdst->u.rt6); + rt6_uncached_list_del(&xdst->u.rt6); xfrm_dst_destroy(xdst); } -- cgit v1.2.3