From 0e09edcce7ad9c8120eb8462334e1c9e8f3be09a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 20 Jun 2019 17:36:37 -0700 Subject: ipv6: introduce RT6_LOOKUP_F_DST_NOREF flag in ip6_pol_route() This new flag is to instruct the route lookup function to not take refcnt on the dst entry. The user which does route lookup with this flag must properly use rcu protection. ip6_pol_route() is the major route lookup function for both tx and rx path. In this function: Do not take refcnt on dst if RT6_LOOKUP_F_DST_NOREF flag is set, and directly return the route entry. The caller should be holding rcu lock when using this flag, and decide whether to take refcnt or not. One note on the dst cache in the uncached_list: As uncached_list does not consume refcnt, one refcnt is always returned back to the caller even if RT6_LOOKUP_F_DST_NOREF flag is set. Uncached dst is only possible in the output path. So in such call path, caller MUST check if the dst is in the uncached_list before assuming that there is no refcnt taken on the returned dst. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7375a165fd98..82bced2fc1e3 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -36,6 +36,7 @@ struct route_info { #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 +#define RT6_LOOKUP_F_DST_NOREF 0x00000080 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header -- cgit v1.2.3 From d64a1f574a2957b4bcb06452d36cc1c6bf16e9fc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 20 Jun 2019 17:36:39 -0700 Subject: ipv6: honor RT6_LOOKUP_F_DST_NOREF in rule lookup logic This patch specifically converts the rule lookup logic to honor this flag and not release refcnt when traversing each rule and calling lookup() on each routing table. Similar to previous patch, we also need some special handling of dst entries in uncached list because there is always 1 refcnt taken for them even if RT6_LOOKUP_F_DST_NOREF flag is set. Signed-off-by: Wei Wang Signed-off-by: David S. Miller --- include/net/ip6_route.h | 10 ++++++++++ net/ipv6/fib6_rules.c | 12 +++++++----- net/ipv6/ip6_fib.c | 5 +++-- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 82bced2fc1e3..0709835c01ad 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -94,6 +94,16 @@ static inline struct dst_entry *ip6_route_output(struct net *net, return ip6_route_output_flags(net, sk, fl6, 0); } +/* Only conditionally release dst if flags indicates + * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. + */ +static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) +{ + if (!(flags & RT6_LOOKUP_F_DST_NOREF) || + !list_empty(&rt->rt6i_uncached)) + ip6_rt_put(rt); +} + struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1d16a01eccf5..5b1c9b5b9247 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -316,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; -- cgit v1.2.3 From 7d9e5f422150ed00de744e02a80734d74cc9704d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 20 Jun 2019 17:36:41 -0700 Subject: ipv6: convert major tx path to use RT6_LOOKUP_F_DST_NOREF For tx path, in most cases, we still have to take refcnt on the dst cause the caller is caching the dst somewhere. But it still is beneficial to make use of RT6_LOOKUP_F_DST_NOREF flag while doing the route lookup. It is cause this flag prevents manipulating refcnt on net->ipv6.ip6_null_entry when doing fib6_rule_lookup() to traverse each routing table. The null_entry is a shared object and constant updates on it cause false sharing. We converted the current major lookup function ip6_route_output_flags() to make use of RT6_LOOKUP_F_DST_NOREF. Together with the change in the rx path, we see noticable performance boost: I ran synflood tests between 2 hosts under the same switch. Both hosts have 20G mlx NIC, and 8 tx/rx queues. Sender sends pure SYN flood with random src IPs and ports using trafgen. Receiver has a simple TCP listener on the target port. Both hosts have multiple custom rules: - For incoming packets, only local table is traversed. - For outgoing packets, 3 tables are traversed to find the route. The packet processing rate on the receiver is as follows: - Before the fix: 3.78Mpps - After the fix: 5.50Mpps Signed-off-by: Wei Wang Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 +++-- include/net/ip6_route.h | 4 ++++ net/ipv6/route.c | 29 +++++++++++++++++++++++++++-- net/l3mdev/l3mdev.c | 7 +++---- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 11b9525dff27..69ef9cce5858 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1072,12 +1072,14 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct + * Note: Caller to this function must hold rcu_read_lock() and no refcnt + * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); - int flags = RT6_LOOKUP_F_IFACE; + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; @@ -1087,7 +1089,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; - dst_hold(dst); return dst; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 0709835c01ad..89ad7917b98d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,6 +84,10 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags); + struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 66fc69ef5909..3975ae8e2440 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2415,8 +2415,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2424,6 +2425,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2431,6 +2433,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2443,6 +2446,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index cfc9fcb97465..f35899d45a9a 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup + * This function does not hold refcnt on the returned dst. + * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, @@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct dst_entry *dst = NULL; struct net_device *dev; + WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); @@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); - - rcu_read_unlock(); } return dst; -- cgit v1.2.3