diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-05 14:41:36 +0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-10-06 07:39:38 +0400 |
commit | ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 (patch) | |
tree | 395e50547ffccc6b73e04a44190eb4b4f2d2316b /net/ipv4/route.c | |
parent | c2952c314b4fe61820ba8fd6c949eed636140d52 (diff) | |
download | linux-ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9.tar.xz |
fib: RCU conversion of fib_lookup()
fib_lookup() converted to be called in RCU protected context, no
reference taken and released on a contended cache line (fib_clntref)
fib_table_lookup() and fib_semantic_match() get an additional parameter.
struct fib_info gets an rcu_head field, and is freed after an rcu grace
period.
Stress test :
(Sending 160.000.000 UDP frames on same neighbour,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_HASH) (about same results for FIB_TRIE)
Before patch :
real 1m31.199s
user 0m13.761s
sys 23m24.780s
After patch:
real 1m5.375s
user 0m14.997s
sys 15m50.115s
Before patch Profile :
13044.00 15.4% __ip_route_output_key vmlinux
8438.00 10.0% dst_destroy vmlinux
5983.00 7.1% fib_semantic_match vmlinux
5410.00 6.4% fib_rules_lookup vmlinux
4803.00 5.7% neigh_lookup vmlinux
4420.00 5.2% _raw_spin_lock vmlinux
3883.00 4.6% rt_set_nexthop vmlinux
3261.00 3.9% _raw_read_lock vmlinux
2794.00 3.3% fib_table_lookup vmlinux
2374.00 2.8% neigh_resolve_output vmlinux
2153.00 2.5% dst_alloc vmlinux
1502.00 1.8% _raw_read_lock_bh vmlinux
1484.00 1.8% kmem_cache_alloc vmlinux
1407.00 1.7% eth_header vmlinux
1406.00 1.7% ipv4_dst_destroy vmlinux
1298.00 1.5% __copy_from_user_ll vmlinux
1174.00 1.4% dev_queue_xmit vmlinux
1000.00 1.2% ip_output vmlinux
After patch Profile :
13712.00 15.8% dst_destroy vmlinux
8548.00 9.9% __ip_route_output_key vmlinux
7017.00 8.1% neigh_lookup vmlinux
4554.00 5.3% fib_semantic_match vmlinux
4067.00 4.7% _raw_read_lock vmlinux
3491.00 4.0% dst_alloc vmlinux
3186.00 3.7% neigh_resolve_output vmlinux
3103.00 3.6% fib_table_lookup vmlinux
2098.00 2.4% _raw_read_lock_bh vmlinux
2081.00 2.4% kmem_cache_alloc vmlinux
2013.00 2.3% _raw_spin_lock vmlinux
1763.00 2.0% __copy_from_user_ll vmlinux
1763.00 2.0% ip_output vmlinux
1761.00 2.0% ipv4_dst_destroy vmlinux
1631.00 1.9% eth_header vmlinux
1440.00 1.7% _raw_read_unlock_bh vmlinux
Reference results, if IP route cache is enabled :
real 0m29.718s
user 0m10.845s
sys 7m37.341s
25213.00 29.5% __ip_route_output_key vmlinux
9011.00 10.5% dst_release vmlinux
4817.00 5.6% ip_push_pending_frames vmlinux
4232.00 5.0% ip_finish_output vmlinux
3940.00 4.6% udp_sendmsg vmlinux
3730.00 4.4% __copy_from_user_ll vmlinux
3716.00 4.4% ip_route_output_flow vmlinux
2451.00 2.9% __xfrm_lookup vmlinux
2221.00 2.6% ip_append_data vmlinux
1718.00 2.0% _raw_spin_lock_bh vmlinux
1655.00 1.9% __alloc_skb vmlinux
1572.00 1.8% sock_wfree vmlinux
1345.00 1.6% kfree vmlinux
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 59 |
1 files changed, 24 insertions, 35 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 04e0df82b88c..7864d0c48968 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->dst.dev, rt->rt_gateway, + else { + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) + src = FIB_RES_PREFSRC(res); + else + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } @@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned hash; __be32 spec_dst; int err = -EINVAL; - int free_res = 0; struct net * net = dev_net(dev); /* IP on this device is disabled. */ @@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ - if ((err = fib_lookup(net, &fl, &res)) != 0) { + err = fib_lookup(net, &fl, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; } - free_res = 1; RT_CACHE_STAT_INC(in_slow_tot); @@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); + net->loopback_dev->ifindex, + dev, &spec_dst, &itag, skb->mark); if (err < 0) goto martian_source_keep_err; if (err) @@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -done: - if (free_res) - fib_res_put(&res); out: return err; brd_input: @@ -2226,7 +2226,7 @@ local_input: rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); - goto done; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); @@ -2249,21 +2249,21 @@ martian_destination: e_hostunreach: err = -EHOSTUNREACH; - goto done; + goto out; e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: err = -EINVAL; martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto done; + goto out; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2349,6 +2349,7 @@ skip_cache: } EXPORT_SYMBOL(ip_route_input_common); +/* called with rcu_read_lock() */ static int __mkroute_output(struct rtable **result, struct fib_result *res, const struct flowi *fl, @@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev_out); - if (!in_dev) { - rcu_read_unlock(); + if (!in_dev) return -EINVAL; - } + if (res->type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } + res->fi = NULL; } else if (res->type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, @@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result, * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); + if (res->fi && res->prefixlen < 4) res->fi = NULL; - } } @@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result, return 0; } +/* called with rcu_read_lock() */ static int ip_mkroute_output(struct rtable **rp, struct fib_result *res, const struct flowi *fl, @@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; - int free_res = 0; int err; @@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, err = -ENETUNREACH; goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; dev_out = net->loopback_dev; fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; @@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, make_route: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); - if (free_res) - fib_res_put(&res); out: return err; } |