From d851c12b60471188e15e5c8405b289073e8dd025 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:47:25 +0000 Subject: ipv4: Always invalidate or update the route on pmtu events Some protocols, like IPsec still cache routes. So we need to invalidate the old route on pmtu events to avoid the reuse of stale routes. We also need to update the mtu and expire time of the route if we already use a nh exception route, otherwise we ignore newly learned pmtu values after the first expiration. With this patch we always invalidate or update the route on pmtu events. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ff622069fcef..90ba8358a892 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -904,22 +904,29 @@ out: kfree_skb(skb); return 0; } -static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { + struct dst_entry *dst = &rt->dst; struct fib_result res; if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + if (!rt->rt_pmtu) { + dst->obsolete = DST_OBSOLETE_KILL; + } else { + rt->rt_pmtu = mtu; + dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); + } + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); - return mtu; } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -929,14 +936,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); - mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); - - if (!rt->rt_pmtu) { - dst->obsolete = DST_OBSOLETE_KILL; - } else { - rt->rt_pmtu = mtu; - rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); - } + __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, -- cgit v1.2.3 From 7f92d334ba19a0d8e96f8f8f092219553367d921 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:48:18 +0000 Subject: ipv4: Don't create nh exeption when the device mtu is smaller than the reported pmtu When a local tool like tracepath tries to send packets bigger than the device mtu, we create a nh exeption and set the pmtu to device mtu. The device mtu does not expire, so check if the device mtu is smaller than the reported pmtu and don't crerate a nh exeption in that case. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 90ba8358a892..741df67a81ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -909,6 +909,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) struct dst_entry *dst = &rt->dst; struct fib_result res; + if (dst->dev->mtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; -- cgit v1.2.3 From ee9a8f7ab2edf801b8b514c310455c94acc232f6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 8 Oct 2012 00:56:54 +0000 Subject: ipv4: Don't report stale pmtu values to userspace We report cached pmtu values even if they are already expired. Change this to not report these values after they are expired and fix a race in the expire time calculation, as suggested by Eric Dumazet. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 741df67a81ec..132e0dfee53a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2187,8 +2187,18 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; + + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt_pmtu) + if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; @@ -2198,13 +2208,6 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; error = rt->dst.error; - expires = rt->dst.expires; - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } if (rt_is_input_route(rt)) { if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) -- cgit v1.2.3 From e81da0e113a1b7fc7449ae6213f65f89ccac6d06 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:15 +0000 Subject: ipv4: fix sending of redirects After "Cache input routes in fib_info nexthops" (commit d2d68ba9fe) and "Elide fib_validate_source() completely when possible" (commit 7a9bc9b81a) we can not send ICMP redirects. It seems we should not cache the RTCF_DOREDIRECT flag in nh_rth_input because the same fib_info can be used for traffic that is not redirected, eg. from other input devices or from sources that are not in same subnet. As result, we have to disable the caching of RTCF_DOREDIRECT flag and to force source validation for the case when forwarding traffic to the input device. If traffic comes from directly connected source we allow redirection as it was done before both changes. Avoid setting RTCF_DOREDIRECT if IN_DEV_TX_REDIRECTS is disabled, this can avoid source address validation and to help caching the routes. After the change "Adjust semantics of rt->rt_gateway" (commit f8126f1d51) we should make sure our ICMP_REDIR_HOST messages contain daddr instead of 0.0.0.0 when target is directly connected. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/route.c | 30 ++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 68c93d1bb03a..825c608826de 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -322,7 +322,8 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (!r && !fib_num_tclassid_users(dev_net(dev))) { + if (!r && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { *itag = 0; return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 132e0dfee53a..b90da1bc2704 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -802,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } @@ -827,7 +828,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE @@ -835,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), - &ip_hdr(skb)->daddr, &rt->rt_gateway); + &ip_hdr(skb)->daddr, &gw); #endif } out_put_peer: @@ -1442,10 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - if (out_dev == in_dev && err && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1462,15 +1468,11 @@ static int __mkroute_input(struct sk_buff *skb, } } - do_cache = false; - if (res->fi) { - if (!itag) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); - if (rt_cache_valid(rth)) { - skb_dst_set_noref(skb, &rth->dst); - goto out; - } - do_cache = true; + if (do_cache) { + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; } } -- cgit v1.2.3 From f8a17175c63fd3e8b573719f7538816f8c96abf4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:17 +0000 Subject: ipv4: make sure nh_pcpu_rth_output is always allocated Avoid checking nh_pcpu_rth_output in fast path, abort fib_info creation on alloc_percpu failure. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 267753060ffc..71b125cd5db1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -840,6 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { nexthop_nh->nh_parent = fi; nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); + if (!nexthop_nh->nh_pcpu_rth_output) + goto failure; } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b90da1bc2704..5b0180f11b20 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1207,8 +1207,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; } else { - if (!nh->nh_pcpu_rth_output) - goto nocache; p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); } orig = *p; @@ -1223,7 +1221,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) * unsuccessful at storing this route into the cache * we really need to set it. */ -nocache: rt->dst.flags |= DST_NOCACHE; ret = false; } -- cgit v1.2.3 From 155e8336c373d14d87a7f91e356d85ef4b93b8f9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:18 +0000 Subject: ipv4: introduce rt_uses_gateway Add new flag to remember when route is via gateway. We will use it to allow rt_gateway to contain address of directly connected host for the cases when DST_NOCACHE is used or when the NH exception caches per-destination route without DST_NOCACHE flag, i.e. when routes are not used for other destinations. By this way we force the neighbour resolving to work with the routed destination but we can use different address in the packet, feature needed for IPVS-DR where original packet for virtual IP is routed via route to real IP. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/route.h | 3 ++- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 4 ++-- net/ipv4/route.c | 48 ++++++++++++++++++++++------------------- net/ipv4/xfrm4_policy.c | 1 + 6 files changed, 34 insertions(+), 28 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/route.h b/include/net/route.h index da22243d2760..bc40b633a5c4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -48,7 +48,8 @@ struct rtable { int rt_genid; unsigned int rt_flags; __u16 rt_type; - __u16 rt_is_input; + __u8 rt_is_input; + __u8 rt_uses_gateway; int rt_iif; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f0c5b9c1a957..d34ce2972c8f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -406,7 +406,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; @@ -442,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 7f35ac26a71a..694de3b7aebf 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gateway) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 24a29a39e9a8..6537a408a4fb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -193,7 +193,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); @@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5b0180f11b20..3a116cb0991a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1126,7 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - if (rt->rt_gateway && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1177,7 +1177,9 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_gateway = fnhe->fnhe_gw; - } + rt->rt_uses_gateway = 1; + } else if (!rt->rt_gateway) + rt->rt_gateway = daddr; orig = rcu_dereference(fnhe->fnhe_rth); rcu_assign_pointer(fnhe->fnhe_rth, rt); @@ -1186,13 +1188,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, fnhe->fnhe_stamp = jiffies; ret = true; - } else { - /* Routes we intend to cache in nexthop exception have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; } spin_unlock_bh(&fnhe_lock); @@ -1215,15 +1210,8 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (prev == orig) { if (orig) rt_free(orig); - } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + } else ret = false; - } return ret; } @@ -1284,8 +1272,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; + } dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; @@ -1294,8 +1284,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) cached = rt_cache_route(nh, rt); - } - if (unlikely(!cached)) + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } + } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID @@ -1363,6 +1363,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; @@ -1432,7 +1433,6 @@ static int __mkroute_input(struct sk_buff *skb, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { @@ -1488,6 +1488,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; @@ -1658,6 +1659,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1826,6 +1828,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -2104,6 +2107,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2182,7 +2186,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gateway && + if (rt->rt_uses_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 681ea2f413e2..05c5ab8d983c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -91,6 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -- cgit v1.2.3 From c92b96553a80c1dbe2ebe128bbe37c8f98f148bf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:19 +0000 Subject: ipv4: Add FLOWI_FLAG_KNOWN_NH Add flag to request that output route should be returned with known rt_gateway, in case we want to use it as nexthop for neighbour resolving. The returned route can be cached as follows: - in NH exception: because the cached routes are not shared with other destinations - in FIB NH: when using gateway because all destinations for NH share same gateway As last option, to return rt_gateway!=0 we have to set DST_NOCACHE. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/ipv4/route.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/flow.h b/include/net/flow.h index e1dd5082ec7e..628e11b98c58 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -21,6 +21,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_CAN_SLEEP 0x02 +#define FLOWI_FLAG_KNOWN_NH 0x04 __u32 flowic_secid; }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3a116cb0991a..1a0da8dc8180 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1762,6 +1762,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct in_device *in_dev; u16 type = res->type; struct rtable *rth; + bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) @@ -1798,24 +1799,36 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } fnhe = NULL; + do_cache = fi != NULL; if (fi) { struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); - fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + fnhe = find_exception(nh, fl4->daddr); if (fnhe) prth = &fnhe->fnhe_rth; - else - prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; } } + +add: rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi); + do_cache); if (!rth) return ERR_PTR(-ENOBUFS); -- cgit v1.2.3 From 68aaed54e7682aef261d5c2cf99e85a9dbf33a84 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 10 Oct 2012 08:27:25 +0000 Subject: ipv4: fix route mark sparse warning Sparse complains about RTA_MARK which is should be host order according to include file and usage in iproute. net/ipv4/route.c:2223:46: warning: incorrect type in argument 3 (different base types) net/ipv4/route.c:2223:46: expected restricted __be32 [usertype] value net/ipv4/route.c:2223:46: got unsigned int [unsigned] [usertype] flowic_mark Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1a0da8dc8180..432f4bb77238 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2220,7 +2220,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; if (fl4->flowi4_mark && - nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark)) + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; error = rt->dst.error; -- cgit v1.2.3 From 13d82bf50dce632355fcccafa4fe44a9b5e706d8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 17 Oct 2012 21:17:44 +0000 Subject: ipv4: Fix flushing of cached routing informations Currently we can not flush cached pmtu/redirect informations via the ipv4_sysctl_rtcache_flush sysctl. We need to check the rt_genid of the old route and reset the nh exeption if the old route is expired when we bind a new route to a nh exeption. Signed-off-by: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 432f4bb77238..a8c651216fa6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1163,8 +1163,12 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { - struct rtable *orig; - + struct rtable *orig = rcu_dereference(fnhe->fnhe_rth); + if (orig && rt_is_expired(orig)) { + fnhe->fnhe_gw = 0; + fnhe->fnhe_pmtu = 0; + fnhe->fnhe_expires = 0; + } if (fnhe->fnhe_pmtu) { unsigned long expires = fnhe->fnhe_expires; unsigned long diff = expires - jiffies; @@ -1181,7 +1185,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, } else if (!rt->rt_gateway) rt->rt_gateway = daddr; - orig = rcu_dereference(fnhe->fnhe_rth); rcu_assign_pointer(fnhe->fnhe_rth, rt); if (orig) rt_free(orig); -- cgit v1.2.3