diff options
Diffstat (limited to 'net/ipv4')
60 files changed, 2113 insertions, 3092 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a5a1050595d1..cbb505ba9324 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER If unsure, say N here. -choice - prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" - depends on IP_ADVANCED_ROUTER - default ASK_IP_FIB_HASH - -config ASK_IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, - June 1999 - - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - <http://www.csc.kth.se/~snilsson/software/dyntrie2/> - -endchoice - -config IP_FIB_HASH - def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER - config IP_FIB_TRIE_STATS bool "FIB TRIE statistics" - depends on IP_FIB_TRIE + depends on IP_ADVANCED_ROUTER ---help--- Keep track of statistics on structure of FIB TRIE table. Useful for testing and measuring TRIE performance. @@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -657,4 +624,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4978d22f9a75..0dc772d0d125 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o \ + fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o -obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 45b89d7bda5a..807d83c02ef6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1101,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly; static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; __be32 old_saddr = inet->inet_saddr; - __be32 new_saddr; __be32 daddr = inet->inet_daddr; + struct rtable *rt; + __be32 new_saddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, 0); - if (err) - return err; + rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, sk->sk_protocol, + inet->inet_sport, inet->inet_dport, sk, false); + if (IS_ERR(rt)) + return PTR_ERR(rt); sk_setup_caps(sk, &rt->dst); @@ -1160,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk) daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; -{ - struct flowi fl = { - .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport, - }; - - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); -} - if (!err) + rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr, + inet->inet_dport, inet->inet_sport, + sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (!IS_ERR(rt)) { + err = 0; sk_setup_caps(sk, &rt->dst); - else { + } else { + err = PTR_ERR(rt); + /* Routing failed... */ sk->sk_route_caps = 0; /* @@ -1231,7 +1219,7 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 86961bec70ab..4286fd3cc0e2 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + if (x->props.flags & XFRM_STATE_ALIGN4) + ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + else + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, 0, skb->len); @@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) - goto out; + if (x->props.flags & XFRM_STATE_ALIGN4) { + if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } else { + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } if (!pskb_may_pull(skb, ah_hlen)) goto out; @@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x) BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ALIGN4) + x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + else + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7927589813b5..090d273d7865 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -433,14 +433,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { - struct flowi fl = { .fl4_dst = sip, - .fl4_src = tip }; struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); - if (ip_route_output_key(net, &rt, &fl) < 0) + rt = ip_route_output(net, sip, tip, 0, 0); + if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); @@ -1061,12 +1060,10 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) @@ -1177,7 +1174,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, static int arp_req_delete(struct net *net, struct arpreq *r, struct net_device *dev) { - int err; __be32 ip; if (r->arp_flags & ATF_PUBL) @@ -1185,12 +1181,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 174be6caa5c8..85bd24ca4f6d 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!saddr) saddr = inet->mc_addr; } - err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, - RT_CONN_FLAGS(sk), oif, - sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, 1); - if (err) { + rt = ip_route_connect(usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->inet_sport, usin->sin_port, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 036652c8166d..5345b0bee6df 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -51,6 +51,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> +#include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -63,6 +64,8 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include "fib_lookup.h" + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE + * value. So if you change this define, make appropriate changes to + * inet_addr_hash as well. + */ +#define IN4_ADDR_HSIZE 256 +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; +static DEFINE_SPINLOCK(inet_addr_hash_lock); + +static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +{ + u32 val = (__force u32) addr ^ hash_ptr(net, 8); + + return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & + (IN4_ADDR_HSIZE - 1)); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ + unsigned int hash = inet_addr_hash(net, ifa->ifa_local); + + spin_lock(&inet_addr_hash_lock); + hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + spin_unlock(&inet_addr_hash_lock); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ + spin_lock(&inet_addr_hash_lock); + hlist_del_init_rcu(&ifa->hash); + spin_unlock(&inet_addr_hash_lock); +} + +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ + unsigned int hash = inet_addr_hash(net, addr); + struct net_device *result = NULL; + struct in_ifaddr *ifa; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; + if (ifa->ifa_local == addr) { + result = dev; + break; + } + } + if (!result) { + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res = { 0 }; + struct fib_table *local; + + /* Fallback to FIB local table so that communication + * over loopback subnets work. + */ + local = fib_get_table(net, RT_TABLE_LOCAL); + if (local && + !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && + res.type == RTN_LOCAL) + result = FIB_RES_DEV(res); + } + if (result && devref) + dev_hold(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(__ip_dev_find); + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } if (!do_promote) { + inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); @@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } } + /* On promotion all secondaries from subnet are changing + * the primary IP, we must remove all their routes silently + * and later to add them back with new prefsrc. Do this + * while all addresses are on the device list. + */ + for (ifa = promote; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) + fib_del_ifaddr(ifa, ifa1); + } + /* 2. Unlink it */ *ifap = ifa1->ifa_next; + inet_hash_remove(ifa1); /* 3. Announce address deletion */ @@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { + struct in_ifaddr *next_sec = promote->ifa_next; if (prev_prom) { prev_prom->ifa_next = promote->ifa_next; @@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_next = *ifap; *ifap = ifa; + inet_hash_insert(dev_net(in_dev->dev), ifa); + /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ @@ -521,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] == NULL) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = ifm->ifa_flags; @@ -728,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); + INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; if (colon) @@ -1084,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct in_ifaddr *ifa = inet_alloc_ifa(); if (ifa) { + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1720,6 +1821,11 @@ static struct rtnl_af_ops inet_af_ops = { void __init devinet_init(void) { + int i; + + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet_addr_lst[i]); + register_pernet_subsys(&devinet_ops); register_gifconf(PF_INET, inet_gifconf); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e42a905180f0..03f994bcf7de 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); * * TODO: Use spare space in skb for this where possible. */ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) { unsigned int len; - len = crypto_aead_ivsize(aead); + len = seqhilen; + + len += crypto_aead_ivsize(aead); + if (len) { len += crypto_aead_alignmask(aead) & ~(crypto_tfm_ctx_alignment() - 1); @@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) return kmalloc(len, GFP_ATOMIC); } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) { return crypto_aead_ivsize(aead) ? - PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; + PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int plen; int tfclen; int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; /* skb is pure payload to encrypt */ @@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) goto error; nfrags = err; - tmp = esp_alloc_tmp(aead, nfrags + 1); + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto error; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_givreq(aead, iv); asg = esp_givreq_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; /* Fill padding... */ tail = skb_tail_pointer(trailer); @@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } esph->spi = x->id.spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, esph->enc_data + crypto_aead_ivsize(aead) - skb->data, clen + alen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_givcrypt_set_callback(req, 0, esp_output_done, skb); aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); + aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output); + XFRM_SKB_CB(skb)->seq.output.low); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); @@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; void *tmp; u8 *iv; struct scatterlist *sg; @@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) goto out; nfrags = err; + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + 1); + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto out; ESP_SKB_CB(skb)->tmp = tmp; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; @@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, sizeof(*esph)); + aead_request_set_assoc(req, asg, assoclen); err = crypto_aead_decrypt(req); if (err == -EINPROGRESS) @@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; err = -ENAMETOOLONG; - if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", - x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1d2cdd43a878..f116ce8f1b46 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_hash_table(RT_TABLE_LOCAL); + local_table = fib_trie_table(RT_TABLE_LOCAL); if (local_table == NULL) return -ENOMEM; - main_table = fib_hash_table(RT_TABLE_MAIN); + main_table = fib_trie_table(RT_TABLE_MAIN); if (main_table == NULL) goto fail; @@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_hash_table(id); + tb = fib_trie_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); @@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -void fib_select_default(struct net *net, - const struct flowi *flp, struct fib_result *res) -{ - struct fib_table *tb; - int table = RT_TABLE_MAIN; -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) - return; - table = res->r->table; -#endif - tb = fib_get_table(net, table); - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - fib_table_select_default(tb, flp, res); -} - static void fib_flush(struct net *net) { int flushed = 0; @@ -147,46 +132,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/** - * __ip_dev_find - find the first device with a given source address. - * @net: the net namespace - * @addr: the source address - * @devref: if true, take a reference on the found device - * - * If a caller uses devref=false, it should be protected by RCU, or RTNL - */ -struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) -{ - struct flowi fl = { - .fl4_dst = addr, - }; - struct fib_result res = { 0 }; - struct net_device *dev = NULL; - struct fib_table *local_table; - -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - - rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || - fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { - rcu_read_unlock(); - return NULL; - } - if (res.type != RTN_LOCAL) - goto out; - dev = FIB_RES_DEV(res); - - if (dev && devref) - dev_hold(dev); -out: - rcu_read_unlock(); - return dev; -} -EXPORT_SYMBOL(__ip_dev_find); - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - struct flowi fl = { .fl4_dst = addr }; + struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned ret = RTN_BROADCAST; struct fib_table *local_table; @@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, if (local_table) { ret = RTN_UNICAST; rcu_read_lock(); - if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, u32 *itag, u32 mark) { struct in_device *in_dev; - struct flowi fl = { - .fl4_dst = src, - .fl4_src = dst, - .fl4_tos = tos, - .mark = mark, - .iif = oif - }; + struct flowi4 fl4; struct fib_result res; int no_addr, rpf, accept_local; bool dev_match; int ret; struct net *net; + fl4.flowi4_oif = 0; + fl4.flowi4_iif = oif; + fl4.flowi4_mark = mark; + fl4.daddr = src; + fl4.saddr = dst; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + no_addr = rpf = accept_local = 0; in_dev = __in_dev_get_rcu(dev); if (in_dev) { @@ -268,20 +215,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, rpf = IN_DEV_RPFILTER(in_dev); accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); if (mark && !IN_DEV_SRC_VMARK(in_dev)) - fl.mark = 0; + fl4.flowi4_mark = 0; } if (in_dev == NULL) goto e_inval; net = dev_net(dev); - if (fib_lookup(net, &fl, &res)) + if (fib_lookup(net, &fl4, &res)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -306,12 +253,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, goto last_resort; if (rpf == 1) goto e_rpf; - fl.oif = dev->ifindex; + fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl, &res) == 0) { + if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } } @@ -775,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. + */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; - struct in_ifaddr *prim = ifa; + struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 @@ -788,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); - else { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; } /* Deletion is more complicated than add. @@ -808,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? */ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. + */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -816,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } } if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -849,11 +871,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) { struct fib_result res; - struct flowi fl = { - .mark = frn->fl_mark, - .fl4_dst = frn->fl_addr, - .fl4_tos = frn->fl_tos, - .fl4_scope = frn->fl_scope, + struct flowi4 fl4 = { + .flowi4_mark = frn->fl_mark, + .daddr = frn->fl_addr, + .flowi4_tos = frn->fl_tos, + .flowi4_scope = frn->fl_scope, }; #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -866,7 +888,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->tb_id = tb->tb_id; rcu_read_lock(); - frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); + frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; @@ -938,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: @@ -945,10 +968,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); + fib_del_ifaddr(ifa, NULL); + atomic_inc(&net->ipv4.dev_addr_genid); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -966,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); @@ -983,6 +1009,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: @@ -1101,5 +1128,5 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_hash_init(); + fib_trie_init(); } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index b3acb0417b21..000000000000 --- a/net/ipv4/fib_hash.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 FIB: lookup engine and maintenance routines. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/ip_fib.h> - -#include "fib_lookup.h" - -static struct kmem_cache *fn_hash_kmem __read_mostly; -static struct kmem_cache *fn_alias_kmem __read_mostly; - -struct fib_node { - struct hlist_node fn_hash; - struct list_head fn_alias; - __be32 fn_key; - struct fib_alias fn_embedded_alias; -}; - -#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - -struct fn_zone { - struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head __rcu *fz_hash; /* Hash table pointer */ - seqlock_t fz_lock; - u32 fz_hashmask; /* (fz_divisor - 1) */ - - u8 fz_order; /* Zone order (0..32) */ - u8 fz_revorder; /* 32 - fz_order */ - __be32 fz_mask; /* inet_make_mask(order) */ -#define FZ_MASK(fz) ((fz)->fz_mask) - - struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; - - int fz_nent; /* Number of entries */ - int fz_divisor; /* Hash size (mask+1) */ -}; - -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone __rcu *fn_zone_list; -}; - -static inline u32 fn_hash(__be32 key, struct fn_zone *fz) -{ - u32 h = ntohl(key) >> fz->fz_revorder; - h ^= (h>>20); - h ^= (h>>10); - h ^= (h>>5); - h &= fz->fz_hashmask; - return h; -} - -static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) -{ - return dst & FZ_MASK(fz); -} - -static unsigned int fib_hash_genid; - -#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) - -static struct hlist_head *fz_hash_alloc(int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); - - return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); -} - -/* The fib hash lock must be held when this is called. */ -static inline void fn_rebuild_zone(struct fn_zone *fz, - struct hlist_head *old_ht, - int old_divisor) -{ - int i; - - for (i = 0; i < old_divisor; i++) { - struct hlist_node *node, *n; - struct fib_node *f; - - hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head *new_head; - - hlist_del_rcu(&f->fn_hash); - - new_head = rcu_dereference_protected(fz->fz_hash, 1) + - fn_hash(f->fn_key, fz); - hlist_add_head_rcu(&f->fn_hash, new_head); - } - } -} - -static void fz_hash_free(struct hlist_head *hash, int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - kfree(hash); - else - free_pages((unsigned long)hash, get_order(size)); -} - -static void fn_rehash_zone(struct fn_zone *fz) -{ - struct hlist_head *ht, *old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - new_divisor = old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case EMBEDDED_HASH_SIZE: - new_divisor *= EMBEDDED_HASH_SIZE; - break; - case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: - new_divisor *= (EMBEDDED_HASH_SIZE/2); - break; - default: - if ((old_divisor << 1) > FZ_MAX_DIVISOR) { - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } - new_divisor = (old_divisor << 1); - break; - } - - new_hashmask = (new_divisor - 1); - -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", - fz->fz_order, old_divisor); -#endif - - ht = fz_hash_alloc(new_divisor); - - if (ht) { - struct fn_zone nfz; - - memcpy(&nfz, fz, sizeof(nfz)); - - write_seqlock_bh(&fz->fz_lock); - old_ht = rcu_dereference_protected(fz->fz_hash, 1); - RCU_INIT_POINTER(nfz.fz_hash, ht); - nfz.fz_hashmask = new_hashmask; - nfz.fz_divisor = new_divisor; - fn_rebuild_zone(&nfz, old_ht, old_divisor); - fib_hash_genid++; - rcu_assign_pointer(fz->fz_hash, ht); - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - write_sequnlock_bh(&fz->fz_lock); - - if (old_ht != fz->fz_embedded_hash) { - synchronize_rcu(); - fz_hash_free(old_ht, old_divisor); - } - } -} - -static void fn_free_node_rcu(struct rcu_head *head) -{ - struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); - - kmem_cache_free(fn_hash_kmem, f); -} - -static inline void fn_free_node(struct fib_node *f) -{ - call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); -} - -static void fn_free_alias_rcu(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - - kmem_cache_free(fn_alias_kmem, fa); -} - -static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) -{ - fib_release_info(fa->fa_info); - if (fa == &f->fn_embedded_alias) - fa->fa_info = NULL; - else - call_rcu(&fa->rcu, fn_free_alias_rcu); -} - -static struct fn_zone * -fn_new_zone(struct fn_hash *table, int z) -{ - int i; - struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); - if (!fz) - return NULL; - - seqlock_init(&fz->fz_lock); - fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; - fz->fz_hashmask = fz->fz_divisor - 1; - RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); - fz->fz_order = z; - fz->fz_revorder = 32 - z; - fz->fz_mask = inet_make_mask(z); - - /* Find the first not empty zone with more specific mask */ - for (i = z + 1; i <= 32; i++) - if (table->fn_zones[i]) - break; - if (i > 32) { - /* No more specific masks, we are the first. */ - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zone_list)); - rcu_assign_pointer(table->fn_zone_list, fz); - } else { - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zones[i]->fz_next)); - rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); - } - table->fn_zones[z] = fz; - fib_hash_genid++; - return fz; -} - -int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res, - int fib_flags) -{ - int err; - struct fn_zone *fz; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - - rcu_read_lock(); - for (fz = rcu_dereference(t->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; - struct hlist_node *node; - struct fib_node *f; - __be32 k; - unsigned int seq; - - do { - seq = read_seqbegin(&fz->fz_lock); - k = fz_key(flp->fl4_dst, fz); - - head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; - - err = fib_semantic_match(&f->fn_alias, - flp, res, - fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } - } while (read_seqretry(&fz->fz_lock, seq)); - } - err = 1; -out: - rcu_read_unlock(); - return err; -} - -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) -{ - int order, last_idx; - struct hlist_node *node; - struct fib_node *f; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; - struct hlist_head *head; - - if (fz == NULL) - return; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - head = rcu_dereference(fz->fz_hash); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - } - - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - -/* Insert node F to FZ. */ -static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - - hlist_add_head_rcu(&f->fn_hash, head); -} - -/* Return the node in FZ matching KEY. */ -static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); - struct hlist_node *node; - struct fib_node *f; - - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key == key) - return f; - } - - return NULL; -} - - -static struct fib_alias *fib_fast_alloc(struct fib_node *f) -{ - struct fib_alias *fa = &f->fn_embedded_alias; - - if (fa->fa_info != NULL) - fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - return fa; -} - -/* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fib_node *new_f = NULL; - struct fib_node *f; - struct fib_alias *fa, *new_fa; - struct fn_zone *fz; - struct fib_info *fi; - u8 tos = cfg->fc_tos; - __be32 key; - int err; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - fz = table->fn_zones[cfg->fc_dst_len]; - if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) - return -ENOBUFS; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - fi = fib_create_info(cfg); - if (IS_ERR(fi)) - return PTR_ERR(fi); - - if (fz->fz_nent > (fz->fz_divisor<<1) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (cfg->fc_dst_len == 32 || - (1 << cfg->fc_dst_len) > fz->fz_divisor)) - fn_rehash_zone(fz); - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); - - /* Now fa, if non-NULL, points to the first fib alias - * with the same keys [prefix,tos,priority], if such key already - * exists or to the node before which we will insert new one. - * - * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. - */ - - if (fa && fa->fa_tos == tos && - fa->fa_info->fib_priority == fi->fib_priority) { - struct fib_alias *fa_first, *fa_match; - - err = -EEXIST; - if (cfg->fc_nlflags & NLM_F_EXCL) - goto out; - - /* We have 2 goals: - * 1. Find exact match for type, scope, fib_info to avoid - * duplicate routes - * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it - */ - fa_match = NULL; - fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - if (fa->fa_tos != tos) - break; - if (fa->fa_info->fib_priority != fi->fib_priority) - break; - if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { - fa_match = fa; - break; - } - } - - if (cfg->fc_nlflags & NLM_F_REPLACE) { - u8 state; - - fa = fa_first; - if (fa_match) { - if (fa == fa_match) - err = 0; - goto out; - } - err = -ENOBUFS; - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_tos = fa->fa_tos; - new_fa->fa_info = fi; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; - new_fa->fa_state = state & ~FA_S_ACCESSED; - fib_hash_genid++; - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - - fn_free_alias(fa, f); - if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); - return 0; - } - - /* Error if we find a perfect match which - * uses the same scope, type, and nexthop - * information. - */ - if (fa_match) - goto out; - - if (!(cfg->fc_nlflags & NLM_F_APPEND)) - fa = fa_first; - } - - err = -ENOENT; - if (!(cfg->fc_nlflags & NLM_F_CREATE)) - goto out; - - err = -ENOBUFS; - - if (!f) { - new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - INIT_HLIST_NODE(&new_f->fn_hash); - INIT_LIST_HEAD(&new_f->fn_alias); - new_f->fn_key = key; - f = new_f; - } - - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_info = fi; - new_fa->fa_tos = tos; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; - - /* - * Insert new entry to the list. - */ - - if (new_f) - fib_insert_node(fz, new_f); - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : &f->fn_alias)); - fib_hash_genid++; - - if (new_f) - fz->fz_nent++; - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, 0); - return 0; - -out: - if (new_f) - kmem_cache_free(fn_hash_kmem, new_f); - fib_release_info(fi); - return err; -} - -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - struct fib_node *f; - struct fib_alias *fa, *fa_to_delete; - struct fn_zone *fz; - __be32 key; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL) - return -ESRCH; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); - if (!fa) - return -ESRCH; - - fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fa->fa_tos != cfg->fc_tos) - break; - - if ((!cfg->fc_type || - fa->fa_type == cfg->fc_type) && - (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && - (!cfg->fc_protocol || - fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { - fa_to_delete = fa; - break; - } - } - - if (fa_to_delete) { - int kill_fn; - - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, 0); - - kill_fn = 0; - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_fn = 1; - } - fib_hash_genid++; - - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - fn_free_alias(fa, f); - if (kill_fn) { - fn_free_node(f); - fz->fz_nent--; - } - - return 0; - } - return -ESRCH; -} - -static int fn_flush_list(struct fn_zone *fz, int idx) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; - struct hlist_node *node, *n; - struct fib_node *f; - int found = 0; - - hlist_for_each_entry_safe(f, node, n, head, fn_hash) { - struct fib_alias *fa, *fa_node; - int kill_f; - - kill_f = 0; - list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_f = 1; - } - fib_hash_genid++; - - fn_free_alias(fa, f); - found++; - } - } - if (kill_f) { - fn_free_node(f); - fz->fz_nent--; - } - } - return found; -} - -/* caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz; - int found = 0; - - for (fz = rtnl_dereference(table->fn_zone_list); - fz != NULL; - fz = rtnl_dereference(fz->fz_next)) { - int i; - - for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); - } - return found; -} - -void fib_free_table(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz, *next; - - next = table->fn_zone_list; - while (next != NULL) { - fz = next; - next = fz->fz_next; - - if (fz->fz_hash != fz->fz_embedded_hash) - fz_hash_free(fz->fz_hash, fz->fz_divisor); - - kfree(fz); - } - - kfree(tb); -} - -static inline int -fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz, - struct hlist_head *head) -{ - struct hlist_node *node; - struct fib_node *f; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - if (i < s_i) - goto next; - - if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - fa->fa_scope, - f->fn_key, - fz->fz_order, - fa->fa_tos, - fa->fa_info, - NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } -next: - i++; - } - } - cb->args[4] = i; - return skb->len; -} - -static inline int -fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz) -{ - int h, s_h; - struct hlist_head *head = rcu_dereference(fz->fz_hash); - - if (head == NULL) - return skb->len; - s_h = cb->args[3]; - for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(head + h)) - continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { - cb->args[3] = h; - return -1; - } - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - } - cb->args[3] = h; - return skb->len; -} - -int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m = 0, s_m; - struct fn_zone *fz; - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - - s_m = cb->args[2]; - rcu_read_lock(); - for (fz = rcu_dereference(table->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next), m++) { - if (m < s_m) - continue; - if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { - cb->args[2] = m; - rcu_read_unlock(); - return -1; - } - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); - } - rcu_read_unlock(); - cb->args[2] = m; - return skb->len; -} - -void __init fib_hash_init(void) -{ - fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); - - fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); - -} - -struct fib_table *fib_hash_table(u32 id) -{ - struct fib_table *tb; - - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), - GFP_KERNEL); - if (tb == NULL) - return NULL; - - tb->tb_id = id; - tb->tb_default = -1; - - memset(tb->tb_data, 0, sizeof(struct fn_hash)); - return tb; -} - -/* ------------------------------------------------------------------------ */ -#ifdef CONFIG_PROC_FS - -struct fib_iter_state { - struct seq_net_private p; - struct fn_zone *zone; - int bucket; - struct hlist_head *hash_head; - struct fib_node *fn; - struct fib_alias *fa; - loff_t pos; - unsigned int genid; - int valid; -}; - -static struct fib_alias *fib_get_first(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_table *main_table; - struct fn_hash *table; - - main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); - table = (struct fn_hash *)main_table->tb_data; - - iter->bucket = 0; - iter->hash_head = NULL; - iter->fn = NULL; - iter->fa = NULL; - iter->pos = 0; - iter->genid = fib_hash_genid; - iter->valid = 1; - - for (iter->zone = rcu_dereference(table->fn_zone_list); - iter->zone != NULL; - iter->zone = rcu_dereference(iter->zone->fz_next)) { - int maxslot; - - if (!iter->zone->fz_nent) - continue; - - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - maxslot = iter->zone->fz_divisor; - - for (iter->bucket = 0; iter->bucket < maxslot; - ++iter->bucket, ++iter->hash_head) { - struct hlist_node *node; - struct fib_node *fn; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - } -out: - return iter->fa; -} - -static struct fib_alias *fib_get_next(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_node *fn; - struct fib_alias *fa; - - /* Advance FA, if any. */ - fn = iter->fn; - fa = iter->fa; - if (fa) { - BUG_ON(!fn); - list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - - fa = iter->fa = NULL; - - /* Advance FN. */ - if (fn) { - struct hlist_node *node = &fn->fn_hash; - hlist_for_each_entry_continue(fn, node, fn_hash) { - iter->fn = fn; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - } - - fn = iter->fn = NULL; - - /* Advance hash chain. */ - if (!iter->zone) - goto out; - - for (;;) { - struct hlist_node *node; - int maxslot; - - maxslot = iter->zone->fz_divisor; - - while (++iter->bucket < maxslot) { - iter->hash_head++; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - - iter->zone = rcu_dereference(iter->zone->fz_next); - - if (!iter->zone) - goto out; - - iter->bucket = 0; - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } -out: - iter->pos++; - return fa; -} - -static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -{ - struct fib_iter_state *iter = seq->private; - struct fib_alias *fa; - - if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { - fa = iter->fa; - pos -= iter->pos; - } else - fa = fib_get_first(seq); - - if (fa) - while (pos && (fa = fib_get_next(seq))) - --pos; - return pos ? NULL : fa; -} - -static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - void *v = NULL; - - rcu_read_lock(); - if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) - v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; -} - -static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); -} - -static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -{ - static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, - [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; - - if (fi && fi->fib_nh->nh_gw) - flags |= RTF_GATEWAY; - if (mask == htonl(0xFFFFFFFF)) - flags |= RTF_HOST; - flags |= RTF_UP; - return flags; -} - -/* - * This outputs /proc/net/route. - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ -static int fib_seq_show(struct seq_file *seq, void *v) -{ - struct fib_iter_state *iter; - int len; - __be32 prefix, mask; - unsigned flags; - struct fib_node *f; - struct fib_alias *fa; - struct fib_info *fi; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " - "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" - "\tWindow\tIRTT"); - goto out; - } - - iter = seq->private; - f = iter->fn; - fa = iter->fa; - fi = fa->fa_info; - prefix = f->fn_key; - mask = FZ_MASK(iter->zone); - flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, - mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3, &len); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); -out: - return 0; -} - -static const struct seq_operations fib_seq_ops = { - .start = fib_seq_start, - .next = fib_seq_next, - .stop = fib_seq_stop, - .show = fib_seq_show, -}; - -static int fib_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_seq_ops, - sizeof(struct fib_iter_state)); -} - -static const struct file_operations fib_seq_fops = { - .owner = THIS_MODULE, - .open = fib_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -int __net_init fib_proc_init(struct net *net) -{ - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) - return -ENOMEM; - return 0; -} - -void __net_exit fib_proc_exit(struct net *net) -{ - proc_net_remove(net, "route"); -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c079cc0ec651..af0f14aba169 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias { struct fib_info *fa_info; u8 fa_tos; u8 fa_type; - u8 fa_scope; u8 fa_state; struct rcu_head rcu; }; @@ -25,14 +24,11 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern int fib_semantic_match(struct list_head *head, - const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, @@ -51,4 +47,11 @@ static inline void fib_result_assign(struct fib_result *res, res->fi = fi; } +struct fib_prop { + int error; + u8 scope; +}; + +extern const struct fib_prop fib_props[RTN_MAX + 1]; + #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7b..a53bb1b5b118 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,19 +41,19 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE -u32 fib_rules_tclass(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_CLASSID +u32 fib_rules_tclass(const struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; } #endif -int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) +int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) }; int err; - err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); res->r = arg.rule; return err; @@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if (!tbl) goto errout; - err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); + err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); if (err > 0) err = -EAGAIN; errout: @@ -106,14 +106,15 @@ errout: static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; - __be32 daddr = fl->fl4_dst; - __be32 saddr = fl->fl4_src; + struct flowi4 *fl4 = &fl->u.ip4; + __be32 daddr = fl4->daddr; + __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; - if (r->tos && (r->tos != fl->fl4_tos)) + if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; return 1; @@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3df1b7..641a5a2a9f9c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -49,7 +49,7 @@ static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_hash_size; +static unsigned int fib_info_hash_size; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 @@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define endfor_nexthops(fi) } -static const struct -{ - int error; - u8 scope; -} fib_props[RTN_MAX + 1] = { +const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, @@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + if (fi->fib_metrics != (u32 *) dst_default_metrics) + kfree(fi->fib_metrics); kfree(fi); } @@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -221,10 +219,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; - val ^= fi->fib_protocol; + val ^= (fi->fib_protocol << 8) | fi->fib_scope; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { @@ -250,10 +248,11 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, - sizeof(fi->fib_metrics)) == 0 && + sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; @@ -330,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, goto errout; err = fib_dump_info(skb, info->pid, seq, event, tb_id, - fa->fa_type, fa->fa_scope, key, dst_len, + fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -422,7 +421,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +475,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -562,16 +561,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { - struct flowi fl = { - .fl4_dst = nh->nh_gw, - .fl4_scope = cfg->fc_scope + 1, - .oif = nh->nh_oif, + struct flowi4 fl4 = { + .daddr = nh->nh_gw, + .flowi4_scope = cfg->fc_scope + 1, + .flowi4_oif = nh->nh_oif, }; /* It is not necessary, but requires a bit of thinking */ - if (fl.fl4_scope < RT_SCOPE_LINK) - fl.fl4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl, &res); + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + err = fib_lookup(net, &fl4, &res); if (err) { rcu_read_unlock(); return err; @@ -613,14 +612,14 @@ out: static inline unsigned int fib_laddr_hashfn(__be32 val) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; } -static struct hlist_head *fib_hash_alloc(int bytes) +static struct hlist_head *fib_info_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) return kzalloc(bytes, GFP_KERNEL); @@ -630,7 +629,7 @@ static struct hlist_head *fib_hash_alloc(int bytes) get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +static void fib_info_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -641,18 +640,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes) free_pages((unsigned long) hash, get_order(bytes)); } -static void fib_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) { struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_hash_size; + unsigned int old_size = fib_info_hash_size; unsigned int i, bytes; spin_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; - fib_hash_size = new_size; + fib_info_hash_size = new_size; for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; @@ -693,8 +692,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash, spin_unlock_bh(&fib_info_lock); bytes = old_size * sizeof(struct hlist_head *); - fib_hash_free(old_info_hash, bytes); - fib_hash_free(old_laddrhash, bytes); + fib_info_hash_free(old_info_hash, bytes); + fib_info_hash_free(old_laddrhash, bytes); +} + +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_parent->fib_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; } struct fib_info *fib_create_info(struct fib_config *cfg) @@ -705,6 +714,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + if (cfg->fc_type > RTN_MAX) + goto err_inval; + /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; @@ -718,8 +730,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #endif err = -ENOBUFS; - if (fib_info_cnt >= fib_hash_size) { - unsigned int new_size = fib_hash_size << 1; + if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; unsigned int bytes; @@ -727,25 +739,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_size) new_size = 1; bytes = new_size * sizeof(struct hlist_head *); - new_info_hash = fib_hash_alloc(bytes); - new_laddrhash = fib_hash_alloc(bytes); + new_info_hash = fib_info_hash_alloc(bytes); + new_laddrhash = fib_info_hash_alloc(bytes); if (!new_info_hash || !new_laddrhash) { - fib_hash_free(new_info_hash, bytes); - fib_hash_free(new_laddrhash, bytes); + fib_info_hash_free(new_info_hash, bytes); + fib_info_hash_free(new_laddrhash, bytes); } else - fib_hash_move(new_info_hash, new_laddrhash, new_size); + fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - if (!fib_hash_size) + if (!fib_info_hash_size) goto failure; } fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + if (cfg->fc_mx) { + fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!fi->fib_metrics) + goto failure; + } else + fi->fib_metrics = (u32 *) dst_default_metrics; fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; @@ -779,7 +798,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -792,7 +811,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -804,6 +823,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) goto err_inval; goto link_it; + } else { + switch (cfg->fc_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + break; + default: + goto err_inval; + } } if (cfg->fc_scope > RT_SCOPE_HOST) @@ -835,6 +865,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; } + change_nexthops(fi) { + fib_info_update_nh_saddr(net, nexthop_nh); + } endfor_nexthops(fi) + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -880,86 +914,8 @@ failure: return ERR_PTR(err); } -/* Note! fib_semantic_match intentionally uses RCU list functions. */ -int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags) -{ - struct fib_alias *fa; - int nh_sel = 0; - - list_for_each_entry_rcu(fa, head, fa_list) { - int err; - - if (fa->fa_tos && - fa->fa_tos != flp->fl4_tos) - continue; - - if (fa->fa_scope < flp->fl4_scope) - continue; - - fib_alias_accessed(fa); - - err = fib_props[fa->fa_type].error; - if (err == 0) { - struct fib_info *fi = fa->fa_info; - - if (fi->fib_flags & RTNH_F_DEAD) - continue; - - switch (fa->fa_type) { - case RTN_UNICAST: - case RTN_LOCAL: - case RTN_BROADCAST: - case RTN_ANYCAST: - case RTN_MULTICAST: - for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (!flp->oif || flp->oif == nh->nh_oif) - break; - } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (nhsel < fi->fib_nhs) { - nh_sel = nhsel; - goto out_fill_res; - } -#else - if (nhsel < 1) - goto out_fill_res; -#endif - endfor_nexthops(fi); - continue; - - default: - pr_warning("fib_semantic_match bad type %#x\n", - fa->fa_type); - return -EINVAL; - } - } - return err; - } - return 1; - -out_fill_res: - res->prefixlen = prefixlen; - res->nh_sel = nh_sel; - res->type = fa->fa_type; - res->scope = fa->fa_scope; - res->fi = fa->fa_info; - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&res->fi->fib_clntref); - return 0; -} - -/* Find appropriate source address to this destination */ - -__be32 __fib_res_prefsrc(struct fib_result *res) -{ - return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); -} - int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; @@ -981,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; + rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len) @@ -1002,7 +958,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1027,7 +983,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif @@ -1125,6 +1081,62 @@ int fib_sync_down_dev(struct net_device *dev, int force) return ret; } +/* Must be invoked inside of an RCU protected region. */ +void fib_select_default(struct fib_result *res) +{ + struct fib_info *fi = NULL, *last_resort = NULL; + struct list_head *fa_head = res->fa_head; + struct fib_table *tb = res->table; + int order = -1, last_idx = -1; + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (next_fi->fib_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + + fib_alias_accessed(fa); + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + fi = next_fi; + order++; + } + + if (order <= 0 || fi == NULL) { + tb->tb_default = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: + return; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH /* @@ -1189,7 +1201,7 @@ int fib_sync_up(struct net_device *dev) * The algorithm is suboptimal, but it provides really * fair weighted route distribution. */ -void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; int w; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0f280348e0fd..b92c86f6e9b3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -95,7 +95,7 @@ typedef unsigned int t_key; #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) -struct node { +struct rt_trie_node { unsigned long parent; t_key key; }; @@ -126,7 +126,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct node *child[0]; + struct rt_trie_node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,16 +151,16 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct rt_trie_node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); -static struct node *resize(struct trie *t, struct tnode *tn); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ @@ -177,12 +177,12 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct node *node) +static inline struct tnode *node_parent(struct rt_trie_node *node) { return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct node *node) +static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) { struct tnode *ret = node_parent(node); @@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node) /* Same as rcu_assign_pointer * but that macro() assumes that value is a pointer. */ -static inline void node_set_parent(struct node *node, struct tnode *ptr) +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) { smp_wmb(); node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); return tn->child[i]; } -static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { - struct node *ret = tnode_get_child(tn, i); + struct rt_trie_node *ret = tnode_get_child(tn, i); return rcu_dereference_rtnl(ret); } @@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } -static inline t_key mask_pfx(t_key k, unsigned short l) +static inline t_key mask_pfx(t_key k, unsigned int l) { return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); } -static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) { if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); @@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); size_t size = sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn) tn->tnode_free = tnode_free_head; tnode_free_head = tn; tnode_free_size += sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); } static void tnode_free_flush(void) @@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); + size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { @@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct node) << bits); + sizeof(struct rt_trie_node) << bits); return tn; } @@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) { if (n == NULL || IS_LEAF(n)) return 0; @@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) } static inline void put_child(struct trie *t, struct tnode *tn, int i, - struct node *n) + struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct node *chi = tn->child[i]; + struct rt_trie_node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); @@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, } #define MAX_WORK 10 -static struct node *resize(struct trie *t, struct tnode *tn) +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) { int i; struct tnode *old_tn; @@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node *)tn)) { + if (!node_parent((struct rt_trie_node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; } else { @@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return (struct node *) tn; + return (struct rt_trie_node *) tn; /* * Halve as long as the number of empty children in this @@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) { one_child: for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; + struct rt_trie_node *n; n = tn->child[i]; if (!n) @@ -676,7 +676,7 @@ one_child: return n; } } - return (struct node *) tn; + return (struct rt_trie_node *) tn; } static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct node *) left); - put_child(t, tn, 2*i+1, (struct node *) right); + put_child(t, tn, 2*i, (struct rt_trie_node *) left); + put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); } } for (i = 0; i < olen; i++) { struct tnode *inode; - struct node *node = tnode_get_child(oldtnode, i); + struct rt_trie_node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -825,7 +825,7 @@ nomem: static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; - struct node *left, *right; + struct rt_trie_node *left, *right; int i; int olen = tnode_child_length(tn); @@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct node *)newn); + put_child(t, tn, i/2, (struct rt_trie_node *)newn); } } @@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key) { int pos; struct tnode *tn; - struct node *n; + struct rt_trie_node *n; pos = 0; n = rcu_dereference_rtnl(t->trie); @@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; - while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { + while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize(t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex, - (struct node *)tn, wasfull); + (struct rt_trie_node *)tn, wasfull); - tp = node_parent((struct node *) tn); + tp = node_parent((struct rt_trie_node *) tn); if (!tp) - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); if (!tp) @@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); } @@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; - struct node *n; + struct rt_trie_node *n; struct leaf *l; int missbit; struct list_head *fa_head = NULL; @@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - node_set_parent((struct node *)l, tp); + node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - node_set_parent((struct node *)tn, tp); + node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, missbit, (struct rt_trie_node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, - (struct node *)tn); + (struct rt_trie_node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; } } @@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && fa->fa_info == fi) { fa_match = fa; break; @@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; /* * Insert new entry to the list. @@ -1340,8 +1337,8 @@ err: } /* should be called with rcu_read_lock */ -static int check_leaf(struct trie *t, struct leaf *l, - t_key key, const struct flowi *flp, +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, + t_key key, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct leaf_info *li; @@ -1349,40 +1346,75 @@ static int check_leaf(struct trie *t, struct leaf *l, struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - int err; + struct fib_alias *fa; int plen = li->plen; __be32 mask = inet_make_mask(plen); if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - if (err <= 0) - t->stats.semantic_match_passed++; - else - t->stats.semantic_match_miss++; + t->stats.semantic_match_passed++; +#endif + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + res->prefixlen = plen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fa->fa_info->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &li->falh; + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); + return 0; + } + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_miss++; #endif - if (err <= 0) - return err; } return 1; } -int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, +int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; int ret; - struct node *n; + struct rt_trie_node *n; struct tnode *pn; - int pos, bits; - t_key key = ntohl(flp->fl4_dst); - int chopped_off; + unsigned int pos, bits; + t_key key = ntohl(flp->daddr); + unsigned int chopped_off; t_key cindex = 0; - int current_prefix_length = KEYLENGTH; + unsigned int current_prefix_length = KEYLENGTH; struct tnode *cn; t_key pref_mismatch; @@ -1398,7 +1430,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1423,7 +1455,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; @@ -1541,7 +1573,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent_rcu((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); if (!parent) goto failed; @@ -1568,7 +1600,7 @@ found: */ static void trie_leaf_remove(struct trie *t, struct leaf *l) { - struct tnode *tp = node_parent((struct node *) l); + struct tnode *tp = node_parent((struct rt_trie_node *) l); pr_debug("entering trie_leaf_remove(%p)\n", l); @@ -1629,7 +1661,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && + fa->fa_info->fib_scope == cfg->fc_scope) && + (!cfg->fc_prefsrc || + fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(cfg, fi) == 0) { @@ -1706,7 +1740,7 @@ static int trie_flush_leaf(struct leaf *l) * Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) { do { t_key idx; @@ -1732,7 +1766,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) } /* Node empty, walk back up to parent */ - c = (struct node *) p; + c = (struct rt_trie_node *) p; } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ @@ -1753,7 +1787,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { - struct node *c = (struct node *) l; + struct rt_trie_node *c = (struct rt_trie_node *) l; struct tnode *p = node_parent_rcu(c); if (!p) @@ -1802,80 +1836,6 @@ void fib_free_table(struct fib_table *tb) kfree(tb); } -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) -{ - struct trie *t = (struct trie *) tb->tb_data; - int order, last_idx; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fib_alias *fa = NULL; - struct list_head *fa_head; - struct leaf *l; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - - l = fib_find_node(t, 0); - if (!l) - goto out; - - fa_head = get_fa_head(l, 0); - if (!fa_head) - goto out; - - if (list_empty(fa_head)) - goto out; - - list_for_each_entry_rcu(fa, fa_head, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) @@ -1900,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, RTM_NEWROUTE, tb->tb_id, fa->fa_type, - fa->fa_scope, xkey, plen, fa->fa_tos, @@ -1990,7 +1949,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, return skb->len; } -void __init fib_hash_init(void) +void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), @@ -2003,8 +1962,7 @@ void __init fib_hash_init(void) } -/* Fix more generic FIB names for init later */ -struct fib_table *fib_hash_table(u32 id) +struct fib_table *fib_trie_table(u32 id) { struct fib_table *tb; struct trie *t; @@ -2036,7 +1994,7 @@ struct fib_trie_iter { unsigned int depth; }; -static struct node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; unsigned int cindex = iter->index; @@ -2050,7 +2008,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child_rcu(tn, cindex); + struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2069,7 +2027,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent_rcu((struct node *)tn); + p = node_parent_rcu((struct rt_trie_node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2081,10 +2039,10 @@ rescan: return NULL; } -static struct node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct node *n; + struct rt_trie_node *n; if (!t) return NULL; @@ -2108,7 +2066,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct node *n; + struct rt_trie_node *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -2181,7 +2139,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct node *) * pointers; + bytes += sizeof(struct rt_trie_node *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2262,7 +2220,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2275,7 +2233,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { - struct node *n; + struct rt_trie_node *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2304,7 +2262,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct node *n; + struct rt_trie_node *n; ++*pos; /* next node in same table */ @@ -2390,7 +2348,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct node *n = v; + struct rt_trie_node *n = v; if (!node_parent_rcu(n)) fib_table_print(seq, iter->tb); @@ -2422,7 +2380,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth+1); seq_printf(seq, " /%d %s %s", li->plen, rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), + fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_tos) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4aa1b7f01ea0..a91dc1611081 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk) * Send an ICMP frame. */ -/* - * Check transmit rate limitation for given message. - * The rate information is held in the destination cache now. - * This function is generic and could be used for other purposes - * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - * Note that the same dst_entry fields are modified by functions in - * route.c too, but these work for packet destinations while xrlim_allow - * works for icmp destinations. This means the rate limiting information - * for one "ip object" is shared - and these ICMPs are twice limited: - * by source and by destination. - * - * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits - * - * Shared between ICMPv4 and ICMPv6. - */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ - unsigned long now, token = dst->rate_tokens; - int rc = 0; - - now = jiffies; - token += now - dst->rate_last; - dst->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; - if (token >= timeout) { - token -= timeout; - rc = 1; - } - dst->rate_tokens = token; - return rc; -} -EXPORT_SYMBOL(xrlim_allow); - -static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { struct dst_entry *dst = &rt->dst; - int rc = 1; + bool rc = true; if (type > NR_ICMP_TYPES) goto out; @@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) - rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + if (!rt->peer) + rt_bind_peer(rt, 1); + rc = inet_peer_xrlim_allow(rt->peer, + net->ipv4.sysctl_icmp_ratelimit); + } out: return rc; } @@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) daddr = icmp_param->replyopts.faddr; } { - struct flowi fl = { .fl4_dst= daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .proto = IPPROTO_ICMP }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(net, &rt, &fl)) + struct flowi4 fl4 = { + .daddr = daddr, + .saddr = rt->rt_spec_dst, + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_proto = IPPROTO_ICMP, + }; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) goto out_unlock; } if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, @@ -402,6 +372,97 @@ out_unlock: icmp_xmit_unlock(sk); } +static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, + struct iphdr *iph, + __be32 saddr, u8 tos, + int type, int code, + struct icmp_bxm *param) +{ + struct flowi4 fl4 = { + .daddr = (param->replyopts.srr ? + param->replyopts.faddr : iph->saddr), + .saddr = saddr, + .flowi4_tos = RT_TOS(tos), + .flowi4_proto = IPPROTO_ICMP, + .fl4_icmp_type = type, + .fl4_icmp_code = code, + }; + struct rtable *rt, *rt2; + int err; + + security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4)); + rt = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return rt; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + if (!fl4.saddr) + fl4.saddr = rt->rt_src; + + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(&fl4), NULL, 0); + if (!IS_ERR(rt)) { + if (rt != rt2) + return rt; + } else if (PTR_ERR(rt) == -EPERM) { + rt = NULL; + } else + return rt; + + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET); + if (err) + goto relookup_failed; + + if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt2)) + err = PTR_ERR(rt2); + } else { + struct flowi4 fl4_2 = {}; + unsigned long orefdst; + + fl4_2.daddr = fl4.saddr; + rt2 = ip_route_output_key(net, &fl4_2); + if (IS_ERR(rt2)) { + err = PTR_ERR(rt2); + goto relookup_failed; + } + /* Ugh! */ + orefdst = skb_in->_skb_refdst; /* save old refdst */ + err = ip_route_input(skb_in, fl4.daddr, fl4.saddr, + RT_TOS(tos), rt2->dst.dev); + + dst_release(&rt2->dst); + rt2 = skb_rtable(skb_in); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ + } + + if (err) + goto relookup_failed; + + rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, + flowi4_to_flowi(&fl4), NULL, + XFRM_LOOKUP_ICMP); + if (!IS_ERR(rt2)) { + dst_release(&rt->dst); + rt = rt2; + } else if (PTR_ERR(rt2) == -EPERM) { + if (rt) + dst_release(&rt->dst); + return rt2; + } else { + err = PTR_ERR(rt2); + goto relookup_failed; + } + return rt; + +relookup_failed: + if (rt) + return rt; + return ERR_PTR(err); +} /* * Send an ICMP message in response to a situation @@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->rt_iif); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.opt = &icmp_param.replyopts; ipc.tx_flags = 0; - { - struct flowi fl = { - .fl4_dst = icmp_param.replyopts.srr ? - icmp_param.replyopts.faddr : iph->saddr, - .fl4_src = saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_ICMP, - .fl_icmp_type = type, - .fl_icmp_code = code, - }; - int err; - struct rtable *rt2; - - security_skb_classify_flow(skb_in, &fl); - if (__ip_route_output_key(net, &rt, &fl)) - goto out_unlock; - - /* No need to clone since we're just using its address. */ - rt2 = rt; - - if (!fl.nl_u.ip4_u.saddr) - fl.nl_u.ip4_u.saddr = rt->rt_src; - - err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); - switch (err) { - case 0: - if (rt != rt2) - goto route_done; - break; - case -EPERM: - rt = NULL; - break; - default: - goto out_unlock; - } - - if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) - goto relookup_failed; - - if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) - err = __ip_route_output_key(net, &rt2, &fl); - else { - struct flowi fl2 = {}; - unsigned long orefdst; - - fl2.fl4_dst = fl.fl4_src; - if (ip_route_output_key(net, &rt2, &fl2)) - goto relookup_failed; - - /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->dst.dev); - - dst_release(&rt2->dst); - rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ - } - - if (err) - goto relookup_failed; - - err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, - XFRM_LOOKUP_ICMP); - switch (err) { - case 0: - dst_release(&rt->dst); - rt = rt2; - break; - case -EPERM: - goto ende; - default: -relookup_failed: - if (!rt) - goto out_unlock; - break; - } - } + rt = icmp_route_lookup(net, skb_in, iph, saddr, tos, + type, code, &icmp_param); + if (IS_ERR(rt)) + goto out_unlock; -route_done: if (!icmpv4_xrlim_allow(net, rt, type, code)) goto ende; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e0e77e297de3..1fd3d9ce8398 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) } igmp_skb_size(skb) = size; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = IGMPV3_ALL_MCR, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) { - kfree_skb(skb); - return NULL; - } + rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) { + kfree_skb(skb); + return NULL; } if (rt->rt_src == 0) { kfree_skb(skb); @@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, else dst = group; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = dst, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) - return -1; - } + rt = ip_route_output_ports(net, NULL, dst, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) + return -1; + if (rt->rt_src == 0) { ip_rt_put(rt); return -1; @@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { - struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr }; - struct rtable *rt; struct net_device *dev = NULL; struct in_device *idev = NULL; @@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) return NULL; } - if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->dst.dev; - ip_rt_put(rt); + if (!dev) { + struct rtable *rt = ip_route_output(net, + imr->imr_multiaddr.s_addr, + 0, 0, 0); + if (!IS_ERR(rt)) { + dev = rt->dst.dev; + ip_rt_put(rt); + } } if (dev) { imr->imr_ifindex = dev->ifindex; @@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_unlock(); } -int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +/* called with rcu_read_lock() */ +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) { struct ip_mc_list *im; struct ip_sf_list *psf; int rv = 0; - rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; @@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p } else rv = 1; /* unspecified source; tentatively allow */ } - rcu_read_unlock(); return rv; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 97e5fb765265..6c0b7f4a3d7d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -356,20 +356,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet_sk(sk)->inet_sport, - .fl_ip_dport = ireq->rmt_port }; + struct flowi4 fl4 = { + .flowi4_oif = sk->sk_bound_dev_if, + .flowi4_mark = sk->sk_mark, + .daddr = ((opt && opt->srr) ? + opt->faddr : ireq->rmt_addr), + .saddr = ireq->loc_addr, + .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = inet_sk_flowi_flags(sk), + .fl4_sport = inet_sk(sk)->inet_sport, + .fl4_dport = ireq->rmt_port, + }; struct net *net = sock_net(sk); - security_req_classify_flow(req, &fl); - if (ip_route_output_flow(net, &rt, &fl, sk, 0)) + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) goto no_route; if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto route_err; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index a96e65674ac3..dd1b20eca1a2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = { struct inet_peer_base { struct inet_peer __rcu *root; - spinlock_t lock; + seqlock_t lock; int total; }; static struct inet_peer_base v4_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), .total = 0, }; static struct inet_peer_base v6_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), .total = 0, }; @@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a, int i, n = (a->family == AF_INET ? 1 : 4); for (i = 0; i < n; i++) { - if (a->a6[i] == b->a6[i]) + if (a->addr.a6[i] == b->addr.a6[i]) continue; - if (a->a6[i] < b->a6[i]) + if (a->addr.a6[i] < b->addr.a6[i]) return -1; return 1; } @@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a, return 0; } +#define rcu_deref_locked(X, BASE) \ + rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) + /* * Called with local BH disabled and the pool lock held. */ @@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a, \ stackptr = _stack; \ *stackptr++ = &_base->root; \ - for (u = rcu_dereference_protected(_base->root, \ - lockdep_is_held(&_base->lock)); \ + for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty; ) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ @@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a, else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&_base->lock)); \ + u = rcu_deref_locked(*v, _base); \ } \ u; \ }) /* - * Called with rcu_read_lock_bh() + * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall * in an endless loop. * But every pointer we follow is guaranteed to be valid thanks to RCU. * We exit from this function if number of links exceeds PEER_MAXDEPTH */ -static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, + struct inet_peer_base *base) { - struct inet_peer *u = rcu_dereference_bh(base->root); + struct inet_peer *u = rcu_dereference(base->root); int count = 0; while (u != peer_avl_empty) { @@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, return u; } if (cmp == -1) - u = rcu_dereference_bh(u->avl_left); + u = rcu_dereference(u->avl_left); else - u = rcu_dereference_bh(u->avl_right); + u = rcu_dereference(u->avl_right); if (unlikely(++count == PEER_MAXDEPTH)) break; } @@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + for (u = rcu_deref_locked(*v, base); \ u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + u = rcu_deref_locked(*v, base); \ } \ u; \ }) @@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], while (stackend > stack) { nodep = *--stackend; - node = rcu_dereference_protected(*nodep, - lockdep_is_held(&base->lock)); - l = rcu_dereference_protected(node->avl_left, - lockdep_is_held(&base->lock)); - r = rcu_dereference_protected(node->avl_right, - lockdep_is_held(&base->lock)); + node = rcu_deref_locked(*nodep, base); + l = rcu_deref_locked(node->avl_left, base); + r = rcu_deref_locked(node->avl_right, base); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = rcu_dereference_protected(l->avl_left, - lockdep_is_held(&base->lock)); - lr = rcu_dereference_protected(l->avl_right, - lockdep_is_held(&base->lock)); + ll = rcu_deref_locked(l->avl_left, base); + lr = rcu_deref_locked(l->avl_right, base); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ @@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], l->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_dereference_protected(lr->avl_left, - lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ - lrr = rcu_dereference_protected(lr->avl_right, - lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */ + lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ + lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ @@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = rcu_dereference_protected(r->avl_right, - lockdep_is_held(&base->lock)); - rl = rcu_dereference_protected(r->avl_left, - lockdep_is_held(&base->lock)); + rr = rcu_deref_locked(r->avl_right, base); + rl = rcu_deref_locked(r->avl_left, base); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ @@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], r->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_dereference_protected(rl->avl_right, - lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ - rll = rcu_dereference_protected(rl->avl_left, - lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */ + rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ + rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ @@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) do_free = 0; - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); /* Check the reference counter. It was artificially incremented by 1 * in cleanup() function to prevent sudden disappearing. If we can * atomically (because of lockless readers) take this last reference, @@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p, base); - BUG_ON(rcu_dereference_protected(*stackptr[-1], - lockdep_is_held(&base->lock)) != t); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); **--stackptr = t->avl_left; /* t is removed, t->daddr > x->daddr for any * x in p->avl_left subtree. @@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) base->total--; do_free = 1; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (do_free) - call_rcu_bh(&p->rcu, inetpeer_free_rcu); + call_rcu(&p->rcu, inetpeer_free_rcu); else /* The node is used again. Decrease the reference counter * back. The loop "cleanup -> unlink_from_unused @@ -477,13 +464,17 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; + unsigned int sequence; + int invalidated; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock_bh(); - p = lookup_rcu_bh(daddr, base); - rcu_read_unlock_bh(); + rcu_read_lock(); + sequence = read_seqbegin(&base->lock); + p = lookup_rcu(daddr, base); + invalidated = read_seqretry(&base->lock, sequence); + rcu_read_unlock(); if (p) { /* The existing node has been found. @@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) return p; } + /* If no writer did a change during our lookup, we can return early. */ + if (!create && !invalidated) + return NULL; + /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); p = lookup(daddr, stack, base); if (p != peer_avl_empty) { atomic_inc(&p->refcnt); - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); /* Remove the entry from unused list if it was there. */ unlink_from_unused(p); return p; @@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); p->tcp_ts_stamp = 0; + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + p->rate_last = 0; + p->pmtu_expires = 0; + p->pmtu_orig = 0; + memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->unused); @@ -519,7 +520,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) link_to_pool(p, base); base->total++; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (base->total >= inet_peer_threshold) /* Remove one less-recently-used entry. */ @@ -579,3 +580,44 @@ void inet_putpeer(struct inet_peer *p) local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_putpeer); + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the inet_peer entries now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same inet_peer fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) +{ + unsigned long now, token; + bool rc = false; + + if (!peer) + return true; + + token = peer->rate_tokens; + now = jiffies; + token += now - peer->rate_last; + peer->rate_last = now; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; + rc = true; + } + peer->rate_tokens = token; + return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d1d0e2c256fc..da5941f18c3c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -769,19 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src = tiph->saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_GRE, - .fl_gre_key = tunnel->parms.o_key - }; - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } + rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; } tdev = rt->dst.dev; @@ -945,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_GRE, - .fl_gre_key = tunnel->parms.o_key - }; - struct rtable *rt; - - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + struct rtable *rt = ip_route_output_gre(dev_net(dev), + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1207,17 +1196,14 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { - .oif = t->parms.link, - .fl4_dst = t->parms.iph.daddr, - .fl4_src = t->parms.iph.saddr, - .fl4_tos = RT_TOS(t->parms.iph.tos), - .proto = IPPROTO_GRE, - .fl_gre_key = t->parms.o_key - }; - struct rtable *rt; - - if (ip_route_output_key(dev_net(dev), &rt, &fl)) + struct rtable *rt = ip_route_output_gre(dev_net(dev), + t->parms.iph.daddr, + t->parms.iph.saddr, + t->parms.o_key, + RT_TOS(t->parms.iph.tos), + t->parms.link); + + if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb7..d7b2b0987a3b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1906fa35860c..28a736f3442f 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -140,11 +140,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) } else { dopt->ts_needtime = 0; - if (soffset + 8 <= optlen) { + if (soffset + 7 <= optlen) { __be32 addr; - memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { + memcpy(&addr, dptr+soffset-1, 4); + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04c7b3ba6b39..67f241b97649 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb) if(opt && opt->srr) daddr = opt->faddr; - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport }; - - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times - * itself out. - */ - security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) - goto no_route; - } + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), sk, + daddr, inet->inet_saddr, + inet->inet_dport, + inet->inet_sport, + sk->sk_protocol, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); @@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy) } static inline int ip_ufo_append_data(struct sock *sk, + struct sk_buff_head *queue, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, @@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk, * device, so create one single skb packet containing complete * udp datagram */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + if ((skb = skb_peek_tail(queue)) == NULL) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); @@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - sk->sk_sndmsg_off = 0; /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); } return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } -/* - * ip_append_data() and ip_append_page() can make one large IP datagram - * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Each piece can be a page - * or non-page data. - * - * Not only UDP, other transport protocols - e.g. raw sockets - can use - * this interface potentially. - * - * LATER: length must be adjusted by pad at tail, when it is required. - */ -int ip_append_data(struct sock *sk, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags) +static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, + struct inet_cork *cork, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - struct ip_options *opt = NULL; + struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; @@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; - struct rtable *rt; + struct rtable *rt = (struct rtable *)cork->dst; - if (flags&MSG_PROBE) - return 0; - - if (skb_queue_empty(&sk->sk_write_queue)) { - /* - * setup for corking. - */ - opt = ipc->opt; - if (opt) { - if (inet->cork.opt == NULL) { - inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); - if (unlikely(inet->cork.opt == NULL)) - return -ENOBUFS; - } - memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); - inet->cork.flags |= IPCORK_OPT; - inet->cork.addr = ipc->addr; - } - rt = *rtp; - if (unlikely(!rt)) - return -EFAULT; - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; - inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : - dst_mtu(rt->dst.path); - inet->cork.dst = &rt->dst; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - exthdrlen = rt->dst.header_len; - length += exthdrlen; - transhdrlen += exthdrlen; - } else { - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + exthdrlen = transhdrlen ? rt->dst.header_len : 0; + length += exthdrlen; + transhdrlen += exthdrlen; + mtu = cork->fragsize; - transhdrlen = 0; - exthdrlen = 0; - mtu = inet->cork.fragsize; - } hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - if (inet->cork.length + length > 0xFFFF - fragheaderlen) { + if (cork->length + length > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; @@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(&sk->sk_write_queue); + skb = skb_peek_tail(queue); - inet->cork.length += length; + cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { - err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, - flags); + err = ip_ufo_append_data(sk, queue, getfrag, from, length, + hh_len, fragheaderlen, transhdrlen, + mtu, flags); if (err) goto error; return 0; @@ -961,7 +905,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->tx_flags = 0; + cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -972,7 +916,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - skb_shinfo(skb)->tx_flags = ipc->tx_flags; + skb_shinfo(skb)->tx_flags = cork->tx_flags; /* * Find where to start putting bytes. @@ -1009,7 +953,7 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); continue; } @@ -1029,8 +973,8 @@ alloc_new_skb: } else { int i = skb_shinfo(skb)->nr_frags; skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; + struct page *page = cork->page; + int off = cork->off; unsigned int left; if (page && (left = PAGE_SIZE - off) > 0) { @@ -1042,7 +986,7 @@ alloc_new_skb: goto error; } get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + skb_fill_page_desc(skb, i, page, off, 0); frag = &skb_shinfo(skb)->frags[i]; } } else if (i < MAX_SKB_FRAGS) { @@ -1053,8 +997,8 @@ alloc_new_skb: err = -ENOMEM; goto error; } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; + cork->page = page; + cork->off = 0; skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; @@ -1066,7 +1010,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - sk->sk_sndmsg_off += copy; + cork->off += copy; frag->size += copy; skb->len += copy; skb->data_len += copy; @@ -1080,11 +1024,87 @@ alloc_new_skb: return 0; error: - inet->cork.length -= length; + cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, + struct ipcm_cookie *ipc, struct rtable **rtp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt; + struct rtable *rt; + + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (cork->opt == NULL) { + cork->opt = kmalloc(sizeof(struct ip_options) + 40, + sk->sk_allocation); + if (unlikely(cork->opt == NULL)) + return -ENOBUFS; + } + memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen); + cork->flags |= IPCORK_OPT; + cork->addr = ipc->addr; + } + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + cork->dst = &rt->dst; + cork->length = 0; + cork->tx_flags = ipc->tx_flags; + cork->page = NULL; + cork->off = 0; + + return 0; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + err = ip_setup_cork(sk, &inet->cork, ipc, rtp); + if (err) + return err; + } else { + transhdrlen = 0; + } + + return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag, + from, length, transhdrlen, flags); +} + ssize_t ip_append_page(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -1228,40 +1248,41 @@ error: return err; } -static void ip_cork_release(struct inet_sock *inet) +static void ip_cork_release(struct inet_cork *cork) { - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - dst_release(inet->cork.dst); - inet->cork.dst = NULL; + cork->flags &= ~IPCORK_OPT; + kfree(cork->opt); + cork->opt = NULL; + dst_release(cork->dst); + cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ -int ip_push_pending_frames(struct sock *sk) +struct sk_buff *__ip_make_skb(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)inet->cork.dst; + struct rtable *rt = (struct rtable *)cork->dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; - if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + if ((skb = __skb_dequeue(queue)) == NULL) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); - while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); @@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk) ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; @@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk) iph->ihl = 5; if (opt) { iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, inet->cork.addr, rt, 0); + ip_options_build(skb, opt, cork->addr, rt, 0); } iph->tos = inet->tos; iph->frag_off = df; @@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk) * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ - inet->cork.dst = NULL; + cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - /* Netfilter gets whole the not fragmented skb. */ + ip_cork_release(cork); +out: + return skb; +} + +int ip_send_skb(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + int err; + err = ip_local_out(skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) - goto error; + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } -out: - ip_cork_release(inet); return err; +} -error: - IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); - goto out; +int ip_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + + skb = ip_finish_skb(sk); + if (!skb) + return 0; + + /* Netfilter gets whole the not fragmented skb. */ + return ip_send_skb(skb); } /* * Throw away all pending data on the socket. */ -void ip_flush_pending_frames(struct sock *sk) +static void __ip_flush_pending_frames(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb; - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); - ip_cork_release(inet_sk(sk)); + ip_cork_release(cork); +} + +void ip_flush_pending_frames(struct sock *sk) +{ + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); } +struct sk_buff *ip_make_skb(struct sock *sk, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_cork cork = {}; + struct sk_buff_head queue; + int err; + + if (flags & MSG_PROBE) + return NULL; + + __skb_queue_head_init(&queue); + + err = ip_setup_cork(sk, &cork, ipc, rtp); + if (err) + return ERR_PTR(err); + + err = __ip_append_data(sk, &queue, &cork, getfrag, + from, length, transhdrlen, flags); + if (err) { + __ip_flush_pending_frames(sk, &queue, &cork); + return ERR_PTR(err); + } + + return __ip_make_skb(sk, &queue, &cork); +} /* * Fetch data from kernel space and fill in checksum if needed. @@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } { - struct flowi fl = { .oif = arg->bound_dev_if, - .fl4_dst = daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl_ip_sport = tcp_hdr(skb)->dest, - .fl_ip_dport = tcp_hdr(skb)->source, - .proto = sk->sk_protocol, - .flags = ip_reply_arg_flowi_flags(arg) }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) + struct flowi4 fl4 = { + .flowi4_oif = arg->bound_dev_if, + .daddr = daddr, + .saddr = rt->rt_spec_dst, + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .fl4_sport = tcp_hdr(skb)->dest, + .fl4_dport = tcp_hdr(skb)->source, + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = ip_reply_arg_flowi_flags(arg), + }; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) return; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index a5f58e7cbb26..bfc17c5914e7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src= tiph->saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } + rt = ip_route_output_ports(dev_net(dev), NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPIP, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; } tdev = rt->dst.dev; @@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - struct rtable *rt; - - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8b65a12654e7..1f62eaeb6de4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -148,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return NULL; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, }; int err; - err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; @@ -283,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return net->ipv4.mrt; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; @@ -435,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; - struct flowi fl = { - .oif = dev->ifindex, - .iif = skb->skb_iif, - .mark = skb->mark, + struct flowi4 fl4 = { + .flowi4_oif = dev->ifindex, + .flowi4_iif = skb->skb_iif, + .flowi4_mark = skb->mark, }; int err; - err = ipmr_fib_lookup(net, &fl, &mrt); + err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { kfree_skb(skb); return err; @@ -1611,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, #endif if (vif->flags & VIFF_TUNNEL) { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = vif->remote, - .fl4_src = vif->local, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, NULL, + vif->remote, vif->local, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = iph->daddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, NULL, iph->daddr, 0, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; } @@ -1793,6 +1788,24 @@ dont_forward: return 0; } +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt) +{ + struct flowi4 fl4 = { + .daddr = rt->rt_key_dst, + .saddr = rt->rt_key_src, + .flowi4_tos = rt->rt_tos, + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + struct mr_table *mrt; + int err; + + err = ipmr_fib_lookup(net, &fl4, &mrt); + if (err) + return ERR_PTR(err); + return mrt; +} /* * Multicast packets for forwarding arrive here @@ -1805,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; - int err; /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -1813,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; - err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); - if (err < 0) { + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) { kfree_skb(skb); - return err; + return PTR_ERR(mrt); } - if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) @@ -1946,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb) pim = igmp_hdr(skb); - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) goto drop; - if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1978,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb) csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) goto drop; - if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 994a1f29ebbc..f3c0b549b8e1 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - struct flowi fl = {}; + struct flowi4 fl4 = {}; unsigned long orefdst; unsigned int hh_len; unsigned int type; @@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ if (addr_type == RTN_LOCAL) { - fl.fl4_dst = iph->daddr; + fl4.daddr = iph->daddr; if (type == RTN_LOCAL) - fl.fl4_src = iph->saddr; - fl.fl4_tos = RT_TOS(iph->tos); - fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; - fl.mark = skb->mark; - fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; /* Drop old route. */ @@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ - fl.fl4_dst = iph->saddr; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.daddr = iph->saddr; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; orefdst = skb->_skb_refdst; @@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET) == 0) { + xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); - if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_set(skb, dst); } @@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_drop(skb); @@ -219,7 +223,11 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) { - return ip_route_output_key(&init_net, (struct rtable **)dst, fl); + struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + *dst = &rt->dst; + return 0; } static const struct nf_afinfo nf_ip_afinfo = { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5f..1dfc18a03fd4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -64,16 +64,6 @@ config IP_NF_IPTABLES if IP_NF_IPTABLES # The matches. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED - help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... - - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. - config IP_NF_MATCH_AH tristate '"ah" match support' depends on NETFILTER_ADVANCED @@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19eb59d01037..dca2082ec683 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e855fffaed95..4b5d457c2d76 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1065,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1333,6 +1335,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1486,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1738,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 652efea013dc..ffcea0d1678e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) { + if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " @@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else @@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1261,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1664,6 +1666,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1805,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -2034,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a4897655..d609ac3cb9a4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ @@ -669,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, char buffer[PROC_WRITELEN+1]; unsigned long nodenum; - if (copy_from_user(buffer, input, PROC_WRITELEN)) + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) return -EFAULT; + buffer[size] = 0; if (*buffer == '+') { nodenum = simple_strtoul(buffer+1, NULL, 10); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8fda2e9..d76d6c9ed946 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c deleted file mode 100644 index db8bff0fb86d..000000000000 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * iptables module to match inet_addr_type() of an ip. - * - * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> - * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ipt_addrtype.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: address type match for IPv4"); - -static inline bool match_type(struct net *net, const struct net_device *dev, - __be32 addr, u_int16_t mask) -{ - return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); -} - -static bool -addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - bool ret = true; - - if (info->source) - ret &= match_type(net, NULL, iph->saddr, info->source) ^ - info->invert_source; - if (info->dest) - ret &= match_type(net, NULL, iph->daddr, info->dest) ^ - info->invert_dest; - - return ret; -} - -static bool -addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info_v1 *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - const struct net_device *dev = NULL; - bool ret = true; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) - dev = par->in; - else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) - dev = par->out; - - if (info->source) - ret &= match_type(net, dev, iph->saddr, info->source) ^ - (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); - if (ret && info->dest) - ret &= match_type(net, dev, iph->daddr, info->dest) ^ - !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); - return ret; -} - -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) -{ - struct ipt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("both incoming and outgoing " - "interface limitation cannot be selected\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_LOCAL_IN)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("output interface limitation " - "not valid in PREROUTING and INPUT\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { - pr_info("input interface limitation " - "not valid in POSTROUTING and OUTPUT\n"); - return -EINVAL; - } - - return 0; -} - -static struct xt_match addrtype_mt_reg[] __read_mostly = { - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .match = addrtype_mt_v0, - .matchsize = sizeof(struct ipt_addrtype_info), - .me = THIS_MODULE - }, - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .revision = 1, - .match = addrtype_mt_v1, - .checkentry = addrtype_mt_checkentry_v1, - .matchsize = sizeof(struct ipt_addrtype_info_v1), - .me = THIS_MODULE - } -}; - -static int __init addrtype_mt_init(void) -{ - return xt_register_matches(addrtype_mt_reg, - ARRAY_SIZE(addrtype_mt_reg)); -} - -static void __exit addrtype_mt_exit(void) -{ - xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); -} - -module_init(addrtype_mt_init); -module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f293..aef5d1fbe77d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 63f60fc5d26a..5585980fce2e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f06df0..703f366fd235 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787ce1a71..21bcf471b25a 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; - int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; } - /* Place in source hash if this is the first time. */ - if (have_to_hash) { + if (maniptype == IP_NAT_MANIP_SRC) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), @@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } @@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } @@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (nat == NULL || nat->ct == NULL) return; - NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); @@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) struct nf_conn_nat *old_nat = old; struct nf_conn *ct = old_nat->ct; - if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) return; spin_lock_bh(&nf_nat_lock); - new_nat->ct = ct; hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } @@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a56..8812a02078ab 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 95481fee8bdb..7317bdf1d457 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -31,6 +31,7 @@ #ifdef CONFIG_XFRM static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) { + struct flowi4 *fl4 = &fl->u.ip4; const struct nf_conn *ct; const struct nf_conntrack_tuple *t; enum ip_conntrack_info ctinfo; @@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) statusbit = IPS_SRC_NAT; if (ct->status & statusbit) { - fl->fl4_dst = t->dst.u3.ip; + fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_dport = t->dst.u.tcp.port; + fl4->fl4_dport = t->dst.u.tcp.port; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { - fl->fl4_src = t->src.u3.ip; + fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_sport = t->src.u.tcp.port; + fl4->fl4_sport = t->src.u.tcp.port; } } #endif diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6390ba299b3d..2d3c72e5bbbf 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -402,7 +402,7 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; @@ -418,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) if (!iov) continue; - switch (fl->proto) { + switch (fl4->flowi4_proto) { case IPPROTO_ICMP: /* check if one-byte field is readable or not. */ if (iov->iov_base && iov->iov_len < 1) @@ -433,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) code = iov->iov_base; if (type && code) { - if (get_user(fl->fl_icmp_type, type) || - get_user(fl->fl_icmp_code, code)) + if (get_user(fl4->fl4_icmp_type, type) || + get_user(fl4->fl4_icmp_code, code)) return -EFAULT; probed = 1; } @@ -548,25 +548,31 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = inet->hdrincl ? IPPROTO_RAW : - sk->sk_protocol, - }; + struct flowi4 fl4 = { + .flowi4_oif = ipc.oif, + .flowi4_mark = sk->sk_mark, + .daddr = daddr, + .saddr = saddr, + .flowi4_tos = tos, + .flowi4_proto = (inet->hdrincl ? + IPPROTO_RAW : + sk->sk_protocol), + .flowi4_flags = FLOWI_FLAG_CAN_SLEEP, + }; if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl, msg); + err = raw_probe_proto_opt(&fl4, msg); if (err) goto done; } - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto done; + } } - if (err) - goto done; err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6ed6603c2f6d..4b0c81180804 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -109,8 +109,8 @@ #include <linux/sysctl.h> #endif -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) +#define RT_FL_TOS(oldflp4) \ + ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) #define IP_MAX_MTU 0xFFF0 @@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; - /* * Interface to generic destination cache. */ @@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { } +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->peer) + rt_bind_peer(rt, 1); + + peer = rt->peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else { + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + } + } + return p; +} + static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = { .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .default_mtu = ipv4_default_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, @@ -171,7 +204,7 @@ static struct dst_ops ipv4_dst_ops = { const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), - r->fl.fl4_tos, + r->rt_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, @@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; + (rth->peer && rth->peer->pmtu_expires); } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t if (atomic_read(&rth->dst.__refcnt)) goto out; - ret = 1; - if (rth->dst.expires && - time_after_eq(jiffies, rth->dst.expires)) - goto out; - age = jiffies - rth->dst.lastuse; - ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) goto out; @@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count; } -static inline bool compare_hash_inputs(const struct flowi *fl1, - const struct flowi *fl2) +static inline bool compare_hash_inputs(const struct rtable *rt1, + const struct rtable *rt2) { - return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->iif ^ fl2->iif)) == 0); + return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0); } -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) { - return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | - (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; + return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_mark ^ rt2->rt_mark) | + (rt1->rt_tos ^ rt2->rt_tos) | + (rt1->rt_oif ^ rt2->rt_oif) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) @@ -786,104 +813,13 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) const struct rtable *aux = head; while (aux != rth) { - if (compare_hash_inputs(&aux->fl, &rth->fl)) + if (compare_hash_inputs(aux, rth)) return 0; aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Pertubation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head) return length >> FRACT_BITS; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb, int ifindex) +static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, + struct sk_buff *skb, int ifindex) { struct rtable *rth, *cand; struct rtable __rcu **rthp, **candp; @@ -1120,7 +1056,7 @@ restart: printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); ip_rt_put(rt); - return err; + return ERR_PTR(err); } } @@ -1137,7 +1073,7 @@ restart: rt_free(rth); continue; } - if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { + if (compare_keys(rth, rt) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->dst.rt_next; /* @@ -1157,11 +1093,9 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - if (rp) - *rp = rth; - else + if (skb) skb_dst_set(skb, &rth->dst); - return 0; + return rth; } if (!atomic_read(&rth->dst.__refcnt)) { @@ -1202,7 +1136,7 @@ restart: rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); - hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, ifindex, rt_genid(net)); goto restart; } @@ -1218,7 +1152,7 @@ restart: if (err != -ENOBUFS) { rt_drop(rt); - return err; + return ERR_PTR(err); } /* Neighbour tables are full and nothing @@ -1239,7 +1173,7 @@ restart: if (net_ratelimit()) printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } } @@ -1265,11 +1199,16 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); skip_hashing: - if (rp) - *rp = rt; - else + if (skb) skb_dst_set(skb, &rt->dst); - return 0; + return rt; +} + +static atomic_t __rt_peer_genid = ATOMIC_INIT(0); + +static u32 rt_peer_genid(void) +{ + return atomic_read(&__rt_peer_genid); } void rt_bind_peer(struct rtable *rt, int create) @@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create) if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); + else + rt->rt_peer_genid = rt_peer_genid(); } /* @@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { - int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth; - struct rtable __rcu **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; + struct inet_peer *peer; struct net *net; if (!in_dev) @@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ipv4_is_zeronet(new_gw)) goto reject_redirect; - if (!rt_caching(net)) - goto reject_redirect; - if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - rt_is_expired(rth) || - !net_eq(dev_net(rth->dst.dev), net)) { - rthp = &rth->dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->dst.error || - rth->rt_gateway != old_gw || - rth->dst.dev != dev) - break; - - dst_hold(&rth->dst); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - return; - } - - /* Copy all the information. */ - *rt = *rth; - rt->dst.__use = 1; - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.child = NULL; - if (rt->dst.dev) - dev_hold(rt->dst.dev); - rt->dst.obsolete = -1; - rt->dst.lastuse = jiffies; - rt->dst.path = &rt->dst; - rt->dst.neighbour = NULL; - rt->dst.hh = NULL; -#ifdef CONFIG_XFRM - rt->dst.xfrm = NULL; -#endif - rt->rt_genid = rt_genid(net); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } + peer = inet_getpeer_v4(daddr, 1); + if (peer) { + peer->redirect_learned.a4 = new_gw; - netevent.old = &rth->dst; - netevent.new = &rt->dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); + inet_putpeer(peer); - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) - ip_rt_put(rt); - goto do_next; - } - do_next: - ; - } + atomic_inc(&__rt_peer_genid); } return; @@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->dst.expires && - time_after_eq(jiffies, rt->dst.expires))) { - unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif, + } else if (rt->rt_flags & RTCF_REDIRECTED) { + unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + rt->rt_oif, rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", - &rt->rt_dst, rt->fl.fl4_tos); + &rt->rt_dst, rt->rt_tos); #endif rt_del(hash, rt); ret = NULL; + } else if (rt->peer && + rt->peer->pmtu_expires && + time_after_eq(jiffies, rt->peer->pmtu_expires)) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(dst, RTAX_MTU, + rt->peer->pmtu_orig); } } return ret; @@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; + struct inet_peer *peer; int log_martians; rcu_read_lock(); @@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + return; + } + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) - rt->dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (rt->dst.rate_tokens >= ip_rt_redirect_number) { - rt->dst.rate_last = jiffies; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->dst.rate_last + - (ip_rt_redirect_load << rt->dst.rate_tokens)))) { + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->dst.rate_last = jiffies; - ++rt->dst.rate_tokens; + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->dst.rate_tokens == ip_rt_redirect_number && + peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + bool send; int code; switch (rt->dst.error) { @@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb) break; } - now = jiffies; - rt->dst.rate_tokens += now - rt->dst.rate_last; - if (rt->dst.rate_tokens > ip_rt_error_burst) - rt->dst.rate_tokens = ip_rt_error_burst; - rt->dst.rate_last = now; - if (rt->dst.rate_tokens >= ip_rt_error_cost) { - rt->dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; @@ -1630,88 +1511,140 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { - int i, k; unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - int ikeys[2] = { dev->ifindex, 0 }; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; unsigned short est_mtu = 0; + struct inet_peer *peer; - for (k = 0; k < 2; k++) { - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - unsigned short mtu = new_mtu; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->rt_dst != daddr || - rth->rt_src != iph->saddr || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - dst_metric_locked(&rth->dst, RTAX_MTU) || - !net_eq(dev_net(rth->dst.dev), net) || - rt_is_expired(rth)) - continue; + peer = inet_getpeer_v4(iph->daddr, 1); + if (peer) { + unsigned short mtu = new_mtu; - if (new_mtu < 68 || new_mtu >= old_mtu) { + if (new_mtu < 68 || new_mtu >= old_mtu) { + /* BSD 4.2 derived systems incorrectly adjust + * tot_len by the IP header length, and report + * a zero MTU in the ICMP message. + */ + if (mtu == 0 && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + mtu = guess_mtu(old_mtu); + } - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= dst_mtu(&rth->dst) && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; - mtu = guess_mtu(old_mtu); - } - if (mtu <= dst_mtu(&rth->dst)) { - if (mtu < dst_mtu(&rth->dst)) { - dst_confirm(&rth->dst); - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(&rth->dst, - RTAX_LOCK); - mtu = ip_rt_min_pmtu; - lock |= (1 << RTAX_MTU); - dst_metric_set(&rth->dst, RTAX_LOCK, - lock); - } - dst_metric_set(&rth->dst, RTAX_MTU, mtu); - dst_set_expires(&rth->dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - rcu_read_unlock(); + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + est_mtu = mtu; + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; } + + inet_putpeer(peer); + + atomic_inc(&__rt_peer_genid); } return est_mtu ? : new_mtu; } +static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +{ + unsigned long expires = peer->pmtu_expires; + + if (time_before(jiffies, expires)) { + u32 orig_dst_mtu = dst_mtu(dst); + if (peer->pmtu_learned < orig_dst_mtu) { + if (!peer->pmtu_orig) + peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); + dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); + } + } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) + dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +} + static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst_mtu(dst) > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(dst, RTAX_LOCK); + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + + dst_confirm(dst); + + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (peer) { + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; + + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; + + atomic_inc(&__rt_peer_genid); + rt->rt_peer_genid = rt_peer_genid(); } - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + check_peer_pmtu(dst, peer); } } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + + dst_confirm(&rt->dst); + + neigh_release(rt->dst.neighbour); + rt->dst.neighbour = NULL; + + rt->rt_gateway = peer->redirect_learned.a4; + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, + rt->dst.neighbour); + } + return 0; +} + static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - if (rt_is_expired((struct rtable *)dst)) + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) return NULL; + if (rt->rt_peer_genid != rt_peer_genid()) { + struct inet_peer *peer; + + if (!rt->peer) + rt_bind_peer(rt, 0); + + peer = rt->peer; + if (peer && peer->pmtu_expires) + check_peer_pmtu(dst, peer); + + if (peer && peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } + + rt->rt_peer_genid = rt_peer_genid(); + } return dst; } @@ -1720,6 +1653,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } if (peer) { rt->peer = NULL; inet_putpeer(peer); @@ -1734,8 +1671,14 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt) - dst_set_expires(&rt->dst, 0); + if (rt && + rt->peer && + rt->peer->pmtu_expires) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + } } static int ip_rt_bug(struct sk_buff *skb) @@ -1764,9 +1707,18 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt_is_output_route(rt)) src = rt->rt_src; else { + struct flowi4 fl4 = { + .daddr = rt->rt_key_dst, + .saddr = rt->rt_key_src, + .flowi4_tos = rt->rt_tos, + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) - src = FIB_RES_PREFSRC(res); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); @@ -1775,7 +1727,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1815,17 +1767,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst) return mtu; } -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, + struct fib_info *fi) +{ + struct inet_peer *peer; + int create = 0; + + /* If a peer entry exists for this destination, we must hook + * it up in order to get at cached metrics. + */ + if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) + create = 1; + + rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); + if (peer) { + rt->rt_peer_genid = rt_peer_genid(); + if (inet_metrics_new(peer)) + memcpy(peer->metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX); + dst_init_metrics(&rt->dst, peer->metrics, false); + + if (peer->pmtu_expires) + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + rt->rt_gateway = peer->redirect_learned.a4; + rt->rt_flags |= RTCF_REDIRECTED; + } + } else { + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } +} + +static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4, + const struct fib_result *res, + struct fib_info *fi, u16 type, u32 itag) { struct dst_entry *dst = &rt->dst; - struct fib_info *fi = res->fi; if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - dst_import_metrics(dst, fi->fib_metrics); -#ifdef CONFIG_NET_CLS_ROUTE + rt_init_metrics(rt, oldflp4, fi); +#ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } @@ -1835,13 +1824,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; + rt->rt_type = type; +} + +static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm) +{ + struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1); + if (rt) { + rt->dst.obsolete = -1; + + rt->dst.flags = DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0); + } + return rt; } /* called in rcu_read_lock() section */ @@ -1874,31 +1876,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output = ip_rt_bug; - rth->dst.obsolete = -1; - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; + rth->rt_iif = dev->ifindex; rth->dst.dev = init_net.loopback_dev; dev_hold(rth->dst.dev); - rth->fl.oif = 0; + rth->rt_oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_genid = rt_genid(dev_net(dev)); @@ -1916,7 +1912,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); + rth = rt_intern_hash(hash, rth, skb, dev->ifindex); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); e_nobufs: return -ENOBUFS; @@ -1959,7 +1958,7 @@ static void ip_handle_martian_source(struct net_device *dev, /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, - struct fib_result *res, + const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) @@ -2013,39 +2012,31 @@ static int __mkroute_input(struct sk_buff *skb, } } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; + rth->rt_iif = in_dev->dev->ifindex; rth->dst.dev = (out_dev)->dev; dev_hold(rth->dst.dev); - rth->fl.oif = 0; + rth->rt_oif = 0; rth->rt_spec_dst= spec_dst; - rth->dst.obsolete = -1; rth->dst.input = ip_forward; rth->dst.output = ip_output; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); - rt_set_nexthop(rth, res, itag); + rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); rth->rt_flags = flags; @@ -2057,7 +2048,7 @@ static int __mkroute_input(struct sk_buff *skb, static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi *fl, + const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { @@ -2066,8 +2057,8 @@ static int ip_mkroute_input(struct sk_buff *skb, unsigned hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ @@ -2076,9 +2067,12 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif, + hash = rt_hash(daddr, saddr, fl4->flowi4_iif, rt_genid(dev_net(rth->dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb, fl->iif); + rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); + if (IS_ERR(rth)) + return PTR_ERR(rth); + return 0; } /* @@ -2097,12 +2091,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct flowi fl = { .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .fl4_scope = RT_SCOPE_UNIVERSE, - .mark = skb->mark, - .iif = dev->ifindex }; + struct flowi4 fl4; unsigned flags = 0; u32 itag = 0; struct rtable * rth; @@ -2139,7 +2128,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ - err = fib_lookup(net, &fl, &res); + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; @@ -2168,7 +2164,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -2190,29 +2186,23 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; + rth->rt_iif = dev->ifindex; rth->dst.dev = net->loopback_dev; dev_hold(rth->dst.dev); rth->rt_gateway = daddr; @@ -2225,8 +2215,11 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); + hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); + rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); goto out; no_route: @@ -2288,12 +2281,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | - ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | - (rth->fl.iif ^ iif) | - rth->fl.oif | - (rth->fl.fl4_tos ^ tos)) == 0 && - rth->fl.mark == skb->mark && + if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | + ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | + (rth->rt_iif ^ iif) | + rth->rt_oif | + (rth->rt_tos ^ tos)) == 0 && + rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { @@ -2326,8 +2319,8 @@ skip_cache: struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { - int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2351,98 +2344,91 @@ skip_cache: EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ -static int __mkroute_output(struct rtable **result, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, + const struct flowi4 *oldflp4, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + u32 tos = RT_FL_TOS(oldflp4); struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); + u16 type = res->type; + struct rtable *rth; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) - return -EINVAL; + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl->fl4_dst)) - res->type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) - return -EINVAL; + return ERR_PTR(-EINVAL); - if (res->type == RTN_BROADCAST) { + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - res->fi = NULL; - } else if (res->type == RTN_MULTICAST) { + fi = NULL; + } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr, + oldflp4->flowi4_proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) - return -ENOBUFS; - - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; + return ERR_PTR(-ENOBUFS); + + rth->rt_key_dst = oldflp4->daddr; + rth->rt_tos = tos; + rth->rt_key_src = oldflp4->saddr; + rth->rt_oif = oldflp4->flowi4_oif; + rth->rt_mark = oldflp4->flowi4_mark; + rth->rt_dst = fl4->daddr; + rth->rt_src = fl4->saddr; + rth->rt_iif = 0; /* get references to the devices that are to be hold by the routing cache entry */ rth->dst.dev = dev_out; dev_hold(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; + rth->rt_gateway = fl4->daddr; + rth->rt_spec_dst= fl4->saddr; rth->dst.output=ip_output; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; + rth->rt_spec_dst = fl4->daddr; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; + rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp->fl4_dst)) { + !ipv4_is_local_multicast(oldflp4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } @@ -2450,31 +2436,10 @@ static int __mkroute_output(struct rtable **result, #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, oldflp4, res, fi, type, 0); rth->rt_flags = flags; - *result = rth; - return 0; -} - -/* called with rcu_read_lock() */ -static int ip_mkroute_output(struct rtable **rp, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); - } - - return err; + return rth; } /* @@ -2482,34 +2447,36 @@ static int ip_mkroute_output(struct rtable **rp, * called with rcu_read_lock(); */ -static int ip_route_output_slow(struct net *net, struct rtable **rp, - const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .fl4_dst = oldflp->fl4_dst, - .fl4_src = oldflp->fl4_src, - .fl4_tos = tos & IPTOS_RT_MASK, - .fl4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), - .mark = oldflp->mark, - .iif = net->loopback_dev->ifindex, - .oif = oldflp->oif }; +static struct rtable *ip_route_output_slow(struct net *net, + const struct flowi4 *oldflp4) +{ + u32 tos = RT_FL_TOS(oldflp4); + struct flowi4 fl4; struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; - int err; - + struct rtable *rth; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (oldflp->fl4_src) { - err = -EINVAL; - if (ipv4_is_multicast(oldflp->fl4_src) || - ipv4_is_lbcast(oldflp->fl4_src) || - ipv4_is_zeronet(oldflp->fl4_src)) + fl4.flowi4_oif = oldflp4->flowi4_oif; + fl4.flowi4_iif = net->loopback_dev->ifindex; + fl4.flowi4_mark = oldflp4->flowi4_mark; + fl4.daddr = oldflp4->daddr; + fl4.saddr = oldflp4->saddr; + fl4.flowi4_tos = tos & IPTOS_RT_MASK; + fl4.flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (oldflp4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(oldflp4->saddr) || + ipv4_is_lbcast(oldflp4->saddr) || + ipv4_is_zeronet(oldflp4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2520,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 && - (ipv4_is_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst))) { + if (oldflp4->flowi4_oif == 0 && + (ipv4_is_multicast(oldflp4->daddr) || + ipv4_is_lbcast(oldflp4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = __ip_dev_find(net, oldflp->fl4_src, false); + dev_out = __ip_dev_find(net, oldflp4->saddr, false); if (dev_out == NULL) goto out; @@ -2543,60 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; goto make_route; } - if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - if (!__ip_dev_find(net, oldflp->fl4_src, false)) + if (!__ip_dev_find(net, oldflp4->saddr, false)) goto out; } } - if (oldflp->oif) { - dev_out = dev_get_by_index_rcu(net, oldflp->oif); - err = -ENODEV; + if (oldflp4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - if (ipv4_is_local_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst)) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (ipv4_is_local_multicast(oldflp4->daddr) || + ipv4_is_lbcast(oldflp4->daddr)) { + if (!fl4.saddr) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (ipv4_is_multicast(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_HOST); + if (!fl4.saddr) { + if (ipv4_is_multicast(oldflp4->daddr)) + fl4.saddr = inet_select_addr(dev_out, 0, + fl4.flowi4_scope); + else if (!oldflp4->daddr) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (!fl4.daddr) { + fl4.daddr = fl4.saddr; + if (!fl4.daddr) + fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - fl.oif = net->loopback_dev->ifindex; + fl4.flowi4_oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl, &res)) { + if (fib_lookup(net, &fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + if (oldflp4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2615,90 +2582,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (fl4.saddr == 0) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) { + if (!fl4.saddr) { if (res.fi->fib_prefsrc) - fl.fl4_src = res.fi->fib_prefsrc; + fl4.saddr = res.fi->fib_prefsrc; else - fl.fl4_src = fl.fl4_dst; + fl4.saddr = fl4.daddr; } dev_out = net->loopback_dev; - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4.saddr) + fl4.saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags); + if (!IS_ERR(rth)) { + unsigned int hash; -out: return err; + hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif, + rt_genid(dev_net(dev_out))); + rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif); + } + +out: + rcu_read_unlock(); + return rth; } -int __ip_route_output_key(struct net *net, struct rtable **rp, - const struct flowi *flp) +struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4) { - unsigned int hash; - int res; struct rtable *rth; + unsigned int hash; if (!rt_caching(net)) goto slow_output; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); + hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && + if (rth->rt_key_dst == flp4->daddr && + rth->rt_key_src == flp4->saddr && rt_is_output_route(rth) && - rth->fl.oif == flp->oif && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & + rth->rt_oif == flp4->flowi4_oif && + rth->rt_mark == flp4->flowi4_mark && + !((rth->rt_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); - *rp = rth; - return 0; + return rth; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); slow_output: - rcu_read_lock(); - res = ip_route_output_slow(net, rp, flp); - rcu_read_unlock(); - return res; + return ip_route_output_slow(net, flp4); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2726,17 +2696,14 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .update_pmtu = ipv4_rt_blackhole_update_pmtu, }; - -static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = *rp; - struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1); + struct rtable *ort = (struct rtable *) dst_orig; if (rt) { struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; @@ -2746,7 +2713,12 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi if (new->dev) dev_hold(new->dev); - rt->fl = ort->fl; + rt->rt_key_dst = ort->rt_key_dst; + rt->rt_key_src = ort->rt_key_src; + rt->rt_tos = ort->rt_tos; + rt->rt_iif = ort->rt_iif; + rt->rt_oif = ort->rt_oif; + rt->rt_mark = ort->rt_mark; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; @@ -2759,46 +2731,40 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi rt->peer = ort->peer; if (rt->peer) atomic_inc(&rt->peer->refcnt); + rt->fi = ort->fi; + if (rt->fi) + atomic_inc(&rt->fi->fib_clntref); dst_free(new); } - dst_release(&(*rp)->dst); - *rp = rt; - return rt ? 0 : -ENOMEM; + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, - struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - int err; + struct rtable *rt = __ip_route_output_key(net, flp4); - if ((err = __ip_route_output_key(net, rp, flp)) != 0) - return err; + if (IS_ERR(rt)) + return rt; - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, - flags ? XFRM_LOOKUP_WAIT : 0); - if (err == -EREMOTE) - err = ipv4_dst_blackhole(net, rp, flp); - - return err; + if (flp4->flowi4_proto) { + if (!flp4->saddr) + flp4->saddr = rt->rt_src; + if (!flp4->daddr) + flp4->daddr = rt->rt_dst; + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); } - return 0; + return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) -{ - return ip_route_output_flow(net, rp, flp, NULL, 0); -} -EXPORT_SYMBOL(ip_route_output_key); - static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) @@ -2817,7 +2783,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = rt->rt_tos; r->rtm_table = RT_TABLE_MAIN; NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); r->rtm_type = rt->rt_type; @@ -2829,19 +2795,19 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - if (rt->fl.fl4_src) { + if (rt->rt_key_src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); + NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt_is_input_route(rt)) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) + else if (rt->rt_src != rt->rt_key_src) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); if (rt->rt_dst != rt->rt_gateway) @@ -2850,11 +2816,12 @@ static int rt_fill_info(struct net *net, if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - if (rt->fl.mark) - NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); + if (rt->rt_mark) + NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; - expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; + expires = (rt->peer && rt->peer->pmtu_expires) ? + rt->peer->pmtu_expires - jiffies : 0; if (rt->peer) { inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; @@ -2884,7 +2851,7 @@ static int rt_fill_info(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); + NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); } if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, @@ -2958,14 +2925,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - struct flowi fl = { - .fl4_dst = dst, - .fl4_src = src, - .fl4_tos = rtm->rtm_tos, - .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - .mark = mark, + struct flowi4 fl4 = { + .daddr = dst, + .saddr = src, + .flowi4_tos = rtm->rtm_tos, + .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, + .flowi4_mark = mark, }; - err = ip_route_output_key(net, &rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } if (err) @@ -3248,6 +3219,8 @@ static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); return 0; } @@ -3256,9 +3229,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3274,7 +3247,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); @@ -3311,14 +3284,6 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 47519205a014..8b44c6d2a79b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi fl = { .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = IPPROTO_TCP, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = th->dest, - .fl_ip_dport = th->source }; - security_req_classify_flow(req, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) { + struct flowi4 fl4 = { + .flowi4_mark = sk->sk_mark, + .daddr = ((opt && opt->srr) ? + opt->faddr : ireq->rmt_addr), + .saddr = ireq->loc_addr, + .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_proto = IPPROTO_TCP, + .flowi4_flags = inet_sk_flowi_flags(sk), + .fl4_sport = th->dest, + .fl4_dport = th->source, + }; + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6c11eece262c..b22d45010545 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) else answ = tp->write_seq - tp->snd_una; break; + case SIOCOUTQNSD: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_nxt; + break; default: return -ENOIOCTLCMD; } @@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); - TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); - TCP_CHECK_TIMER(sk); release_sock(sk); return res; } @@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; lock_sock(sk); - TCP_CHECK_TIMER(sk); flags = msg->msg_flags; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -1104,7 +1110,6 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1123,7 +1128,6 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); - TCP_CHECK_TIMER(sk); release_sock(sk); return err; } @@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - TCP_CHECK_TIMER(sk); - err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; @@ -1767,12 +1769,10 @@ skip_copy: /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; out: - TCP_CHECK_TIMER(sk); release_sock(sk); return err; @@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 3b53fd1af23f..6187eb4d1dcf 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) } -static struct tcp_congestion_ops bictcp = { +static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f29fa6..34340c9c95fa 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -39,7 +39,7 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (2U<<3) +#define HYSTART_DELAY_MIN (4U<<3) #define HYSTART_DELAY_MAX (16U<<3) #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) @@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -85,7 +88,7 @@ struct bictcp { u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay */ + u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -95,7 +98,7 @@ struct bictcp { u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ - u32 last_jiffies; /* last time when the ACK spacing is close */ + u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; @@ -116,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 + return ktime_to_ms(ktime_get_real()); +#else + return jiffies_to_msecs(jiffies); +#endif +} + static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_jiffies = jiffies; + ca->round_start = ca->last_ack = bictcp_clock(); ca->end_seq = tp->snd_nxt; ca->curr_rtt = 0; ca->sample_cnt = 0; @@ -236,8 +248,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) */ /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) - << BICTCP_HZ) / HZ; + t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) + - ca->epoch_start) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; @@ -258,6 +270,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 100 * cwnd; /* very small increment*/ } + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. + */ + if (ca->loss_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -339,12 +358,12 @@ static void hystart_update(struct sock *sk, u32 delay) struct bictcp *ca = inet_csk_ca(sk); if (!(ca->found & hystart_detect)) { - u32 curr_jiffies = jiffies; + u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ - if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { - ca->last_jiffies = curr_jiffies; - if (curr_jiffies - ca->round_start >= ca->delay_min>>4) + if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + ca->last_ack = now; + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) ca->found |= HYSTART_ACK_TRAIN; } @@ -391,7 +410,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = usecs_to_jiffies(rtt_us) << 3; + delay = (rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; @@ -405,7 +424,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) hystart_update(sk, delay); } -static struct tcp_congestion_ops cubictcp = { +static struct tcp_congestion_ops cubictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, @@ -447,6 +466,10 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); + /* hystart needs ms clock resolution */ + if (hystart && HZ < 1000) + cubictcp.flags |= TCP_CONG_RTT_STAMP; + return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b6caaf75bb9..30f27f6b3655 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_highspeed = { +static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 7c94a4955416..c1a8175361e8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state) } } -static struct tcp_congestion_ops htcp = { +static struct tcp_congestion_ops htcp __read_mostly = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, .cong_avoid = htcp_cong_avoid, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 377bc9349371..fe3ecf484b44 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); } -static struct tcp_congestion_ops tcp_hybla = { +static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 00ca688d8964..813b43a76fec 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } } -static struct tcp_congestion_ops tcp_illinois = { +static struct tcp_congestion_ops tcp_illinois __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 65f6c0406245..bef9f04c22ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -2659,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const int undo) +static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); @@ -2671,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } - tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2699,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else @@ -2726,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } @@ -2779,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, 0); + tcp_undo_cwr(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. @@ -2808,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; @@ -2822,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk) static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; + /* Do not moderate cwnd if it's already undone in cwr or recovery */ + if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; + } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -3350,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); - else if (ca_seq_rtt > 0) + else if (ca_seq_rtt >= 0) rtt_us = jiffies_to_usecs(ca_seq_rtt); } @@ -3494,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); } /* F-RTO spurious RTO detection algorithm (RFC4138) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02f583b3744a..f7e6c2c2d2bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + __be16 orig_sport, orig_dport; struct rtable *rt; __be32 daddr, nexthop; - int tmp; int err; if (addr_len < sizeof(struct sockaddr_in)) @@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = inet->opt->faddr; } - tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->inet_sport, usin->sin_port, sk, 1); - if (tmp < 0) { - if (tmp == -ENETUNREACH) + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + rt = ip_route_connect(nexthop, inet->inet_saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + orig_sport, orig_dport, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); - return tmp; + return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { @@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - err = ip_route_newports(&rt, IPPROTO_TCP, - inet->inet_sport, inet->inet_dport, sk); - if (err) + rt = ip_route_newports(rt, IPPROTO_TCP, + orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto failure; - + } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); @@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->daddr.a4 == saddr) { + peer->daddr.addr.a4 == saddr) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > @@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb->rxhash); - TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; } @@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb->rxhash); - - TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; reset: diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index de870377fbba..656d431c99ad 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) lp->last_drop = tcp_time_stamp; } -static struct tcp_congestion_ops tcp_lp = { +static struct tcp_congestion_ops tcp_lp __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a76513779e2b..8ce55b8aaec8 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_scalable = { +static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74a6aa003657..ecd44b0c45f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data) tcp_send_ack(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } - TCP_CHECK_TIMER(sk); out: if (tcp_memory_pressure) @@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data) tcp_probe_timer(sk); break; } - TCP_CHECK_TIMER(sk); out: sk_mem_reclaim(sk); @@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp) - elapsed; } - TCP_CHECK_TIMER(sk); sk_mem_reclaim(sk); resched: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c6743eec9b7d..80fa2bfd7ede 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); -static struct tcp_congestion_ops tcp_vegas = { +static struct tcp_congestion_ops tcp_vegas __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 38bc0b52d745..ac43cd747bce 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static struct tcp_congestion_ops tcp_veno = { +static struct tcp_congestion_ops tcp_veno __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index a534dda5456e..1b91bf48e277 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, } -static struct tcp_congestion_ops tcp_westwood = { +static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a0f240358892..dc7f43179c9a 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { return tp->snd_cwnd - reduction; } -static struct tcp_congestion_ops tcp_yeah = { +static struct tcp_congestion_ops tcp_yeah __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17959ee..588f47af5faf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk) EXPORT_SYMBOL(udp_flush_pending_frames); /** - * udp4_hwcsum_outgoing - handle outgoing HW checksumming - * @sk: socket we are sending on + * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) + * @src: source IP address + * @dst: destination IP address */ -static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len) +static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { - unsigned int offset; struct udphdr *uh = udp_hdr(skb); + struct sk_buff *frags = skb_shinfo(skb)->frag_list; + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + int hlen = len; __wsum csum = 0; - if (skb_queue_len(&sk->sk_write_queue) == 1) { + if (!frags) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); + uh->check = ~csum_tcpudp_magic(src, dst, len, + IPPROTO_UDP, 0); } else { /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - offset = skb_transport_offset(skb); - skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + do { + csum = csum_add(csum, frags->csum); + hlen -= frags->len; + } while ((frags = frags->next)); + csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; - skb_queue_walk(&sk->sk_write_queue, skb) { - csum = csum_add(csum, skb->csum); - } - uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } -/* - * Push out all pending data as one UDP datagram. Socket is locked. - */ -static int udp_push_pending_frames(struct sock *sk) +static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport) { - struct udp_sock *up = udp_sk(sk); + struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct flowi *fl = &inet->cork.fl; - struct sk_buff *skb; struct udphdr *uh; + struct rtable *rt = (struct rtable *)skb_dst(skb); int err = 0; int is_udplite = IS_UDPLITE(sk); + int offset = skb_transport_offset(skb); + int len = skb->len - offset; __wsum csum = 0; - /* Grab the skbuff where UDP header space exists. */ - if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) - goto out; - /* * Create a UDP header */ uh = udp_hdr(skb); - uh->source = fl->fl_ip_sport; - uh->dest = fl->fl_ip_dport; - uh->len = htons(up->len); + uh->source = inet->inet_sport; + uh->dest = dport; + uh->len = htons(len); uh->check = 0; if (is_udplite) /* UDP-Lite */ - csum = udplite_csum_outgoing(sk, skb); + csum = udplite_csum(skb); else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ @@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); + udp4_hwcsum(skb, rt->rt_src, daddr); goto send; - } else /* `normal' UDP */ - csum = udp_csum_outgoing(sk, skb); + } else + csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, + uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: - err = ip_push_pending_frames(sk); + err = ip_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), @@ -763,6 +760,26 @@ send: } else UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); + return err; +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; + struct sk_buff *skb; + int err = 0; + + skb = ip_finish_skb(sk); + if (!skb) + goto out; + + err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport); + out: up->len = 0; up->pending = 0; @@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; @@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sk_buff *skb; if (len > 0xFFFF) return -EMSGSIZE; @@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; + getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + if (up->pending) { /* * There are pending frames. @@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = faddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = dport }; + struct flowi4 fl4 = { + .flowi4_oif = ipc.oif, + .flowi4_mark = sk->sk_mark, + .daddr = faddr, + .saddr = saddr, + .flowi4_tos = tos, + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = (inet_sk_flowi_flags(sk) | + FLOWI_FLAG_CAN_SLEEP), + .fl4_sport = inet->inet_sport, + .fl4_dport = dport, + }; struct net *net = sock_net(sk); - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(net, &rt, &fl, sk, 1); - if (err) { + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; @@ -923,6 +949,17 @@ back_from_confirm: if (!ipc.addr) daddr = ipc.addr = rt->rt_dst; + /* Lockless fast path for the non-corking case. */ + if (!corkreq) { + skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + msg->msg_flags); + err = PTR_ERR(skb); + if (skb && !IS_ERR(skb)) + err = udp_send_skb(skb, daddr, dport); + goto out; + } + lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ @@ -936,15 +973,15 @@ back_from_confirm: /* * Now cork the socket to pend data. */ - inet->cork.fl.fl4_dst = daddr; - inet->cork.fl.fl_ip_dport = dport; - inet->cork.fl.fl4_src = saddr; - inet->cork.fl.fl_ip_sport = inet->inet_sport; + fl4 = &inet->cork.fl.u.ip4; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->fl4_dport = dport; + fl4->fl4_sport = inet->inet_sport; up->pending = AF_INET; do_append_data: up->len += ulen; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); @@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b057d40addec..13e0e7f659ff 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -19,25 +19,23 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, - xfrm_address_t *saddr, - xfrm_address_t *daddr) + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi fl = { - .fl4_dst = daddr->a4, - .fl4_tos = tos, + struct flowi4 fl4 = { + .daddr = daddr->a4, + .flowi4_tos = tos, }; - struct dst_entry *dst; struct rtable *rt; - int err; if (saddr) - fl.fl4_src = saddr->a4; + fl4.saddr = saddr->a4; + + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) + return &rt->dst; - err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->dst; - if (err) - dst = ERR_PTR(err); - return dst; + return ERR_CAST(rt); } static int xfrm4_get_saddr(struct net *net, @@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net, return 0; } -static int xfrm4_get_tos(struct flowi *fl) +static int xfrm4_get_tos(const struct flowi *fl) { - return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ + return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ } static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -68,11 +66,17 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, } static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - struct flowi *fl) + const struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; + const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.fl = *fl; + rt->rt_key_dst = fl4->daddr; + rt->rt_key_src = fl4->saddr; + rt->rt_tos = fl4->flowi4_tos; + rt->rt_iif = fl4->flowi4_iif; + rt->rt_oif = fl4->flowi4_oif; + rt->rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); @@ -99,9 +103,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + struct flowi4 *fl4 = &fl->u.ip4; - memset(fl, 0, sizeof(struct flowi)); - fl->mark = skb->mark; + memset(fl4, 0, sizeof(struct flowi4)); + fl4->flowi4_mark = skb->mark; if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { switch (iph->protocol) { @@ -114,8 +119,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports = (__be16 *)xprth; - fl->fl_ip_sport = ports[!!reverse]; - fl->fl_ip_dport = ports[!reverse]; + fl4->fl4_sport = ports[!!reverse]; + fl4->fl4_dport = ports[!reverse]; } break; @@ -123,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp = xprth; - fl->fl_icmp_type = icmp[0]; - fl->fl_icmp_code = icmp[1]; + fl4->fl4_icmp_type = icmp[0]; + fl4->fl4_icmp_code = icmp[1]; } break; @@ -132,7 +137,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr = (__be32 *)xprth; - fl->fl_ipsec_spi = ehdr[0]; + fl4->fl4_ipsec_spi = ehdr[0]; } break; @@ -140,7 +145,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr = (__be32*)xprth; - fl->fl_ipsec_spi = ah_hdr[1]; + fl4->fl4_ipsec_spi = ah_hdr[1]; } break; @@ -148,7 +153,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr = (__be16 *)xprth; - fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); + fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; @@ -160,20 +165,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) gre_hdr++; - fl->fl_gre_key = gre_hdr[1]; + fl4->fl4_gre_key = gre_hdr[1]; } } break; default: - fl->fl_ipsec_spi = 0; + fl4->fl4_ipsec_spi = 0; break; } } - fl->proto = iph->protocol; - fl->fl4_dst = reverse ? iph->saddr : iph->daddr; - fl->fl4_src = reverse ? iph->daddr : iph->saddr; - fl->fl4_tos = iph->tos; + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; } static inline int xfrm4_garbage_collect(struct dst_ops *ops) @@ -196,8 +201,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt.peer)) inet_putpeer(xdst->u.rt.peer); + xfrm_dst_destroy(xdst); } @@ -215,6 +223,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, @@ -230,6 +239,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, + .blackhole_route = ipv4_blackhole_route, }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 47947624eccc..1717c64628d1 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x) } static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { - sel->daddr.a4 = fl->fl4_dst; - sel->saddr.a4 = fl->fl4_src; - sel->dport = xfrm_flowi_dport(fl); + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; - sel->proto = fl->proto; - sel->ifindex = fl->oif; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; } static void -xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) { x->id = tmpl->id; if (x->id.daddr.a4 == 0) |