From 90ba9b1986b5ac4b2d184575847147ea7c4280a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 Jun 2012 19:50:43 +0000
Subject: tcp: tcp_make_synack() can use alloc_skb()

There is no value using sock_wmalloc() in tcp_make_synack().

A listener socket only sends SYNACK packets, they are not queued in a
socket queue, only in Qdisc and device layers, so the number of in
flight packets is limited in these layers. We used sock_wmalloc() with
the %force parameter set to 1 to ignore socket limits anyway.

This patch removes two atomic operations per SYNACK packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..f0b0e4414b00 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2461,7 +2461,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
 	if (skb == NULL)
 		return NULL;
 
-- 
cgit v1.2.3


From 4aea39c11c610e411768649fdc04777903ebfe07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 Jun 2012 20:33:21 +0000
Subject: tcp: tcp_make_synack() consumes dst parameter

tcp_make_synack() clones the dst, and callers release it.

We can avoid two atomic operations per SYNACK if tcp_make_synack()
consumes dst instead of cloning it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c   |  1 -
 net/ipv4/tcp_output.c | 18 ++++++++++++++----
 net/ipv6/tcp_ipv6.c   |  1 -
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c8d28c433b2b..3d9c1a4b8819 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -848,7 +848,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 		err = net_xmit_eval(err);
 	}
 
-	dst_release(dst);
 	return err;
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f0b0e4414b00..c465d3e51e28 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2442,7 +2442,16 @@ int tcp_send_synack(struct sock *sk)
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
-/* Prepare a SYN-ACK. */
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ * rvp: request_values pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct request_values *rvp)
@@ -2462,13 +2471,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
 	skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
-	if (skb == NULL)
+	if (unlikely(!skb)) {
+		dst_release(dst);
 		return NULL;
-
+	}
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
-	skb_dst_set(skb, dst_clone(dst));
+	skb_dst_set(skb, dst);
 
 	mss = dst_metric_advmss(dst);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3a9aec29581a..80758255556c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -522,7 +522,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 done:
 	if (opt && opt != np->opt)
 		sock_kfree_s(sk, opt, opt->tot_len);
-	dst_release(dst);
 	return err;
 }
 
-- 
cgit v1.2.3


From 5d0ba55b6486f58cc890918d7167063d83f7fbb4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Jun 2012 01:17:19 +0000
Subject: net: use consume_skb() in place of kfree_skb()

Remove some dropwatch/drop_monitor false positives.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/lec.c                   | 6 ++++--
 net/atm/pppoatm.c               | 2 +-
 net/ax25/ax25_out.c             | 2 +-
 net/ax25/ax25_route.c           | 2 +-
 net/decnet/dn_neigh.c           | 6 +++---
 net/ipv4/ip_output.c            | 4 ++--
 net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
 7 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/atm/lec.c b/net/atm/lec.c
index a7d172105c99..3da125c384ea 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -231,9 +231,11 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
 	if (skb_headroom(skb) < 2) {
 		pr_debug("reallocating skb\n");
 		skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
-		kfree_skb(skb);
-		if (skb2 == NULL)
+		if (unlikely(!skb2)) {
+			kfree_skb(skb);
 			return NETDEV_TX_OK;
+		}
+		consume_skb(skb);
 		skb = skb2;
 	}
 	skb_push(skb, 2);
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index ce1e59fdae7b..226dca989448 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -283,7 +283,7 @@ static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb)
 				kfree_skb(n);
 				goto nospace;
 			}
-			kfree_skb(skb);
+			consume_skb(skb);
 			skb = n;
 			if (skb == NULL)
 				return DROP_PACKET;
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index be8a25e0db65..be2acab9be9d 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -350,7 +350,7 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
 		if (skb->sk != NULL)
 			skb_set_owner_w(skbn, skb->sk);
 
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skbn;
 	}
 
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index a65588040b9e..d39097737e38 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -474,7 +474,7 @@ struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
 		if (skb->sk != NULL)
 			skb_set_owner_w(skbn, skb->sk);
 
-		kfree_skb(skb);
+		consume_skb(skb);
 
 		skb = skbn;
 	}
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index ac90f658586c..8e9a35b17df4 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -240,7 +240,7 @@ static int dn_long_output(struct neighbour *neigh, struct sk_buff *skb)
 			kfree_skb(skb);
 			return -ENOBUFS;
 		}
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skb2;
 		net_info_ratelimited("dn_long_output: Increasing headroom\n");
 	}
@@ -283,7 +283,7 @@ static int dn_short_output(struct neighbour *neigh, struct sk_buff *skb)
 			kfree_skb(skb);
 			return -ENOBUFS;
 		}
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skb2;
 		net_info_ratelimited("dn_short_output: Increasing headroom\n");
 	}
@@ -322,7 +322,7 @@ static int dn_phase3_output(struct neighbour *neigh, struct sk_buff *skb)
 			kfree_skb(skb);
 			return -ENOBUFS;
 		}
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skb2;
 		net_info_ratelimited("dn_phase3_output: Increasing headroom\n");
 	}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451f97c42eb4..b99ca4e154b9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -200,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		}
 		if (skb->sk)
 			skb_set_owner_w(skb2, skb->sk);
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skb2;
 	}
 
@@ -709,7 +709,7 @@ slow_path:
 
 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 	}
-	kfree_skb(skb);
+	consume_skb(skb);
 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 	return err;
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 7fd66dec859d..71d6ecb65926 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -823,7 +823,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_ERR_RL("%s(): no memory\n", __func__);
 			return NF_STOLEN;
 		}
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = new_skb;
 		old_iph = ip_hdr(skb);
 	}
@@ -942,7 +942,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			IP_VS_ERR_RL("%s(): no memory\n", __func__);
 			return NF_STOLEN;
 		}
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = new_skb;
 		old_iph = ipv6_hdr(skb);
 	}
-- 
cgit v1.2.3


From e3192690a3c889767d1161b228374f4926d92af0 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 3 Jun 2012 17:41:40 +0000
Subject: net: Remove casts to same type

Adding casts of objects to the same type is unnecessary
and confusing for a human reader.

For example, this cast:

	int y;
	int *p = (int *)&y;

I used the coccinelle script below to find and remove these
unnecessary casts.  I manually removed the conversions this
script produces of casts with __force and __user.

@@
type T;
T *p;
@@

-	(T *)p
+	p

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/9p/client.c                        |  2 +-
 net/atm/lec.c                          |  2 +-
 net/decnet/dn_nsp_out.c                |  2 +-
 net/ipv4/af_inet.c                     |  2 +-
 net/ipv4/fib_trie.c                    | 13 ++++++-------
 net/ipv4/netfilter/nf_nat_snmp_basic.c |  4 ++--
 net/ipv6/exthdrs.c                     |  4 ++--
 net/irda/irqueue.c                     |  6 +++---
 net/l2tp/l2tp_ppp.c                    |  8 ++++----
 net/mac80211/scan.c                    |  3 +--
 net/netfilter/nf_conntrack_core.c      |  2 +-
 net/packet/af_packet.c                 |  9 ++++-----
 net/tipc/port.c                        |  9 ++++-----
 net/tipc/socket.c                      |  2 +-
 14 files changed, 32 insertions(+), 36 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/9p/client.c b/net/9p/client.c
index a170893d70e0..5cbea903a5ab 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1548,7 +1548,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset,
 			kernel_buf = 1;
 			indata = data;
 		} else
-			indata = (char *)udata;
+			indata = udata;
 		/*
 		 * response header len is 11
 		 * PDU Header(7) + IO Size (4)
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 3da125c384ea..2e3d942e77f1 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1604,7 +1604,7 @@ static void lec_arp_expire_vcc(unsigned long data)
 {
 	unsigned long flags;
 	struct lec_arp_table *to_remove = (struct lec_arp_table *)data;
-	struct lec_priv *priv = (struct lec_priv *)to_remove->priv;
+	struct lec_priv *priv = to_remove->priv;
 
 	del_timer(&to_remove->timer);
 
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 564a6ad13ce7..8a96047c7c94 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -322,7 +322,7 @@ static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned c
 	/* Set "cross subchannel" bit in ackcrs */
 	ackcrs |= 0x2000;
 
-	ptr = (__le16 *)dn_mk_common_header(scp, skb, msgflag, hlen);
+	ptr = dn_mk_common_header(scp, skb, msgflag, hlen);
 
 	*ptr++ = cpu_to_le16(acknum);
 	*ptr++ = cpu_to_le16(ackcrs);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..e4e8e00a2c91 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -553,7 +553,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
-	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+	return sk->sk_prot->connect(sk, uaddr, addr_len);
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
-		tn = (struct tnode *) resize(t, (struct tnode *)tn);
+		tn = (struct tnode *)resize(t, tn);
 
-		tnode_put_child_reorg((struct tnode *)tp, cindex,
+		tnode_put_child_reorg(tp, cindex,
 				      (struct rt_trie_node *)tn, wasfull);
 
 		tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	/* Handle last (top) tnode */
 	if (IS_TNODE(tn))
-		tn = (struct tnode *)resize(t, (struct tnode *)tn);
+		tn = (struct tnode *)resize(t, tn);
 
 	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 	tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 		node_set_parent((struct rt_trie_node *)l, tp);
 
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
+		put_child(t, tp, cindex, (struct rt_trie_node *)l);
 	} else {
 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, (struct tnode *)tp, cindex,
-				  (struct rt_trie_node *)tn);
+			put_child(t, tp, cindex, (struct rt_trie_node *)tn);
 		} else {
 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 			tp = tn;
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 
 	if (tp) {
 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
-		put_child(t, (struct tnode *)tp, cindex, NULL);
+		put_child(t, tp, cindex, NULL);
 		trie_rebalance(t, tp);
 	} else
 		RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 746edec8b86e..bac712293fd6 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
 
 	ptr = *octets;
 	while (ctx->pointer < eoc) {
-		if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
+		if (!asn1_octet_decode(ctx, ptr++)) {
 			kfree(*octets);
 			*octets = NULL;
 			return 0;
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
 		}
 		break;
 	case SNMP_OBJECTID:
-		if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+		if (!asn1_oid_decode(ctx, end, &lp, &len)) {
 			kfree(id);
 			return 0;
 		}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 6447dc49429f..fa3d9c328092 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -791,14 +791,14 @@ static int ipv6_renew_option(void *ohdr,
 		if (ohdr) {
 			memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
 			*hdr = (struct ipv6_opt_hdr *)*p;
-			*p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr));
+			*p += CMSG_ALIGN(ipv6_optlen(*hdr));
 		}
 	} else {
 		if (newopt) {
 			if (copy_from_user(*p, newopt, newoptlen))
 				return -EFAULT;
 			*hdr = (struct ipv6_opt_hdr *)*p;
-			if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen)
+			if (ipv6_optlen(*hdr) > newoptlen)
 				return -EINVAL;
 			*p += CMSG_ALIGN(newoptlen);
 		}
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index f06947c4fa82..7152624ed5f1 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -523,7 +523,7 @@ void *hashbin_remove_first( hashbin_t *hashbin)
 		 * Dequeue the entry...
 		 */
 		dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
-				 (irda_queue_t*) entry );
+				 entry);
 		hashbin->hb_size--;
 		entry->q_next = NULL;
 		entry->q_prev = NULL;
@@ -615,7 +615,7 @@ void* hashbin_remove( hashbin_t* hashbin, long hashv, const char* name)
 	 */
 	if ( found ) {
 		dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
-				 (irda_queue_t*) entry );
+				 entry);
 		hashbin->hb_size--;
 
 		/*
@@ -685,7 +685,7 @@ void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry)
 	 * Dequeue the entry...
 	 */
 	dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
-			 (irda_queue_t*) entry );
+			 entry);
 	hashbin->hb_size--;
 	entry->q_next = NULL;
 	entry->q_prev = NULL;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 8ef6b9416cba..286366ef8930 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1522,8 +1522,8 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
  * handler, according to whether the PPPoX socket is a for a regular session
  * or the special tunnel type.
  */
-static int pppol2tp_getsockopt(struct socket *sock, int level,
-			       int optname, char __user *optval, int __user *optlen)
+static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
+			       char __user *optval, int __user *optlen)
 {
 	struct sock *sk = sock->sk;
 	struct l2tp_session *session;
@@ -1535,7 +1535,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level,
 	if (level != SOL_PPPOL2TP)
 		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
 
-	if (get_user(len, (int __user *) optlen))
+	if (get_user(len, optlen))
 		return -EFAULT;
 
 	len = min_t(unsigned int, len, sizeof(int));
@@ -1568,7 +1568,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level,
 		err = pppol2tp_session_getsockopt(sk, session, optname, &val);
 
 	err = -EFAULT;
-	if (put_user(len, (int __user *) optlen))
+	if (put_user(len, optlen))
 		goto end_put_sess;
 
 	if (copy_to_user((void __user *) optval, &val, len))
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 169da0742c81..6d90a562669f 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -114,8 +114,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 
 	if (elems->tim && (!elems->parse_error ||
 			   !(bss->valid_data & IEEE80211_BSS_VALID_DTIM))) {
-		struct ieee80211_tim_ie *tim_ie =
-			(struct ieee80211_tim_ie *)elems->tim;
+		struct ieee80211_tim_ie *tim_ie = elems->tim;
 		bss->dtim_period = tim_ie->dtim_period;
 		if (!elems->parse_error)
 				bss->valid_data |= IEEE80211_BSS_VALID_DTIM;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac3af97cc468..95976a593b98 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -531,7 +531,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp) {
 		if (skb->tstamp.tv64 == 0)
-			__net_timestamp((struct sk_buff *)skb);
+			__net_timestamp(skb);
 
 		tstamp->start = ktime_to_ns(skb->tstamp);
 	}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0f661745df0f..71ac6559e0c6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -592,7 +592,7 @@ static void init_prb_bdqc(struct packet_sock *po,
 	p1->knxt_seq_num = 1;
 	p1->pkbdq = pg_vec;
 	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
-	p1->pkblk_start	= (char *)pg_vec[0].buffer;
+	p1->pkblk_start	= pg_vec[0].buffer;
 	p1->kblk_size = req_u->req3.tp_block_size;
 	p1->knum_blocks	= req_u->req3.tp_block_nr;
 	p1->hdrlen = po->tp_hdrlen;
@@ -824,8 +824,7 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 		h1->ts_first_pkt.ts_sec = ts.tv_sec;
 		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
 		pkc1->pkblk_start = (char *)pbd1;
-		pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
-		BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
+		pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
 		pbd1->version = pkc1->version;
@@ -1018,7 +1017,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po,
 	struct tpacket_block_desc *pbd;
 	char *curr, *end;
 
-	pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
+	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
 	/* Queue is frozen when user space is lagging behind */
@@ -1044,7 +1043,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po,
 	smp_mb();
 	curr = pkc->nxt_offset;
 	pkc->skb = skb;
-	end = (char *) ((char *)pbd + pkc->kblk_size);
+	end = (char *)pbd + pkc->kblk_size;
 
 	/* first try the current block */
 	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 2ad37a4db376..a1e828989d7a 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -909,8 +909,8 @@ int tipc_createport(void *usr_handle,
 		warn("Port creation failed, no memory\n");
 		return -ENOMEM;
 	}
-	p_ptr = (struct tipc_port *)tipc_createport_raw(NULL, port_dispatcher,
-						   port_wakeup, importance);
+	p_ptr = tipc_createport_raw(NULL, port_dispatcher, port_wakeup,
+				    importance);
 	if (!p_ptr) {
 		kfree(up_ptr);
 		return -ENOMEM;
@@ -1078,8 +1078,7 @@ int tipc_disconnect_port(struct tipc_port *tp_ptr)
 	if (tp_ptr->connected) {
 		tp_ptr->connected = 0;
 		/* let timer expire on it's own to avoid deadlock! */
-		tipc_nodesub_unsubscribe(
-			&((struct tipc_port *)tp_ptr)->subscription);
+		tipc_nodesub_unsubscribe(&tp_ptr->subscription);
 		res = 0;
 	} else {
 		res = -ENOTCONN;
@@ -1099,7 +1098,7 @@ int tipc_disconnect(u32 ref)
 	p_ptr = tipc_port_lock(ref);
 	if (!p_ptr)
 		return -EINVAL;
-	res = tipc_disconnect_port((struct tipc_port *)p_ptr);
+	res = tipc_disconnect_port(p_ptr);
 	tipc_port_unlock(p_ptr);
 	return res;
 }
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 5577a447f531..11a863d81421 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -54,7 +54,7 @@ struct tipc_sock {
 };
 
 #define tipc_sk(sk) ((struct tipc_sock *)(sk))
-#define tipc_sk_port(sk) ((struct tipc_port *)(tipc_sk(sk)->p))
+#define tipc_sk_port(sk) (tipc_sk(sk)->p)
 
 #define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \
 			(sock->state == SS_DISCONNECTING))
-- 
cgit v1.2.3


From 2c352f444ccfa966a1aa4fd8e9ee29381c467448 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitus.com>
Date: Mon, 28 May 2012 21:04:09 +0000
Subject: netfilter: nf_conntrack: prepare namespace support for l4 protocol
 trackers

This patch prepares the namespace support for layer 4 protocol trackers.
Basically, this modifies the following interfaces:

* nf_ct_[un]register_sysctl
* nf_conntrack_l4proto_[un]register

to include the namespace parameter. We still use init_net in this patch
to prepare the ground for follow-up patches for each layer 4 protocol
tracker.

We add a new net_id field to struct nf_conntrack_l4proto that is used
to store the pernet_operations id for each layer 4 protocol tracker.

Note that AF_INET6's protocols do not need to do sysctl compat. Thus,
we only register compat sysctl when l4proto.l3proto != AF_INET6.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h   |  11 +-
 include/net/netns/conntrack.h                  |  12 +++
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  18 ++--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  18 ++--
 net/netfilter/nf_conntrack_proto.c             | 143 ++++++++++++++++++-------
 net/netfilter/nf_conntrack_proto_dccp.c        |  10 +-
 net/netfilter/nf_conntrack_proto_gre.c         |   6 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |  10 +-
 net/netfilter/nf_conntrack_proto_udplite.c     |  10 +-
 9 files changed, 159 insertions(+), 79 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 3b572bb20aa2..d621c91de5c8 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -12,6 +12,7 @@
 #include <linux/netlink.h>
 #include <net/netlink.h>
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netns/generic.h>
 
 struct seq_file;
 
@@ -103,6 +104,10 @@ struct nf_conntrack_l4proto {
 	struct ctl_table	*ctl_compat_table;
 #endif
 #endif
+	int	*net_id;
+	/* Init l4proto pernet data */
+	int (*init_net)(struct net *net);
+
 	/* Protocol name */
 	const char *name;
 
@@ -123,8 +128,10 @@ nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto);
 extern void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p);
 
 /* Protocol registration. */
-extern int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *proto);
-extern void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *proto);
+extern int nf_conntrack_l4proto_register(struct net *net,
+					 struct nf_conntrack_l4proto *proto);
+extern void nf_conntrack_l4proto_unregister(struct net *net,
+					    struct nf_conntrack_l4proto *proto);
 
 /* Generic netlink helpers */
 extern int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index a053a19870cf..1f53038b0d1b 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -8,6 +8,18 @@
 struct ctl_table_header;
 struct nf_conntrack_ecache;
 
+struct nf_proto_net {
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header *ctl_table_header;
+	struct ctl_table        *ctl_table;
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	struct ctl_table_header *ctl_compat_header;
+	struct ctl_table        *ctl_compat_table;
+#endif
+#endif
+	unsigned int		users;
+};
+
 struct netns_ct {
 	atomic_t		count;
 	unsigned int		expect_count;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 91747d4ebc26..46ec515db129 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -391,19 +391,19 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 		return ret;
 	}
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_tcp4);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv4: can't register tcp.\n");
 		goto cleanup_sockopt;
 	}
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_udp4);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv4: can't register udp.\n");
 		goto cleanup_tcp;
 	}
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_icmp);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv4: can't register icmp.\n");
 		goto cleanup_udp;
@@ -434,11 +434,11 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
  cleanup_ipv4:
 	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
  cleanup_icmp:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
  cleanup_udp:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp4);
  cleanup_tcp:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp4);
  cleanup_sockopt:
 	nf_unregister_sockopt(&so_getorigdst);
 	return ret;
@@ -452,9 +452,9 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
 #endif
 	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
 	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp4);
 	nf_unregister_sockopt(&so_getorigdst);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 3224ef90a21a..51ad9f104421 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -340,19 +340,19 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 	need_conntrack();
 	nf_defrag_ipv6_enable();
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_tcp6);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv6: can't register tcp.\n");
 		return ret;
 	}
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_udp6);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv6: can't register udp.\n");
 		goto cleanup_tcp;
 	}
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_icmpv6);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv6: can't register icmpv6.\n");
 		goto cleanup_udp;
@@ -376,11 +376,11 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
  cleanup_ipv6:
 	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
  cleanup_icmpv6:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmpv6);
  cleanup_udp:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp6);
  cleanup_tcp:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp6);
 	return ret;
 }
 
@@ -389,9 +389,9 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
 	synchronize_net();
 	nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
 	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmpv6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp6);
 }
 
 module_init(nf_conntrack_l3proto_ipv6_init);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 8b631b07a645..7ee31ac0a12c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -36,28 +36,35 @@ static DEFINE_MUTEX(nf_ct_proto_mutex);
 
 #ifdef CONFIG_SYSCTL
 static int
-nf_ct_register_sysctl(struct ctl_table_header **header, const char *path,
-		      struct ctl_table *table, unsigned int *users)
+nf_ct_register_sysctl(struct net *net,
+		      struct ctl_table_header **header,
+		      const char *path,
+		      struct ctl_table *table,
+		      unsigned int *users)
 {
 	if (*header == NULL) {
-		*header = register_net_sysctl(&init_net, path, table);
+		*header = register_net_sysctl(net, path, table);
 		if (*header == NULL)
 			return -ENOMEM;
 	}
 	if (users != NULL)
 		(*users)++;
+
 	return 0;
 }
 
 static void
 nf_ct_unregister_sysctl(struct ctl_table_header **header,
-			struct ctl_table *table, unsigned int *users)
+			struct ctl_table **table,
+			unsigned int *users)
 {
 	if (users != NULL && --*users > 0)
 		return;
 
 	unregister_net_sysctl_table(*header);
+	kfree(*table);
 	*header = NULL;
+	*table = NULL;
 }
 #endif
 
@@ -167,7 +174,8 @@ static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
 
 #ifdef CONFIG_SYSCTL
 	if (l3proto->ctl_table != NULL) {
-		err = nf_ct_register_sysctl(&l3proto->ctl_table_header,
+		err = nf_ct_register_sysctl(&init_net,
+					    &l3proto->ctl_table_header,
 					    l3proto->ctl_table_path,
 					    l3proto->ctl_table, NULL);
 	}
@@ -180,7 +188,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto
 #ifdef CONFIG_SYSCTL
 	if (l3proto->ctl_table_header != NULL)
 		nf_ct_unregister_sysctl(&l3proto->ctl_table_header,
-					l3proto->ctl_table, NULL);
+					&l3proto->ctl_table, NULL);
 #endif
 }
 
@@ -243,29 +251,54 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
 
-static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto)
+static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
+					      struct nf_conntrack_l4proto *l4proto)
+{
+	if (l4proto->net_id)
+		return net_generic(net, *l4proto->net_id);
+	else
+		return NULL;
+}
+
+static
+int nf_ct_l4proto_register_sysctl(struct net *net,
+				  struct nf_conntrack_l4proto *l4proto)
 {
 	int err = 0;
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
+	if (pn == NULL)
+		return 0;
 
 #ifdef CONFIG_SYSCTL
-	if (l4proto->ctl_table != NULL) {
-		err = nf_ct_register_sysctl(l4proto->ctl_table_header,
+	if (pn->ctl_table != NULL) {
+		err = nf_ct_register_sysctl(net,
+					    &pn->ctl_table_header,
 					    "net/netfilter",
-					    l4proto->ctl_table,
-					    l4proto->ctl_table_users);
-		if (err < 0)
+					    pn->ctl_table,
+					    &pn->users);
+		if (err < 0) {
+			if (!pn->users) {
+				kfree(pn->ctl_table);
+				pn->ctl_table = NULL;
+			}
 			goto out;
+		}
 	}
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	if (l4proto->ctl_compat_table != NULL) {
-		err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header,
+	if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
+		err = nf_ct_register_sysctl(net,
+					    &pn->ctl_compat_header,
 					    "net/ipv4/netfilter",
-					    l4proto->ctl_compat_table, NULL);
+					    pn->ctl_compat_table,
+					    NULL);
 		if (err == 0)
 			goto out;
-		nf_ct_unregister_sysctl(l4proto->ctl_table_header,
-					l4proto->ctl_table,
-					l4proto->ctl_table_users);
+
+		kfree(pn->ctl_compat_table);
+		pn->ctl_compat_table = NULL;
+		nf_ct_unregister_sysctl(&pn->ctl_table_header,
+					&pn->ctl_table,
+					&pn->users);
 	}
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 out:
@@ -273,25 +306,34 @@ out:
 	return err;
 }
 
-static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto)
+static
+void nf_ct_l4proto_unregister_sysctl(struct net *net,
+				     struct nf_conntrack_l4proto *l4proto)
 {
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
+	if (pn == NULL)
+		return;
 #ifdef CONFIG_SYSCTL
-	if (l4proto->ctl_table_header != NULL &&
-	    *l4proto->ctl_table_header != NULL)
-		nf_ct_unregister_sysctl(l4proto->ctl_table_header,
-					l4proto->ctl_table,
-					l4proto->ctl_table_users);
+	if (pn->ctl_table_header != NULL)
+		nf_ct_unregister_sysctl(&pn->ctl_table_header,
+					&pn->ctl_table,
+					&pn->users);
+
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	if (l4proto->ctl_compat_table_header != NULL)
-		nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header,
-					l4proto->ctl_compat_table, NULL);
+	if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
+		nf_ct_unregister_sysctl(&pn->ctl_compat_header,
+					&pn->ctl_compat_table,
+					NULL);
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#else
+	pn->users--;
 #endif /* CONFIG_SYSCTL */
 }
 
 /* FIXME: Allow NULL functions and sub in pointers to generic for
    them. --RR */
-int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+static int
+nf_conntrack_l4proto_register_net(struct nf_conntrack_l4proto *l4proto)
 {
 	int ret = 0;
 
@@ -333,10 +375,6 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
 		goto out_unlock;
 	}
 
-	ret = nf_ct_l4proto_register_sysctl(l4proto);
-	if (ret < 0)
-		goto out_unlock;
-
 	l4proto->nla_size = 0;
 	if (l4proto->nlattr_size)
 		l4proto->nla_size += l4proto->nlattr_size();
@@ -345,17 +383,34 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
 
 	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
 			   l4proto);
-
 out_unlock:
 	mutex_unlock(&nf_ct_proto_mutex);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
 
-void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+int nf_conntrack_l4proto_register(struct net *net,
+				  struct nf_conntrack_l4proto *l4proto)
 {
-	struct net *net;
+	int ret = 0;
+	if (net == &init_net)
+		ret = nf_conntrack_l4proto_register_net(l4proto);
+
+	if (ret < 0)
+		return ret;
+
+	if (l4proto->init_net)
+		ret = l4proto->init_net(net);
 
+	if (ret < 0)
+		return ret;
+
+	return nf_ct_l4proto_register_sysctl(net, l4proto);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
+
+static void
+nf_conntrack_l4proto_unregister_net(struct nf_conntrack_l4proto *l4proto)
+{
 	BUG_ON(l4proto->l3proto >= PF_MAX);
 
 	mutex_lock(&nf_ct_proto_mutex);
@@ -365,15 +420,21 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
 			) != l4proto);
 	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
 			   &nf_conntrack_l4proto_generic);
-	nf_ct_l4proto_unregister_sysctl(l4proto);
 	mutex_unlock(&nf_ct_proto_mutex);
 
 	synchronize_rcu();
+}
 
+void nf_conntrack_l4proto_unregister(struct net *net,
+				     struct nf_conntrack_l4proto *l4proto)
+{
+	if (net == &init_net)
+		nf_conntrack_l4proto_unregister_net(l4proto);
+
+	nf_ct_l4proto_unregister_sysctl(net, l4proto);
 	/* Remove all contrack entries for this protocol */
 	rtnl_lock();
-	for_each_net(net)
-		nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
+	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
 	rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
@@ -383,7 +444,7 @@ int nf_conntrack_proto_init(void)
 	unsigned int i;
 	int err;
 
-	err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic);
+	err = nf_ct_l4proto_register_sysctl(&init_net, &nf_conntrack_l4proto_generic);
 	if (err < 0)
 		return err;
 
@@ -397,7 +458,7 @@ void nf_conntrack_proto_fini(void)
 {
 	unsigned int i;
 
-	nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic);
+	nf_ct_l4proto_unregister_sysctl(&init_net, &nf_conntrack_l4proto_generic);
 
 	/* free l3proto protocol tables */
 	for (i = 0; i < PF_MAX; i++)
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index ef706a485be1..5a8e03724289 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -945,17 +945,17 @@ static int __init nf_conntrack_proto_dccp_init(void)
 	if (err < 0)
 		goto err1;
 
-	err = nf_conntrack_l4proto_register(&dccp_proto4);
+	err = nf_conntrack_l4proto_register(&init_net, &dccp_proto4);
 	if (err < 0)
 		goto err2;
 
-	err = nf_conntrack_l4proto_register(&dccp_proto6);
+	err = nf_conntrack_l4proto_register(&init_net, &dccp_proto6);
 	if (err < 0)
 		goto err3;
 	return 0;
 
 err3:
-	nf_conntrack_l4proto_unregister(&dccp_proto4);
+	nf_conntrack_l4proto_unregister(&init_net, &dccp_proto4);
 err2:
 	unregister_pernet_subsys(&dccp_net_ops);
 err1:
@@ -965,8 +965,8 @@ err1:
 static void __exit nf_conntrack_proto_dccp_fini(void)
 {
 	unregister_pernet_subsys(&dccp_net_ops);
-	nf_conntrack_l4proto_unregister(&dccp_proto6);
-	nf_conntrack_l4proto_unregister(&dccp_proto4);
+	nf_conntrack_l4proto_unregister(&init_net, &dccp_proto6);
+	nf_conntrack_l4proto_unregister(&init_net, &dccp_proto4);
 }
 
 module_init(nf_conntrack_proto_dccp_init);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 4bf6b4e4b776..132f0d2d82cc 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -396,18 +396,18 @@ static int __init nf_ct_proto_gre_init(void)
 {
 	int rv;
 
-	rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
+	rv = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_gre4);
 	if (rv < 0)
 		return rv;
 	rv = register_pernet_subsys(&proto_gre_net_ops);
 	if (rv < 0)
-		nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+		nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_gre4);
 	return rv;
 }
 
 static void __exit nf_ct_proto_gre_fini(void)
 {
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_gre4);
 	unregister_pernet_subsys(&proto_gre_net_ops);
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 996db2fa21f7..97bbc20e1a2b 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -791,12 +791,12 @@ static int __init nf_conntrack_proto_sctp_init(void)
 {
 	int ret;
 
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_sctp4);
 	if (ret) {
 		pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n");
 		goto out;
 	}
-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6);
+	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_sctp6);
 	if (ret) {
 		pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n");
 		goto cleanup_sctp4;
@@ -805,15 +805,15 @@ static int __init nf_conntrack_proto_sctp_init(void)
 	return ret;
 
  cleanup_sctp4:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_sctp4);
  out:
 	return ret;
 }
 
 static void __exit nf_conntrack_proto_sctp_fini(void)
 {
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_sctp6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_sctp4);
 }
 
 module_init(nf_conntrack_proto_sctp_init);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 4d60a5376aa6..fa142a81496c 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -299,23 +299,23 @@ static int __init nf_conntrack_proto_udplite_init(void)
 {
 	int err;
 
-	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4);
+	err = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_udplite4);
 	if (err < 0)
 		goto err1;
-	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6);
+	err = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_udplite6);
 	if (err < 0)
 		goto err2;
 	return 0;
 err2:
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udplite4);
 err1:
 	return err;
 }
 
 static void __exit nf_conntrack_proto_udplite_exit(void)
 {
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udplite6);
+	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udplite4);
 }
 
 module_init(nf_conntrack_proto_udplite_init);
-- 
cgit v1.2.3


From 524a53e5ad5f34f64ed34281e8b0eca19437db5b Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Mon, 28 May 2012 21:04:10 +0000
Subject: netfilter: nf_conntrack: prepare namespace support for l3 protocol
 trackers

This patch prepares the namespace support for layer 3 protocol trackers.
Basically, this modifies the following interfaces:

* nf_ct_l3proto_[un]register_sysctl.
* nf_conntrack_l3proto_[un]register.

We add a new nf_ct_l3proto_net is used to get the pernet data of l3proto.

This adds rhe new struct nf_ip_net that is used to store the sysctl header
and l3proto_ipv4,l4proto_tcp(6),l4proto_udp(6),l4proto_icmp(v6) because the
protos such tcp and tcp6 use the same data,so making nf_ip_net as a field
of netns_ct is the easiest way to manager it.

This patch also adds init_net to struct nf_conntrack_l3proto to initial
the layer 3 protocol pernet data.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h   |  9 ++-
 include/net/netns/conntrack.h                  |  8 +++
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  6 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  6 +-
 net/netfilter/nf_conntrack_proto.c             | 92 ++++++++++++++++++++------
 5 files changed, 91 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 9699c028b74b..d6df8c71a7fe 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -69,6 +69,9 @@ struct nf_conntrack_l3proto {
 	struct ctl_table	*ctl_table;
 #endif /* CONFIG_SYSCTL */
 
+	/* Init l3proto pernet data */
+	int (*init_net)(struct net *net);
+
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
@@ -76,8 +79,10 @@ struct nf_conntrack_l3proto {
 extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX];
 
 /* Protocol registration. */
-extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto);
-extern void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto);
+extern int nf_conntrack_l3proto_register(struct net *net,
+					 struct nf_conntrack_l3proto *proto);
+extern void nf_conntrack_l3proto_unregister(struct net *net,
+					    struct nf_conntrack_l3proto *proto);
 extern struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto);
 extern void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p);
 
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 1f53038b0d1b..b2dbcc5cd813 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -20,6 +20,13 @@ struct nf_proto_net {
 	unsigned int		users;
 };
 
+struct nf_ip_net {
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	struct ctl_table_header *ctl_table_header;
+	struct ctl_table	*ctl_table;
+#endif
+};
+
 struct netns_ct {
 	atomic_t		count;
 	unsigned int		expect_count;
@@ -40,6 +47,7 @@ struct netns_ct {
 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
 	int			sysctl_auto_assign_helper;
 	bool			auto_assign_helper_warned;
+	struct nf_ip_net	nf_ct_proto;
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_header;
 	struct ctl_table_header	*acct_sysctl_header;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 46ec515db129..0c0fb906c19d 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -409,7 +409,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 		goto cleanup_udp;
 	}
 
-	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
+	ret = nf_conntrack_l3proto_register(&init_net, &nf_conntrack_l3proto_ipv4);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv4: can't register ipv4\n");
 		goto cleanup_icmp;
@@ -432,7 +432,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
 #endif
  cleanup_ipv4:
-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv4);
  cleanup_icmp:
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
  cleanup_udp:
@@ -451,7 +451,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
 	nf_conntrack_ipv4_compat_fini();
 #endif
 	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv4);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp4);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp4);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 51ad9f104421..7334cbfd6003 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -358,7 +358,7 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 		goto cleanup_udp;
 	}
 
-	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
+	ret = nf_conntrack_l3proto_register(&init_net, &nf_conntrack_l3proto_ipv6);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv6: can't register ipv6\n");
 		goto cleanup_icmpv6;
@@ -374,7 +374,7 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 	return ret;
 
  cleanup_ipv6:
-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv6);
  cleanup_icmpv6:
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmpv6);
  cleanup_udp:
@@ -388,7 +388,7 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
 {
 	synchronize_net();
 	nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv6);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmpv6);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp6);
 	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 7ee31ac0a12c..a8daf0faadb7 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -168,31 +168,57 @@ static int kill_l4proto(struct nf_conn *i, void *data)
 	       nf_ct_l3num(i) == l4proto->l3proto;
 }
 
-static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
+static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
+					   struct nf_conntrack_l3proto *l3proto)
+{
+	if (l3proto->l3proto == PF_INET)
+		return &net->ct.nf_ct_proto;
+	else
+		return NULL;
+}
+
+static int nf_ct_l3proto_register_sysctl(struct net *net,
+					 struct nf_conntrack_l3proto *l3proto)
 {
 	int err = 0;
+	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+	/* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
+	if (in == NULL)
+		return 0;
 
-#ifdef CONFIG_SYSCTL
-	if (l3proto->ctl_table != NULL) {
-		err = nf_ct_register_sysctl(&init_net,
-					    &l3proto->ctl_table_header,
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	if (in->ctl_table != NULL) {
+		err = nf_ct_register_sysctl(net,
+					    &in->ctl_table_header,
 					    l3proto->ctl_table_path,
-					    l3proto->ctl_table, NULL);
+					    in->ctl_table,
+					    NULL);
+		if (err < 0) {
+			kfree(in->ctl_table);
+			in->ctl_table = NULL;
+		}
 	}
 #endif
 	return err;
 }
 
-static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto)
+static void nf_ct_l3proto_unregister_sysctl(struct net *net,
+					    struct nf_conntrack_l3proto *l3proto)
 {
-#ifdef CONFIG_SYSCTL
-	if (l3proto->ctl_table_header != NULL)
-		nf_ct_unregister_sysctl(&l3proto->ctl_table_header,
-					&l3proto->ctl_table, NULL);
+	struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+
+	if (in == NULL)
+		return;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	if (in->ctl_table_header != NULL)
+		nf_ct_unregister_sysctl(&in->ctl_table_header,
+					&in->ctl_table,
+					NULL);
 #endif
 }
 
-int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+static int
+nf_conntrack_l3proto_register_net(struct nf_conntrack_l3proto *proto)
 {
 	int ret = 0;
 	struct nf_conntrack_l3proto *old;
@@ -211,10 +237,6 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
 		goto out_unlock;
 	}
 
-	ret = nf_ct_l3proto_register_sysctl(proto);
-	if (ret < 0)
-		goto out_unlock;
-
 	if (proto->nlattr_tuple_size)
 		proto->nla_size = 3 * proto->nlattr_tuple_size();
 
@@ -223,13 +245,32 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
 out_unlock:
 	mutex_unlock(&nf_ct_proto_mutex);
 	return ret;
+
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
 
-void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+int nf_conntrack_l3proto_register(struct net *net,
+				  struct nf_conntrack_l3proto *proto)
 {
-	struct net *net;
+	int ret = 0;
+
+	if (net == &init_net)
+		ret = nf_conntrack_l3proto_register_net(proto);
 
+	if (ret < 0)
+		return ret;
+
+	if (proto->init_net) {
+		ret = proto->init_net(net);
+		if (ret < 0)
+			return ret;
+	}
+	return nf_ct_l3proto_register_sysctl(net, proto);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
+
+static void
+nf_conntrack_l3proto_unregister_net(struct nf_conntrack_l3proto *proto)
+{
 	BUG_ON(proto->l3proto >= AF_MAX);
 
 	mutex_lock(&nf_ct_proto_mutex);
@@ -238,15 +279,22 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
 					 ) != proto);
 	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
 			   &nf_conntrack_l3proto_generic);
-	nf_ct_l3proto_unregister_sysctl(proto);
 	mutex_unlock(&nf_ct_proto_mutex);
 
 	synchronize_rcu();
+}
+
+void nf_conntrack_l3proto_unregister(struct net *net,
+				     struct nf_conntrack_l3proto *proto)
+{
+	if (net == &init_net)
+		nf_conntrack_l3proto_unregister_net(proto);
+
+	nf_ct_l3proto_unregister_sysctl(net, proto);
 
 	/* Remove all contrack entries for this protocol */
 	rtnl_lock();
-	for_each_net(net)
-		nf_ct_iterate_cleanup(net, kill_l3proto, proto);
+	nf_ct_iterate_cleanup(net, kill_l3proto, proto);
 	rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
-- 
cgit v1.2.3


From 4b626b9c5d35b4f99b073dc5d6457abddcbcf429 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Mon, 28 May 2012 21:04:14 +0000
Subject: netfilter: nf_ct_icmp: add namespace support

This patch adds namespace support for ICMP protocol tracker.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h                |  6 +++++
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 38 +++++++++++++++++++++++++---
 net/netfilter/nf_conntrack_proto.c           |  2 ++
 3 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 7bd14ab8ce1c..3d8e9e3b08a6 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -45,10 +45,16 @@ struct nf_udp_net {
 	unsigned int timeouts[UDP_CT_MAX];
 };
 
+struct nf_icmp_net {
+	struct nf_proto_net pn;
+	unsigned int timeout;
+};
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
 	struct nf_udp_net	udp;
+	struct nf_icmp_net	icmp;
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
 	struct ctl_table_header *ctl_table_header;
 	struct ctl_table	*ctl_table;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 0847e373d33c..a0eabeb36b9f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,6 +23,11 @@
 
 static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
 
+static inline struct nf_icmp_net *icmp_pernet(struct net *net)
+{
+	return &net->ct.nf_ct_proto.icmp;
+}
+
 static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			      struct nf_conntrack_tuple *tuple)
 {
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s,
 
 static unsigned int *icmp_get_timeouts(struct net *net)
 {
-	return &nf_ct_icmp_timeout;
+	return &icmp_pernet(net)->timeout;
 }
 
 /* Returns verdict for packet, or -1 for invalid. */
@@ -312,7 +317,6 @@ static struct ctl_table_header *icmp_sysctl_header;
 static struct ctl_table icmp_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_icmp_timeout",
-		.data		= &nf_ct_icmp_timeout,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -323,7 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = {
 static struct ctl_table icmp_compat_sysctl_table[] = {
 	{
 		.procname	= "ip_conntrack_icmp_timeout",
-		.data		= &nf_ct_icmp_timeout,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -333,6 +336,34 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
+static int icmp_init_net(struct net *net)
+{
+	struct nf_icmp_net *in = icmp_pernet(net);
+	struct nf_proto_net *pn = (struct nf_proto_net *)in;
+	in->timeout = nf_ct_icmp_timeout;
+
+#ifdef CONFIG_SYSCTL
+	pn->ctl_table = kmemdup(icmp_sysctl_table,
+				sizeof(icmp_sysctl_table),
+				GFP_KERNEL);
+	if (!pn->ctl_table)
+		return -ENOMEM;
+	pn->ctl_table[0].data = &in->timeout;
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
+				       sizeof(icmp_compat_sysctl_table),
+				       GFP_KERNEL);
+	if (!pn->ctl_compat_table) {
+		kfree(pn->ctl_table);
+		pn->ctl_table = NULL;
+		return -ENOMEM;
+	}
+	pn->ctl_compat_table[0].data = &in->timeout;
+#endif
+#endif
+	return 0;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -369,4 +400,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 	.ctl_compat_table	= icmp_compat_sysctl_table,
 #endif
 #endif
+	.init_net		= icmp_init_net,
 };
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 9c6aee51dea2..dbade5f2b1d3 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -307,6 +307,8 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
 		return (struct nf_proto_net *)&net->ct.nf_ct_proto.tcp;
 	case IPPROTO_UDP:
 		return (struct nf_proto_net *)&net->ct.nf_ct_proto.udp;
+	case IPPROTO_ICMP:
+		return (struct nf_proto_net *)&net->ct.nf_ct_proto.icmp;
 	case 255: /* l4proto_generic */
 		return (struct nf_proto_net *)&net->ct.nf_ct_proto.generic;
 	default:
-- 
cgit v1.2.3


From 3ea04dd3a78916db9186a602b6ce974d36a33fbb Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Mon, 28 May 2012 21:04:16 +0000
Subject: netfilter: nf_ct_ipv4: add namespace support

This patch adds namespace support for IPv4 protocol tracker.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 123 +++++++++++++++++--------
 1 file changed, 85 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0c0fb906c19d..5c66203af51c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -207,35 +207,30 @@ static int log_invalid_proto_max = 255;
 static ctl_table ip_ct_sysctl_table[] = {
 	{
 		.procname	= "ip_conntrack_max",
-		.data		= &nf_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "ip_conntrack_count",
-		.data		= &init_net.ct.count,
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "ip_conntrack_buckets",
-		.data		= &init_net.ct.htable_size,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "ip_conntrack_checksum",
-		.data		= &init_net.ct.sysctl_checksum,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
 	{
 		.procname	= "ip_conntrack_log_invalid",
-		.data		= &init_net.ct.sysctl_log_invalid,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -351,6 +346,25 @@ static struct nf_sockopt_ops so_getorigdst = {
 	.owner		= THIS_MODULE,
 };
 
+static int ipv4_init_net(struct net *net)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	struct nf_ip_net *in = &net->ct.nf_ct_proto;
+	in->ctl_table = kmemdup(ip_ct_sysctl_table,
+				sizeof(ip_ct_sysctl_table),
+				GFP_KERNEL);
+	if (!in->ctl_table)
+		return -ENOMEM;
+
+	in->ctl_table[0].data = &nf_conntrack_max;
+	in->ctl_table[1].data = &net->ct.count;
+	in->ctl_table[2].data = &net->ct.htable_size;
+	in->ctl_table[3].data = &net->ct.sysctl_checksum;
+	in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
+#endif
+	return 0;
+}
+
 struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
 	.l3proto	 = PF_INET,
 	.name		 = "ipv4",
@@ -368,6 +382,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
 	.ctl_table_path  = "net/ipv4/netfilter",
 	.ctl_table	 = ip_ct_sysctl_table,
 #endif
+	.init_net	 = ipv4_init_net,
 	.me		 = THIS_MODULE,
 };
 
@@ -378,6 +393,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
 MODULE_ALIAS("ip_conntrack");
 MODULE_LICENSE("GPL");
 
+static int ipv4_net_init(struct net *net)
+{
+	int ret = 0;
+
+	ret = nf_conntrack_l4proto_register(net,
+					    &nf_conntrack_l4proto_tcp4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
+		goto out_tcp;
+	}
+	ret = nf_conntrack_l4proto_register(net,
+					    &nf_conntrack_l4proto_udp4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
+		goto out_udp;
+	}
+	ret = nf_conntrack_l4proto_register(net,
+					    &nf_conntrack_l4proto_icmp);
+	if (ret < 0) {
+		pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
+		goto out_icmp;
+	}
+	ret = nf_conntrack_l3proto_register(net,
+					    &nf_conntrack_l3proto_ipv4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
+		goto out_ipv4;
+	}
+	return 0;
+out_ipv4:
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_icmp);
+out_icmp:
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_udp4);
+out_udp:
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_tcp4);
+out_tcp:
+	return ret;
+}
+
+static void ipv4_net_exit(struct net *net)
+{
+	nf_conntrack_l3proto_unregister(net,
+					&nf_conntrack_l3proto_ipv4);
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_icmp);
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_udp4);
+	nf_conntrack_l4proto_unregister(net,
+					&nf_conntrack_l4proto_tcp4);
+}
+
+static struct pernet_operations ipv4_net_ops = {
+	.init = ipv4_net_init,
+	.exit = ipv4_net_exit,
+};
+
 static int __init nf_conntrack_l3proto_ipv4_init(void)
 {
 	int ret = 0;
@@ -391,35 +465,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 		return ret;
 	}
 
-	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_tcp4);
+	ret = register_pernet_subsys(&ipv4_net_ops);
 	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register tcp.\n");
+		pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
 		goto cleanup_sockopt;
 	}
 
-	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_udp4);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register udp.\n");
-		goto cleanup_tcp;
-	}
-
-	ret = nf_conntrack_l4proto_register(&init_net, &nf_conntrack_l4proto_icmp);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register icmp.\n");
-		goto cleanup_udp;
-	}
-
-	ret = nf_conntrack_l3proto_register(&init_net, &nf_conntrack_l3proto_ipv4);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register ipv4\n");
-		goto cleanup_icmp;
-	}
-
 	ret = nf_register_hooks(ipv4_conntrack_ops,
 				ARRAY_SIZE(ipv4_conntrack_ops));
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv4: can't register hooks.\n");
-		goto cleanup_ipv4;
+		goto cleanup_pernet;
 	}
 #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
 	ret = nf_conntrack_ipv4_compat_init();
@@ -431,14 +487,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
  cleanup_hooks:
 	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
 #endif
- cleanup_ipv4:
-	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv4);
- cleanup_icmp:
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
- cleanup_udp:
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp4);
- cleanup_tcp:
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp4);
+ cleanup_pernet:
+	unregister_pernet_subsys(&ipv4_net_ops);
  cleanup_sockopt:
 	nf_unregister_sockopt(&so_getorigdst);
 	return ret;
@@ -451,10 +501,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
 	nf_conntrack_ipv4_compat_fini();
 #endif
 	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-	nf_conntrack_l3proto_unregister(&init_net, &nf_conntrack_l3proto_ipv4);
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_icmp);
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_udp4);
-	nf_conntrack_l4proto_unregister(&init_net, &nf_conntrack_l4proto_tcp4);
+	unregister_pernet_subsys(&ipv4_net_ops);
 	nf_unregister_sockopt(&so_getorigdst);
 }
 
-- 
cgit v1.2.3


From e76d0af5e45f4152e3fdcc103b753a8aff93fcb5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 5 Jun 2012 18:35:48 +0200
Subject: netfilter: nf_conntrack: remove now unused sysctl for
 nf_conntrack_l[3|4]proto

Since the sysctl data for l[3|4]proto now resides in pernet nf_proto_net.
We can now remove this unused fields from struct nf_contrack_l[3,4]proto.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h   |  2 --
 include/net/netfilter/nf_conntrack_l4proto.h   | 10 ----------
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  1 -
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  8 --------
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  5 -----
 net/netfilter/nf_conntrack_proto_generic.c     |  8 --------
 net/netfilter/nf_conntrack_proto_sctp.c        | 15 ---------------
 net/netfilter/nf_conntrack_proto_tcp.c         | 15 ---------------
 net/netfilter/nf_conntrack_proto_udp.c         | 15 ---------------
 net/netfilter/nf_conntrack_proto_udplite.c     | 12 ------------
 10 files changed, 91 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index d6df8c71a7fe..6f7c13f4ac03 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -64,9 +64,7 @@ struct nf_conntrack_l3proto {
 	size_t nla_size;
 
 #ifdef CONFIG_SYSCTL
-	struct ctl_table_header	*ctl_table_header;
 	const char		*ctl_table_path;
-	struct ctl_table	*ctl_table;
 #endif /* CONFIG_SYSCTL */
 
 	/* Init l3proto pernet data */
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index d621c91de5c8..cfa2f89b031d 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -93,16 +93,6 @@ struct nf_conntrack_l4proto {
 		unsigned int nlattr_max;
 		const struct nla_policy *nla_policy;
 	} ctnl_timeout;
-#endif
-
-#ifdef CONFIG_SYSCTL
-	struct ctl_table_header	**ctl_table_header;
-	struct ctl_table	*ctl_table;
-	unsigned int		*ctl_table_users;
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	struct ctl_table_header	*ctl_compat_table_header;
-	struct ctl_table	*ctl_compat_table;
-#endif
 #endif
 	int	*net_id;
 	/* Init l4proto pernet data */
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5c66203af51c..d79b961a8009 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -380,7 +380,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
 #endif
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
 	.ctl_table_path  = "net/ipv4/netfilter",
-	.ctl_table	 = ip_ct_sysctl_table,
 #endif
 	.init_net	 = ipv4_init_net,
 	.me		 = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a0eabeb36b9f..2bca7a5e422b 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -313,7 +313,6 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmp_sysctl_header;
 static struct ctl_table icmp_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_icmp_timeout",
@@ -393,12 +392,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 		.nla_policy	= icmp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_header	= &icmp_sysctl_header,
-	.ctl_table		= icmp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	.ctl_compat_table	= icmp_compat_sysctl_table,
-#endif
-#endif
 	.init_net		= icmp_init_net,
 };
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index f606355200d8..1b7818f15f3d 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -320,7 +320,6 @@ icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmpv6_sysctl_header;
 static struct ctl_table icmpv6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_icmpv6_timeout",
@@ -375,9 +374,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
 		.nla_policy	= icmpv6_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_header	= &icmpv6_sysctl_header,
-	.ctl_table		= icmpv6_sysctl_table,
-#endif
 	.init_net		= icmpv6_init_net,
 };
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 19bc880eb4e2..e4e2d2a38d3f 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -111,7 +111,6 @@ generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *generic_sysctl_header;
 static struct ctl_table generic_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_generic_timeout",
@@ -182,12 +181,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
 		.nla_policy	= generic_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_header	= &generic_sysctl_header,
-	.ctl_table		= generic_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	.ctl_compat_table	= generic_compat_sysctl_table,
-#endif
-#endif
 	.init_net		= generic_init_net,
 };
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 9e5738db34df..d785f2c4182b 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -610,8 +610,6 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
 
 
 #ifdef CONFIG_SYSCTL
-static unsigned int sctp_sysctl_table_users;
-static struct ctl_table_header *sctp_sysctl_header;
 static struct ctl_table sctp_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_sctp_timeout_closed",
@@ -832,14 +830,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 		.nla_policy	= sctp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &sctp_sysctl_table_users,
-	.ctl_table_header	= &sctp_sysctl_header,
-	.ctl_table		= sctp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	.ctl_compat_table	= sctp_compat_sysctl_table,
-#endif
-#endif
 	.net_id			= &sctp_net_id,
 	.init_net		= sctpv4_init_net,
 };
@@ -873,11 +863,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
 		.nla_policy	= sctp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#endif
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &sctp_sysctl_table_users,
-	.ctl_table_header	= &sctp_sysctl_header,
-	.ctl_table		= sctp_sysctl_table,
 #endif
 	.net_id			= &sctp_net_id,
 	.init_net		= sctpv6_init_net,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index a053f6786b3c..e57f2a888dae 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1364,8 +1364,6 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static unsigned int tcp_sysctl_table_users;
-static struct ctl_table_header *tcp_sysctl_header;
 static struct ctl_table tcp_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_tcp_timeout_syn_sent",
@@ -1684,14 +1682,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &tcp_sysctl_table_users,
-	.ctl_table_header	= &tcp_sysctl_header,
-	.ctl_table		= tcp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	.ctl_compat_table	= tcp_compat_sysctl_table,
-#endif
-#endif
 	.init_net		= tcpv4_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
@@ -1728,11 +1718,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &tcp_sysctl_table_users,
-	.ctl_table_header	= &tcp_sysctl_header,
-	.ctl_table		= tcp_sysctl_table,
-#endif
 	.init_net		= tcpv6_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index f56c8905ddfb..db7abad44bc5 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -199,8 +199,6 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static unsigned int udp_sysctl_table_users;
-static struct ctl_table_header *udp_sysctl_header;
 static struct ctl_table udp_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_udp_timeout",
@@ -343,14 +341,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 		.nla_policy	= udp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &udp_sysctl_table_users,
-	.ctl_table_header	= &udp_sysctl_header,
-	.ctl_table		= udp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	.ctl_compat_table	= udp_compat_sysctl_table,
-#endif
-#endif
 	.init_net		= udpv4_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
@@ -382,11 +372,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 		.nla_policy	= udp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &udp_sysctl_table_users,
-	.ctl_table_header	= &udp_sysctl_header,
-	.ctl_table		= udp_sysctl_table,
-#endif
 	.init_net		= udpv6_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 7f85b0850a44..2e25e985e8cf 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -215,8 +215,6 @@ udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
 #ifdef CONFIG_SYSCTL
-static unsigned int udplite_sysctl_table_users;
-static struct ctl_table_header *udplite_sysctl_header;
 static struct ctl_table udplite_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_udplite_timeout",
@@ -287,11 +285,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
 		.nla_policy	= udplite_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &udplite_sysctl_table_users,
-	.ctl_table_header	= &udplite_sysctl_header,
-	.ctl_table		= udplite_sysctl_table,
-#endif
 	.net_id			= &udplite_net_id,
 	.init_net		= udplite_init_net,
 };
@@ -324,11 +317,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
 		.nla_policy	= udplite_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-	.ctl_table_users	= &udplite_sysctl_table_users,
-	.ctl_table_header	= &udplite_sysctl_header,
-	.ctl_table		= udplite_sysctl_table,
-#endif
 	.net_id			= &udplite_net_id,
 	.init_net		= udplite_init_net,
 };
-- 
cgit v1.2.3


From 8264deb81853462da5cbcfb19b54c4fd9f3d88ba Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Mon, 28 May 2012 21:04:23 +0000
Subject: netfilter: nf_conntrack: add namespace support for cttimeout

This patch adds namespace support for cttimeout.

Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h   |  3 ++-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  6 ++++--
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  6 ++++--
 net/netfilter/nf_conntrack_proto_dccp.c        |  5 +++--
 net/netfilter/nf_conntrack_proto_generic.c     |  6 ++++--
 net/netfilter/nf_conntrack_proto_gre.c         |  8 +++++---
 net/netfilter/nf_conntrack_proto_sctp.c        |  6 ++++--
 net/netfilter/nf_conntrack_proto_tcp.c         |  6 ++++--
 net/netfilter/nf_conntrack_proto_udp.c         |  8 +++++---
 net/netfilter/nf_conntrack_proto_udplite.c     |  8 +++++---
 net/netfilter/nfnetlink_cttimeout.c            | 13 ++++++++-----
 11 files changed, 48 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index cfa2f89b031d..81c52b5205f2 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -87,7 +87,8 @@ struct nf_conntrack_l4proto {
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
 	struct {
 		size_t obj_size;
-		int (*nlattr_to_obj)(struct nlattr *tb[], void *data);
+		int (*nlattr_to_obj)(struct nlattr *tb[],
+				     struct net *net, void *data);
 		int (*obj_to_nlattr)(struct sk_buff *skb, const void *data);
 
 		unsigned int nlattr_max;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 2bca7a5e422b..041923cb67ad 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -279,16 +279,18 @@ static int icmp_nlattr_tuple_size(void)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				      struct net *net, void *data)
 {
 	unsigned int *timeout = data;
+	struct nf_icmp_net *in = icmp_pernet(net);
 
 	if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
 		*timeout =
 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
 	} else {
 		/* Set default ICMP timeout. */
-		*timeout = nf_ct_icmp_timeout;
+		*timeout = in->timeout;
 	}
 	return 0;
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 1b7818f15f3d..63ed0121836c 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -286,16 +286,18 @@ static int icmpv6_nlattr_tuple_size(void)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
+					struct net *net, void *data)
 {
 	unsigned int *timeout = data;
+	struct nf_icmp_net *in = icmpv6_pernet(net);
 
 	if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
 		*timeout =
 		    ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
 	} else {
 		/* Set default ICMPv6 timeout. */
-		*timeout = nf_ct_icmpv6_timeout;
+		*timeout = in->timeout;
 	}
 	return 0;
 }
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 8d798a613e3f..c33f76af913f 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -712,9 +712,10 @@ static int dccp_nlattr_size(void)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				      struct net *net, void *data)
 {
-	struct dccp_net *dn = dccp_pernet(&init_net);
+	struct dccp_net *dn = dccp_pernet(net);
 	unsigned int *timeouts = data;
 	int i;
 
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e4e2d2a38d3f..bb0e74fe0fae 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -75,16 +75,18 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
+					 struct net *net, void *data)
 {
 	unsigned int *timeout = data;
+	struct nf_generic_net *gn = generic_pernet(net);
 
 	if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
 		*timeout =
 		    ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ;
 	else {
 		/* Set default generic timeout. */
-		*timeout = nf_ct_generic_timeout;
+		*timeout = gn->timeout;
 	}
 
 	return 0;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index e36973f9ef59..25ba5a2f5edc 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -304,13 +304,15 @@ static void gre_destroy(struct nf_conn *ct)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
+				     struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
+	struct netns_proto_gre *net_gre = gre_pernet(net);
 
 	/* set default timeouts for GRE. */
-	timeouts[GRE_CT_UNREPLIED] = gre_timeouts[GRE_CT_UNREPLIED];
-	timeouts[GRE_CT_REPLIED] = gre_timeouts[GRE_CT_REPLIED];
+	timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
+	timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
 
 	if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) {
 		timeouts[GRE_CT_UNREPLIED] =
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index d785f2c4182b..8fb0582ad397 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -562,14 +562,16 @@ static int sctp_nlattr_size(void)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				      struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
+	struct sctp_net *sn = sctp_pernet(net);
 	int i;
 
 	/* set default SCTP timeouts. */
 	for (i=0; i<SCTP_CONNTRACK_MAX; i++)
-		timeouts[i] = sctp_timeouts[i];
+		timeouts[i] = sn->timeouts[i];
 
 	/* there's a 1:1 mapping between attributes and protocol states. */
 	for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index e57f2a888dae..1cff854ccb88 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1260,14 +1260,16 @@ static int tcp_nlattr_tuple_size(void)
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				     struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
+	struct nf_tcp_net *tn = tcp_pernet(net);
 	int i;
 
 	/* set default TCP timeouts. */
 	for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
-		timeouts[i] = tcp_timeouts[i];
+		timeouts[i] = tn->timeouts[i];
 
 	if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
 		timeouts[TCP_CONNTRACK_SYN_SENT] =
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index db7abad44bc5..360565a95de4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -156,13 +156,15 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
+				     struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
+	struct nf_udp_net *un = udp_pernet(net);
 
 	/* set default timeouts for UDP. */
-	timeouts[UDP_CT_UNREPLIED] = udp_timeouts[UDP_CT_UNREPLIED];
-	timeouts[UDP_CT_REPLIED] = udp_timeouts[UDP_CT_REPLIED];
+	timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
+	timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
 
 	if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) {
 		timeouts[UDP_CT_UNREPLIED] =
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 2e25e985e8cf..b32e700f8dde 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -172,13 +172,15 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_cttimeout.h>
 
-static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
+					 struct net *net, void *data)
 {
 	unsigned int *timeouts = data;
+	struct udplite_net *un = udplite_pernet(net);
 
 	/* set default timeouts for UDPlite. */
-	timeouts[UDPLITE_CT_UNREPLIED] = udplite_timeouts[UDPLITE_CT_UNREPLIED];
-	timeouts[UDPLITE_CT_REPLIED] = udplite_timeouts[UDPLITE_CT_REPLIED];
+	timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
+	timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
 
 	if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
 		timeouts[UDPLITE_CT_UNREPLIED] =
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 3e655288d1d6..cdecbc8fe965 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -49,8 +49,9 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
 
 static int
 ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
-			       struct nf_conntrack_l4proto *l4proto,
-			       const struct nlattr *attr)
+			  struct nf_conntrack_l4proto *l4proto,
+			  struct net *net,
+			  const struct nlattr *attr)
 {
 	int ret = 0;
 
@@ -60,7 +61,8 @@ ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
 		nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
 				 attr, l4proto->ctnl_timeout.nla_policy);
 
-		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, &timeout->data);
+		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net,
+							  &timeout->data);
 	}
 	return ret;
 }
@@ -74,6 +76,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
 	__u8 l4num;
 	struct nf_conntrack_l4proto *l4proto;
 	struct ctnl_timeout *timeout, *matching = NULL;
+	struct net *net = sock_net(skb->sk);
 	char *name;
 	int ret;
 
@@ -117,7 +120,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
 				goto err_proto_put;
 			}
 
-			ret = ctnl_timeout_parse_policy(matching, l4proto,
+			ret = ctnl_timeout_parse_policy(matching, l4proto, net,
 							cda[CTA_TIMEOUT_DATA]);
 			return ret;
 		}
@@ -132,7 +135,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
 		goto err_proto_put;
 	}
 
-	ret = ctnl_timeout_parse_policy(timeout, l4proto,
+	ret = ctnl_timeout_parse_policy(timeout, l4proto, net,
 					cda[CTA_TIMEOUT_DATA]);
 	if (ret < 0)
 		goto err;
-- 
cgit v1.2.3


From 89a48e35f54aebe83133d5ce450f0ced6551b5b8 Mon Sep 17 00:00:00 2001
From: Alban Crequy <alban.crequy@collabora.co.uk>
Date: Mon, 14 May 2012 03:56:37 +0000
Subject: netfilter: ipv4, defrag: switch hook PFs to nfproto

This patch is a cleanup. Use NFPROTO_* for consistency with other
netfilter code.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
Reviewed-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Reviewed-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
---
 net/ipv4/netfilter/nf_defrag_ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9bb1b8a37a22..742815518b0f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
 	{
 		.hook		= ipv4_conntrack_defrag,
 		.owner		= THIS_MODULE,
-		.pf		= PF_INET,
+		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
 	},
 	{
 		.hook           = ipv4_conntrack_defrag,
 		.owner          = THIS_MODULE,
-		.pf             = PF_INET,
+		.pf             = NFPROTO_IPV4,
 		.hooknum        = NF_INET_LOCAL_OUT,
 		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
 	},
-- 
cgit v1.2.3


From c8a627ed06d6d49bf65015a2185c519335c4c83f Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Fri, 8 Jun 2012 01:20:41 +0000
Subject: inetpeer: add namespace support for inetpeer

now inetpeer doesn't support namespace,the information will
be leaking across namespace.

this patch move the global vars v4_peers and v6_peers to
netns_ipv4 and netns_ipv6 as a field peers.

add struct pernet_operations inetpeer_ops to initial pernet
inetpeer data.

and change family_to_base and inet_getpeer to support namespace.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h   | 10 ++++---
 include/net/netns/ipv4.h |  2 +-
 include/net/netns/ipv6.h |  1 +
 net/ipv4/inetpeer.c      | 68 +++++++++++++++++++++++++++++++++++-------------
 net/ipv4/route.c         |  2 +-
 5 files changed, 59 insertions(+), 24 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 2040bff945d4..fef9dfab5d45 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -75,7 +75,9 @@ static inline bool inet_metrics_new(const struct inet_peer *p)
 }
 
 /* can be called with or without local BH being disabled */
-struct inet_peer	*inet_getpeer(const struct inetpeer_addr *daddr, int create);
+struct inet_peer *inet_getpeer(struct net *net,
+			       const struct inetpeer_addr *daddr,
+			       int create);
 
 static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
 {
@@ -83,7 +85,7 @@ static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
 
 	daddr.addr.a4 = v4daddr;
 	daddr.family = AF_INET;
-	return inet_getpeer(&daddr, create);
+	return inet_getpeer(&init_net, &daddr, create);
 }
 
 static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr, int create)
@@ -92,14 +94,14 @@ static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr,
 
 	*(struct in6_addr *)daddr.addr.a6 = *v6daddr;
 	daddr.family = AF_INET6;
-	return inet_getpeer(&daddr, create);
+	return inet_getpeer(&init_net, &daddr, create);
 }
 
 /* can be called from BH context or outside */
 extern void inet_putpeer(struct inet_peer *p);
 extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
-extern void inetpeer_invalidate_tree(int family);
+extern void inetpeer_invalidate_tree(struct net *net, int family);
 
 /*
  * temporary check to make sure we dont access rid, ip_id_count, tcp_ts,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index bbd023a1c9b9..227f0cd9d3f6 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -30,7 +30,7 @@ struct netns_ipv4 {
 
 	struct sock		**icmp_sk;
 	struct sock		*tcp_sock;
-
+	struct inet_peer_base	*peers;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b42be53587ba..df0a5456a3fd 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -33,6 +33,7 @@ struct netns_ipv6 {
 	struct netns_sysctl_ipv6 sysctl;
 	struct ipv6_devconf	*devconf_all;
 	struct ipv6_devconf	*devconf_dflt;
+	struct inet_peer_base	*peers;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*ip6table_filter;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index dfba343b2509..1c8527349c86 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -88,18 +88,6 @@ struct inet_peer_base {
 	int		total;
 };
 
-static struct inet_peer_base v4_peers = {
-	.root		= peer_avl_empty_rcu,
-	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
-	.total		= 0,
-};
-
-static struct inet_peer_base v6_peers = {
-	.root		= peer_avl_empty_rcu,
-	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
-	.total		= 0,
-};
-
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
 /* Exported for sysctl_net_ipv4.  */
@@ -153,6 +141,46 @@ static void inetpeer_gc_worker(struct work_struct *work)
 	schedule_delayed_work(&gc_work, gc_delay);
 }
 
+static int __net_init inetpeer_net_init(struct net *net)
+{
+	net->ipv4.peers = kzalloc(sizeof(struct inet_peer_base),
+				  GFP_KERNEL);
+	if (net->ipv4.peers == NULL)
+		return -ENOMEM;
+
+	net->ipv4.peers->root = peer_avl_empty_rcu;
+	seqlock_init(&net->ipv4.peers->lock);
+
+	net->ipv6.peers = kzalloc(sizeof(struct inet_peer_base),
+				  GFP_KERNEL);
+	if (net->ipv6.peers == NULL)
+		goto out_ipv6;
+
+	net->ipv6.peers->root = peer_avl_empty_rcu;
+	seqlock_init(&net->ipv6.peers->lock);
+
+	return 0;
+out_ipv6:
+	kfree(net->ipv4.peers);
+	return -ENOMEM;
+}
+
+static void __net_exit inetpeer_net_exit(struct net *net)
+{
+	inetpeer_invalidate_tree(net, AF_INET);
+	kfree(net->ipv4.peers);
+	net->ipv4.peers = NULL;
+
+	inetpeer_invalidate_tree(net, AF_INET6);
+	kfree(net->ipv6.peers);
+	net->ipv6.peers = NULL;
+}
+
+static struct pernet_operations inetpeer_ops = {
+	.init = inetpeer_net_init,
+	.exit = inetpeer_net_exit,
+};
+
 /* Called from ip_output.c:ip_init  */
 void __init inet_initpeers(void)
 {
@@ -177,6 +205,7 @@ void __init inet_initpeers(void)
 			NULL);
 
 	INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
+	register_pernet_subsys(&inetpeer_ops);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -401,9 +430,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
 	call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
-static struct inet_peer_base *family_to_base(int family)
+static struct inet_peer_base *family_to_base(struct net *net,
+					     int family)
 {
-	return family == AF_INET ? &v4_peers : &v6_peers;
+	return family == AF_INET ? net->ipv4.peers : net->ipv6.peers;
 }
 
 /* perform garbage collect on all items stacked during a lookup */
@@ -443,10 +473,12 @@ static int inet_peer_gc(struct inet_peer_base *base,
 	return cnt;
 }
 
-struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(struct net *net,
+			       const struct inetpeer_addr *daddr,
+			       int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
-	struct inet_peer_base *base = family_to_base(daddr->family);
+	struct inet_peer_base *base = family_to_base(net, daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
 	int invalidated, gccnt = 0;
@@ -571,10 +603,10 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
 	schedule_delayed_work(&gc_work, gc_delay);
 }
 
-void inetpeer_invalidate_tree(int family)
+void inetpeer_invalidate_tree(struct net *net, int family)
 {
 	struct inet_peer *old, *new, *prev;
-	struct inet_peer_base *base = family_to_base(family);
+	struct inet_peer_base *base = family_to_base(net, family);
 
 	write_seqlock_bh(&base->lock);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 98b30d08efe9..006c21cc2324 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -938,7 +938,7 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	inetpeer_invalidate_tree(AF_INET);
+	inetpeer_invalidate_tree(net, AF_INET);
 }
 
 /*
-- 
cgit v1.2.3


From 54db0cc2ba0d38166acc2d6bae21721405305537 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Fri, 8 Jun 2012 01:21:40 +0000
Subject: inetpeer: add parameter net for inet_getpeer_v4,v6

add struct net as a parameter of inet_getpeer_v[4,6],
use net to replace &init_net.

and modify some places to provide net for inet_getpeer_v[4,6]

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h   | 12 ++++++++----
 net/ipv4/inet_fragment.c |  2 +-
 net/ipv4/ip_fragment.c   |  6 +++++-
 net/ipv4/route.c         |  8 +++++---
 net/ipv4/tcp_ipv4.c      |  6 ++++--
 net/ipv6/route.c         |  3 ++-
 net/ipv6/tcp_ipv6.c      |  6 ++++--
 7 files changed, 29 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index fef9dfab5d45..20e67db69ac9 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -79,22 +79,26 @@ struct inet_peer *inet_getpeer(struct net *net,
 			       const struct inetpeer_addr *daddr,
 			       int create);
 
-static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create)
+static inline struct inet_peer *inet_getpeer_v4(struct net *net,
+						__be32 v4daddr,
+						int create)
 {
 	struct inetpeer_addr daddr;
 
 	daddr.addr.a4 = v4daddr;
 	daddr.family = AF_INET;
-	return inet_getpeer(&init_net, &daddr, create);
+	return inet_getpeer(net, &daddr, create);
 }
 
-static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr, int create)
+static inline struct inet_peer *inet_getpeer_v6(struct net *net,
+						const struct in6_addr *v6daddr,
+						int create)
 {
 	struct inetpeer_addr daddr;
 
 	*(struct in6_addr *)daddr.addr.a6 = *v6daddr;
 	daddr.family = AF_INET6;
-	return inet_getpeer(&init_net, &daddr, create);
+	return inet_getpeer(net, &daddr, create);
 }
 
 /* can be called from BH context or outside */
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	if (q == NULL)
 		return NULL;
 
+	q->net = nf;
 	f->constructor(q, arg);
 	atomic_add(f->qsize, &nf->mem);
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
 	atomic_set(&q->refcnt, 1);
-	q->net = nf;
 
 	return q;
 }
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dbd3dd6022d..22c6bab9717a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
 static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
+	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+					       frags);
+	struct net *net = container_of(ipv4, struct net, ipv4);
+
 	struct ip4_create_arg *arg = a;
 
 	qp->protocol = arg->iph->protocol;
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
 	qp->peer = sysctl_ipfrag_max_dist ?
-		inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+		inet_getpeer_v4(net, arg->iph->saddr, 1) : NULL;
 }
 
 static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 006c21cc2324..2c9f73f3b7cc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1328,9 +1328,10 @@ static u32 rt_peer_genid(void)
 
 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 {
+	struct net *net = dev_net(rt->dst.dev);
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v4(daddr, create);
+	peer = inet_getpeer_v4(net, daddr, create);
 
 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
@@ -1694,7 +1695,7 @@ unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
 	unsigned short est_mtu = 0;
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v4(iph->daddr, 1);
+	peer = inet_getpeer_v4(net, iph->daddr, 1);
 	if (peer) {
 		unsigned short mtu = new_mtu;
 
@@ -1935,6 +1936,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 			    struct fib_info *fi)
 {
+	struct net *net = dev_net(rt->dst.dev);
 	struct inet_peer *peer;
 	int create = 0;
 
@@ -1944,7 +1946,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
 		create = 1;
 
-	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
+	rt->peer = peer = inet_getpeer_v4(net, rt->rt_dst, create);
 	if (peer) {
 		rt->rt_peer_genid = rt_peer_genid();
 		if (inet_metrics_new(peer))
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3d9c1a4b8819..f485b451f928 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1824,11 +1824,12 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
 {
 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
 	struct inet_peer *peer;
 
 	if (!rt ||
 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
-		peer = inet_getpeer_v4(inet->inet_daddr, 1);
+		peer = inet_getpeer_v4(net, inet->inet_daddr, 1);
 		*release_it = true;
 	} else {
 		if (!rt->peer)
@@ -1844,8 +1845,9 @@ EXPORT_SYMBOL(tcp_v4_get_peer);
 void *tcp_v4_tw_get_peer(struct sock *sk)
 {
 	const struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct net *net = sock_net(sk);
 
-	return inet_getpeer_v4(tw->tw_daddr, 1);
+	return inet_getpeer_v4(net, tw->tw_daddr, 1);
 }
 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 999a982ad3fd..4eca0130cce7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -306,9 +306,10 @@ static u32 rt6_peer_genid(void)
 
 void rt6_bind_peer(struct rt6_info *rt, int create)
 {
+	struct net *net = dev_net(rt->dst.dev);
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
+	peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create);
 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
 	else
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 80758255556c..1a9cdd09f11c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1736,11 +1736,12 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it)
 {
 	struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
 	struct inet_peer *peer;
 
 	if (!rt ||
 	    !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) {
-		peer = inet_getpeer_v6(&np->daddr, 1);
+		peer = inet_getpeer_v6(net, &np->daddr, 1);
 		*release_it = true;
 	} else {
 		if (!rt->rt6i_peer)
@@ -1756,11 +1757,12 @@ static void *tcp_v6_tw_get_peer(struct sock *sk)
 {
 	const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 	const struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct net *net = sock_net(sk);
 
 	if (tw->tw_family == AF_INET)
 		return tcp_v4_tw_get_peer(sk);
 
-	return inet_getpeer_v6(&tw6->tw_v6_daddr, 1);
+	return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1);
 }
 
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
-- 
cgit v1.2.3


From fbfe95a42e90b3dd079cc9019ba7d7700feee0f6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 8 Jun 2012 23:24:18 -0700
Subject: inet: Create and use rt{,6}_get_peer_create().

There's a lot of places that open-code rt{,6}_get_peer() only because
they want to set 'create' to one.  So add an rt{,6}_get_peer_create()
for their sake.

There were also a few spots open-coding plain rt{,6}_get_peer() and
those are transformed here as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 17 +++++++++++++----
 include/net/route.h     | 14 ++++++++++++--
 net/ipv4/icmp.c         |  5 ++---
 net/ipv4/route.c        | 35 +++++++++--------------------------
 net/ipv4/tcp_ipv4.c     |  4 +---
 net/ipv6/icmp.c         |  6 +++---
 net/ipv6/ip6_output.c   | 11 ++++-------
 net/ipv6/ndisc.c        |  6 +++---
 net/ipv6/route.c        |  5 +----
 net/ipv6/tcp_ipv6.c     |  4 +---
 10 files changed, 49 insertions(+), 58 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 37c1a1ed82c1..73d750288121 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -53,18 +53,27 @@ static inline unsigned int rt6_flags2srcprefs(int flags)
 	return (flags >> 3) & 7;
 }
 
-extern void			rt6_bind_peer(struct rt6_info *rt,
-					      int create);
+extern void rt6_bind_peer(struct rt6_info *rt, int create);
 
-static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt)
+static inline struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
 {
 	if (rt->rt6i_peer)
 		return rt->rt6i_peer;
 
-	rt6_bind_peer(rt, 0);
+	rt6_bind_peer(rt, create);
 	return rt->rt6i_peer;
 }
 
+static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt)
+{
+	return __rt6_get_peer(rt, 0);
+}
+
+static inline struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+{
+	return __rt6_get_peer(rt, 1);
+}
+
 extern void			ip6_route_input(struct sk_buff *skb);
 
 extern struct dst_entry *	ip6_route_output(struct net *net,
diff --git a/include/net/route.h b/include/net/route.h
index ed2b78e2375d..433fc6c1d404 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -296,15 +296,25 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 
 extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
 
-static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
+static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
 {
 	if (rt->peer)
 		return rt->peer;
 
-	rt_bind_peer(rt, daddr, 0);
+	rt_bind_peer(rt, daddr, create);
 	return rt->peer;
 }
 
+static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
+{
+	return __rt_get_peer(rt, daddr, 0);
+}
+
+static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr)
+{
+	return __rt_get_peer(rt, daddr, 1);
+}
+
 static inline int inet_iif(const struct sk_buff *skb)
 {
 	return skb_rtable(skb)->rt_iif;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..0c78ef1e5dde 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -253,9 +253,8 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 
 	/* Limit if icmp type is enabled in ratemask. */
 	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-		if (!rt->peer)
-			rt_bind_peer(rt, fl4->daddr, 1);
-		rc = inet_peer_xrlim_allow(rt->peer,
+		struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr);
+		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
 	}
 out:
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2c9f73f3b7cc..7a4d724765e6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -162,10 +162,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 	struct inet_peer *peer;
 	u32 *p = NULL;
 
-	if (!rt->peer)
-		rt_bind_peer(rt, rt->rt_dst, 1);
-
-	peer = rt->peer;
+	peer = rt_get_peer_create(rt, rt->rt_dst);
 	if (peer) {
 		u32 *old_p = __DST_METRICS_PTR(old);
 		unsigned long prev, new;
@@ -1364,14 +1361,13 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 	struct rtable *rt = (struct rtable *) dst;
 
 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		if (rt->peer == NULL)
-			rt_bind_peer(rt, rt->rt_dst, 1);
+		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
 
 		/* If peer is attached to destination, it is never detached,
 		   so that we need not to grab a lock to dereference it.
 		 */
-		if (rt->peer) {
-			iph->id = htons(inet_getid(rt->peer, more));
+		if (peer) {
+			iph->id = htons(inet_getid(peer, more));
 			return;
 		}
 	} else if (!rt)
@@ -1481,10 +1477,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rt->rt_gateway != old_gw)
 					continue;
 
-				if (!rt->peer)
-					rt_bind_peer(rt, rt->rt_dst, 1);
-
-				peer = rt->peer;
+				peer = rt_get_peer_create(rt, rt->rt_dst);
 				if (peer) {
 					if (peer->redirect_learned.a4 != new_gw) {
 						peer->redirect_learned.a4 = new_gw;
@@ -1579,9 +1572,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 	rcu_read_unlock();
 
-	if (!rt->peer)
-		rt_bind_peer(rt, rt->rt_dst, 1);
-	peer = rt->peer;
+	peer = rt_get_peer_create(rt, rt->rt_dst);
 	if (!peer) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 		return;
@@ -1646,9 +1637,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	}
 
-	if (!rt->peer)
-		rt_bind_peer(rt, rt->rt_dst, 1);
-	peer = rt->peer;
+	peer = rt_get_peer_create(rt, rt->rt_dst);
 
 	send = true;
 	if (peer) {
@@ -1754,9 +1743,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 	dst_confirm(dst);
 
-	if (!rt->peer)
-		rt_bind_peer(rt, rt->rt_dst, 1);
-	peer = rt->peer;
+	peer = rt_get_peer_create(rt, rt->rt_dst);
 	if (peer) {
 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
 
@@ -1782,12 +1769,8 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 static void ipv4_validate_peer(struct rtable *rt)
 {
 	if (rt->rt_peer_genid != rt_peer_genid()) {
-		struct inet_peer *peer;
-
-		if (!rt->peer)
-			rt_bind_peer(rt, rt->rt_dst, 0);
+		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
 
-		peer = rt->peer;
 		if (peer) {
 			check_peer_pmtu(&rt->dst, peer);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f485b451f928..833e8d96a636 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1832,9 +1832,7 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
 		peer = inet_getpeer_v4(net, inet->inet_daddr, 1);
 		*release_it = true;
 	} else {
-		if (!rt->peer)
-			rt_bind_peer(rt, inet->inet_daddr, 1);
-		peer = rt->peer;
+		peer = rt_get_peer_create(rt, inet->inet_daddr);
 		*release_it = false;
 	}
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 091a2971c7b7..ed89bba745a1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -188,14 +188,14 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (!rt->rt6i_peer)
-			rt6_bind_peer(rt, 1);
-		res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo);
+		peer = rt6_get_peer_create(rt);
+		res = inet_peer_xrlim_allow(peer, tmo);
 	}
 	dst_release(dst);
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 17b8c67998bb..62fcf3e48aca 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -463,6 +463,7 @@ int ip6_forward(struct sk_buff *skb)
 	 */
 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 		struct in6_addr *target = NULL;
+		struct inet_peer *peer;
 		struct rt6_info *rt;
 
 		/*
@@ -476,13 +477,12 @@ int ip6_forward(struct sk_buff *skb)
 		else
 			target = &hdr->daddr;
 
-		if (!rt->rt6i_peer)
-			rt6_bind_peer(rt, 1);
+		peer = rt6_get_peer_create(rt);
 
 		/* Limit redirects both by destination (here)
 		   and by source (inside ndisc_send_redirect)
 		 */
-		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
+		if (inet_peer_xrlim_allow(peer, 1*HZ))
 			ndisc_send_redirect(skb, target);
 	} else {
 		int addrtype = ipv6_addr_type(&hdr->saddr);
@@ -602,11 +602,8 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	int old, new;
 
 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		struct inet_peer *peer;
+		struct inet_peer *peer = rt6_get_peer_create(rt);
 
-		if (!rt->rt6i_peer)
-			rt6_bind_peer(rt, 1);
-		peer = rt->rt6i_peer;
 		if (peer) {
 			fhdr->identification = htonl(inet_getid(peer, 0));
 			return;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 54f62d3b8dd6..69a6330dea91 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1472,6 +1472,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	struct net *net = dev_net(dev);
 	struct sock *sk = net->ipv6.ndisc_sk;
 	int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+	struct inet_peer *peer;
 	struct sk_buff *buff;
 	struct icmp6hdr *icmph;
 	struct in6_addr saddr_buf;
@@ -1518,9 +1519,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			  "Redirect: destination is not a neighbour\n");
 		goto release;
 	}
-	if (!rt->rt6i_peer)
-		rt6_bind_peer(rt, 1);
-	if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
+	peer = rt6_get_peer_create(rt);
+	if (!inet_peer_xrlim_allow(peer, 1*HZ))
 		goto release;
 
 	if (dev->addr_len) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4eca0130cce7..8a986be4aeda 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -99,10 +99,7 @@ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 	if (!(rt->dst.flags & DST_HOST))
 		return NULL;
 
-	if (!rt->rt6i_peer)
-		rt6_bind_peer(rt, 1);
-
-	peer = rt->rt6i_peer;
+	peer = rt6_get_peer_create(rt);
 	if (peer) {
 		u32 *old_p = __DST_METRICS_PTR(old);
 		unsigned long prev, new;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1a9cdd09f11c..218433cb9928 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1744,9 +1744,7 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it)
 		peer = inet_getpeer_v6(net, &np->daddr, 1);
 		*release_it = true;
 	} else {
-		if (!rt->rt6i_peer)
-			rt6_bind_peer(rt, 1);
-		peer = rt->rt6i_peer;
+		peer = rt6_get_peer_create(rt);
 		*release_it = false;
 	}
 
-- 
cgit v1.2.3


From 4670fd819e7f47392c7c6fc6168ea2857c66d163 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 01:25:47 -0700
Subject: tcp: Get rid of inetpeer special cases.

The get_peer method TCP uses is full of special cases that make no
sense accommodating, and it also gets in the way of doing more
reasonable things here.

First of all, if the socket doesn't have a usable cached route, there
is no sense in trying to optimize timewait recycling.

Likewise for the case where we have IP options, such as SRR enabled,
that make the IP header destination address (and thus the destination
address of the route key) differ from that of the connection's
destination address.

Just return a NULL peer in these cases, and thus we're also able to
get rid of the clumsy inetpeer release logic.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  2 +-
 include/net/tcp.h                  |  2 +-
 net/ipv4/tcp_ipv4.c                | 21 ++++++++-------------
 net/ipv4/tcp_minisocks.c           |  5 +----
 net/ipv6/tcp_ipv6.c                | 21 ++++++++-------------
 5 files changed, 19 insertions(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7d83f90f203f..e1b7734c456f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,7 @@ struct inet_connection_sock_af_ops {
 	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req,
 				      struct dst_entry *dst);
-	struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it);
+	struct inet_peer *(*get_peer)(struct sock *sk);
 	u16	    net_header_len;
 	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e79aa48d9fc1..424591866037 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,7 +327,7 @@ extern void tcp_shutdown (struct sock *sk, int how);
 
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
-extern struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it);
+extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
 extern void *tcp_v4_tw_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 833e8d96a636..77f049d00dbb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1820,23 +1820,18 @@ do_time_wait:
 	goto discard_it;
 }
 
-struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
+struct inet_peer *tcp_v4_get_peer(struct sock *sk)
 {
 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
 	struct inet_sock *inet = inet_sk(sk);
-	struct net *net = sock_net(sk);
-	struct inet_peer *peer;
-
-	if (!rt ||
-	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
-		peer = inet_getpeer_v4(net, inet->inet_daddr, 1);
-		*release_it = true;
-	} else {
-		peer = rt_get_peer_create(rt, inet->inet_daddr);
-		*release_it = false;
-	}
 
-	return peer;
+	/* If we don't have a valid cached route, or we're doing IP
+	 * options which make the IPv4 header destination address
+	 * different from our peer's, do not bother with this.
+	 */
+	if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
+		return NULL;
+	return rt_get_peer_create(rt, inet->inet_daddr);
 }
 EXPORT_SYMBOL(tcp_v4_get_peer);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b85d9fe7d663..fef9dbf3af00 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -60,9 +60,8 @@ static bool tcp_remember_stamp(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_peer *peer;
-	bool release_it;
 
-	peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
+	peer = icsk->icsk_af_ops->get_peer(sk);
 	if (peer) {
 		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
@@ -70,8 +69,6 @@ static bool tcp_remember_stamp(struct sock *sk)
 			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
 			peer->tcp_ts = tp->rx_opt.ts_recent;
 		}
-		if (release_it)
-			inet_putpeer(peer);
 		return true;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 218433cb9928..b5ecf37b61a6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1732,23 +1732,18 @@ do_time_wait:
 	goto discard_it;
 }
 
-static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it)
+static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
 {
 	struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct net *net = sock_net(sk);
-	struct inet_peer *peer;
-
-	if (!rt ||
-	    !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) {
-		peer = inet_getpeer_v6(net, &np->daddr, 1);
-		*release_it = true;
-	} else {
-		peer = rt6_get_peer_create(rt);
-		*release_it = false;
-	}
 
-	return peer;
+	/* If we don't have a valid cached route, or we're doing IP
+	 * options which make the IPv6 header destination address
+	 * different from our peer's, do not bother with this.
+	 */
+	if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr))
+		return NULL;
+	return rt6_get_peer_create(rt);
 }
 
 static void *tcp_v6_tw_get_peer(struct sock *sk)
-- 
cgit v1.2.3


From 2397849baa7c44c242e5d5142d5d16d1e7ed53d0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 14:56:12 -0700
Subject: [PATCH] tcp: Cache inetpeer in timewait socket, and only when
 necessary.

Since it's guarenteed that we will access the inetpeer if we're trying
to do timewait recycling and TCP options were enabled on the
connection, just cache the peer in the timewait socket.

In the future, inetpeer lookups will be context dependent (per routing
realm), and this helps facilitate that as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h         |  3 ++-
 include/net/tcp.h           |  1 -
 include/net/timewait_sock.h |  8 --------
 net/ipv4/tcp_ipv4.c         | 10 ----------
 net/ipv4/tcp_minisocks.c    | 27 ++++++++++++++++++++-------
 net/ipv6/tcp_ipv6.c         | 13 -------------
 6 files changed, 22 insertions(+), 40 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4c5b63283377..23e8234f75a5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,8 +506,9 @@ struct tcp_timewait_sock {
 	u32			  tw_rcv_wnd;
 	u32			  tw_ts_recent;
 	long			  tw_ts_recent_stamp;
+	struct inet_peer	  *tw_peer;
 #ifdef CONFIG_TCP_MD5SIG
-	struct tcp_md5sig_key	*tw_md5_key;
+	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
 	/* Few sockets in timewait have cookies; in that case, then this
 	 * object holds a reference to them (tw_cookie_values->kref).
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 424591866037..9332f342259a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -328,7 +328,6 @@ extern void tcp_shutdown (struct sock *sk, int how);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
-extern void *tcp_v4_tw_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 8d6689cb2c66..68f0ecad6c6e 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -22,7 +22,6 @@ struct timewait_sock_ops {
 	int		(*twsk_unique)(struct sock *sk,
 				       struct sock *sktw, void *twp);
 	void		(*twsk_destructor)(struct sock *sk);
-	void		*(*twsk_getpeer)(struct sock *sk);
 };
 
 static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -41,11 +40,4 @@ static inline void twsk_destructor(struct sock *sk)
 		sk->sk_prot->twsk_prot->twsk_destructor(sk);
 }
 
-static inline void *twsk_getpeer(struct sock *sk)
-{
-	if (sk->sk_prot->twsk_prot->twsk_getpeer)
-		return sk->sk_prot->twsk_prot->twsk_getpeer(sk);
-	return NULL;
-}
-
 #endif /* _TIMEWAIT_SOCK_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77f049d00dbb..fda2ca17135e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1835,20 +1835,10 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_v4_get_peer);
 
-void *tcp_v4_tw_get_peer(struct sock *sk)
-{
-	const struct inet_timewait_sock *tw = inet_twsk(sk);
-	struct net *net = sock_net(sk);
-
-	return inet_getpeer_v4(net, tw->tw_daddr, 1);
-}
-EXPORT_SYMBOL(tcp_v4_tw_get_peer);
-
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor= tcp_twsk_destructor,
-	.twsk_getpeer	= tcp_v4_tw_get_peer,
 };
 
 const struct inet_connection_sock_af_ops ipv4_specific = {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index fef9dbf3af00..cb015317c9f7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -77,20 +77,19 @@ static bool tcp_remember_stamp(struct sock *sk)
 
 static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
+	const struct tcp_timewait_sock *tcptw;
 	struct sock *sk = (struct sock *) tw;
 	struct inet_peer *peer;
 
-	peer = twsk_getpeer(sk);
+	tcptw = tcp_twsk(sk);
+	peer = tcptw->tw_peer;
 	if (peer) {
-		const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
 		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
 			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
 			peer->tcp_ts	   = tcptw->tw_ts_recent;
 		}
-		inet_putpeer(peer);
 		return true;
 	}
 	return false;
@@ -314,9 +313,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
+	bool recycle_on = false;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
 		recycle_ok = tcp_remember_stamp(sk);
+		recycle_on = true;
+	}
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
@@ -324,8 +326,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+		struct inet_sock *inet = inet_sk(sk);
+		struct inet_peer *peer = NULL;
 
-		tw->tw_transparent	= inet_sk(sk)->transparent;
+		tw->tw_transparent	= inet->transparent;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
@@ -347,6 +351,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		}
 #endif
 
+		if (recycle_on)
+			peer = icsk->icsk_af_ops->get_peer(sk);
+		tcptw->tw_peer = peer;
+		if (peer)
+			atomic_inc(&peer->refcnt);
+
 #ifdef CONFIG_TCP_MD5SIG
 		/*
 		 * The timewait bucket does not have the key DB from the
@@ -398,8 +408,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 
 void tcp_twsk_destructor(struct sock *sk)
 {
-#ifdef CONFIG_TCP_MD5SIG
 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+	if (twsk->tw_peer)
+		inet_putpeer(twsk->tw_peer);
+#ifdef CONFIG_TCP_MD5SIG
 	if (twsk->tw_md5_key) {
 		tcp_free_md5sig_pool();
 		kfree_rcu(twsk->tw_md5_key, rcu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b5ecf37b61a6..f91b0bfd12d5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1746,23 +1746,10 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
 	return rt6_get_peer_create(rt);
 }
 
-static void *tcp_v6_tw_get_peer(struct sock *sk)
-{
-	const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
-	const struct inet_timewait_sock *tw = inet_twsk(sk);
-	struct net *net = sock_net(sk);
-
-	if (tw->tw_family == AF_INET)
-		return tcp_v4_tw_get_peer(sk);
-
-	return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1);
-}
-
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor= tcp_twsk_destructor,
-	.twsk_getpeer	= tcp_v6_tw_get_peer,
 };
 
 static const struct inet_connection_sock_af_ops ipv6_specific = {
-- 
cgit v1.2.3


From c3426b47190d7c6785230c91a706fd42208b4120 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 16:27:05 -0700
Subject: inet: Initialize per-netns inetpeer roots in net/ipv{4,6}/route.c

Instead of net/ipv4/inetpeer.c

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |  9 +++++++
 net/ipv4/inetpeer.c    | 64 +++++++++++++-------------------------------------
 net/ipv4/route.c       | 25 ++++++++++++++++++++
 net/ipv6/route.c       | 34 ++++++++++++++++++++++++++-
 4 files changed, 83 insertions(+), 49 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 20e67db69ac9..5a96c8edc526 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -65,6 +65,14 @@ struct inet_peer {
 	atomic_t		refcnt;
 };
 
+struct inet_peer_base {
+	struct inet_peer __rcu	*root;
+	seqlock_t		lock;
+	int			total;
+};
+
+extern void inet_peer_base_init(struct inet_peer_base *);
+
 void			inet_initpeers(void) __init;
 
 #define INETPEER_METRICS_NEW	(~(u32) 0)
@@ -105,6 +113,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct net *net,
 extern void inet_putpeer(struct inet_peer *p);
 extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
+extern void __inetpeer_invalidate_tree(struct inet_peer_base *);
 extern void inetpeer_invalidate_tree(struct net *net, int family);
 
 /*
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 1c8527349c86..d0d72f89b279 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -82,11 +82,13 @@ static const struct inet_peer peer_fake_node = {
 	.avl_height	= 0
 };
 
-struct inet_peer_base {
-	struct inet_peer __rcu *root;
-	seqlock_t	lock;
-	int		total;
-};
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+	bp->root = peer_avl_empty_rcu;
+	seqlock_init(&bp->lock);
+	bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
 
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
@@ -141,46 +143,6 @@ static void inetpeer_gc_worker(struct work_struct *work)
 	schedule_delayed_work(&gc_work, gc_delay);
 }
 
-static int __net_init inetpeer_net_init(struct net *net)
-{
-	net->ipv4.peers = kzalloc(sizeof(struct inet_peer_base),
-				  GFP_KERNEL);
-	if (net->ipv4.peers == NULL)
-		return -ENOMEM;
-
-	net->ipv4.peers->root = peer_avl_empty_rcu;
-	seqlock_init(&net->ipv4.peers->lock);
-
-	net->ipv6.peers = kzalloc(sizeof(struct inet_peer_base),
-				  GFP_KERNEL);
-	if (net->ipv6.peers == NULL)
-		goto out_ipv6;
-
-	net->ipv6.peers->root = peer_avl_empty_rcu;
-	seqlock_init(&net->ipv6.peers->lock);
-
-	return 0;
-out_ipv6:
-	kfree(net->ipv4.peers);
-	return -ENOMEM;
-}
-
-static void __net_exit inetpeer_net_exit(struct net *net)
-{
-	inetpeer_invalidate_tree(net, AF_INET);
-	kfree(net->ipv4.peers);
-	net->ipv4.peers = NULL;
-
-	inetpeer_invalidate_tree(net, AF_INET6);
-	kfree(net->ipv6.peers);
-	net->ipv6.peers = NULL;
-}
-
-static struct pernet_operations inetpeer_ops = {
-	.init = inetpeer_net_init,
-	.exit = inetpeer_net_exit,
-};
-
 /* Called from ip_output.c:ip_init  */
 void __init inet_initpeers(void)
 {
@@ -205,7 +167,6 @@ void __init inet_initpeers(void)
 			NULL);
 
 	INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
-	register_pernet_subsys(&inetpeer_ops);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -603,10 +564,9 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
 	schedule_delayed_work(&gc_work, gc_delay);
 }
 
-void inetpeer_invalidate_tree(struct net *net, int family)
+void __inetpeer_invalidate_tree(struct inet_peer_base *base)
 {
 	struct inet_peer *old, *new, *prev;
-	struct inet_peer_base *base = family_to_base(net, family);
 
 	write_seqlock_bh(&base->lock);
 
@@ -625,4 +585,12 @@ void inetpeer_invalidate_tree(struct net *net, int family)
 out:
 	write_sequnlock_bh(&base->lock);
 }
+EXPORT_SYMBOL(__inetpeer_invalidate_tree);
+
+void inetpeer_invalidate_tree(struct net *net, int family)
+{
+	struct inet_peer_base *base = family_to_base(net, family);
+
+	__inetpeer_invalidate_tree(base);
+}
 EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7a4d724765e6..4cd35c3904d4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3385,6 +3385,30 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
 	.init = rt_genid_init,
 };
 
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+	if (!bp)
+		return -ENOMEM;
+	inet_peer_base_init(bp);
+	net->ipv4.peers = bp;
+	return 0;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+	struct inet_peer_base *bp = net->ipv4.peers;
+
+	net->ipv4.peers = NULL;
+	__inetpeer_invalidate_tree(bp);
+	kfree(bp);
+}
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+	.init	=	ipv4_inetpeer_init,
+	.exit	=	ipv4_inetpeer_exit,
+};
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
@@ -3465,6 +3489,7 @@ int __init ip_rt_init(void)
 	register_pernet_subsys(&sysctl_route_ops);
 #endif
 	register_pernet_subsys(&rt_genid_ops);
+	register_pernet_subsys(&ipv4_inetpeer_ops);
 	return rc;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8a986be4aeda..3e1e4e0da096 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2996,6 +2996,31 @@ static struct pernet_operations ip6_route_net_ops = {
 	.exit = ip6_route_net_exit,
 };
 
+static int __net_init ipv6_inetpeer_init(struct net *net)
+{
+	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+	if (!bp)
+		return -ENOMEM;
+	inet_peer_base_init(bp);
+	net->ipv6.peers = bp;
+	return 0;
+}
+
+static void __net_exit ipv6_inetpeer_exit(struct net *net)
+{
+	struct inet_peer_base *bp = net->ipv6.peers;
+
+	net->ipv6.peers = NULL;
+	__inetpeer_invalidate_tree(bp);
+	kfree(bp);
+}
+
+static __net_initdata struct pernet_operations ipv6_inetpeer_ops = {
+	.init	=	ipv6_inetpeer_init,
+	.exit	=	ipv6_inetpeer_exit,
+};
+
 static struct notifier_block ip6_route_dev_notifier = {
 	.notifier_call = ip6_route_dev_notify,
 	.priority = 0,
@@ -3020,6 +3045,10 @@ int __init ip6_route_init(void)
 	if (ret)
 		goto out_dst_entries;
 
+	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
+	if (ret)
+		goto out_register_subsys;
+
 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
 
 	/* Registering of the loopback is done before this portion of code,
@@ -3035,7 +3064,7 @@ int __init ip6_route_init(void)
   #endif
 	ret = fib6_init();
 	if (ret)
-		goto out_register_subsys;
+		goto out_register_inetpeer;
 
 	ret = xfrm6_init();
 	if (ret)
@@ -3064,6 +3093,8 @@ xfrm6_init:
 	xfrm6_fini();
 out_fib6_init:
 	fib6_gc_cleanup();
+out_register_inetpeer:
+	unregister_pernet_subsys(&ipv6_inetpeer_ops);
 out_register_subsys:
 	unregister_pernet_subsys(&ip6_route_net_ops);
 out_dst_entries:
@@ -3079,6 +3110,7 @@ void ip6_route_cleanup(void)
 	fib6_rules_cleanup();
 	xfrm6_fini();
 	fib6_gc_cleanup();
+	unregister_pernet_subsys(&ipv6_inetpeer_ops);
 	unregister_pernet_subsys(&ip6_route_net_ops);
 	dst_entries_destroy(&ip6_dst_blackhole_ops);
 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
-- 
cgit v1.2.3


From 56a6b248eb345c1948ee60bf426de1ff7dd81509 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 16:32:41 -0700
Subject: inet: Consolidate inetpeer_invalidate_tree() interfaces.

We only need one interface for this operation, since we always know
which inetpeer root we want to flush.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |  3 +--
 net/ipv4/inetpeer.c    | 10 +---------
 net/ipv4/route.c       |  4 ++--
 net/ipv6/route.c       |  2 +-
 4 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 5a96c8edc526..733edc641b76 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -113,8 +113,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct net *net,
 extern void inet_putpeer(struct inet_peer *p);
 extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
-extern void __inetpeer_invalidate_tree(struct inet_peer_base *);
-extern void inetpeer_invalidate_tree(struct net *net, int family);
+extern void inetpeer_invalidate_tree(struct inet_peer_base *);
 
 /*
  * temporary check to make sure we dont access rid, ip_id_count, tcp_ts,
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d0d72f89b279..9d89a381f0e1 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -564,7 +564,7 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
 	schedule_delayed_work(&gc_work, gc_delay);
 }
 
-void __inetpeer_invalidate_tree(struct inet_peer_base *base)
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
 {
 	struct inet_peer *old, *new, *prev;
 
@@ -585,12 +585,4 @@ void __inetpeer_invalidate_tree(struct inet_peer_base *base)
 out:
 	write_sequnlock_bh(&base->lock);
 }
-EXPORT_SYMBOL(__inetpeer_invalidate_tree);
-
-void inetpeer_invalidate_tree(struct net *net, int family)
-{
-	struct inet_peer_base *base = family_to_base(net, family);
-
-	__inetpeer_invalidate_tree(base);
-}
 EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4cd35c3904d4..cf78343940de 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -935,7 +935,7 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	inetpeer_invalidate_tree(net, AF_INET);
+	inetpeer_invalidate_tree(net->ipv4.peers);
 }
 
 /*
@@ -3401,7 +3401,7 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net)
 	struct inet_peer_base *bp = net->ipv4.peers;
 
 	net->ipv4.peers = NULL;
-	__inetpeer_invalidate_tree(bp);
+	inetpeer_invalidate_tree(bp);
 	kfree(bp);
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3e1e4e0da096..7f346d7492d2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3012,7 +3012,7 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net)
 	struct inet_peer_base *bp = net->ipv6.peers;
 
 	net->ipv6.peers = NULL;
-	__inetpeer_invalidate_tree(bp);
+	inetpeer_invalidate_tree(bp);
 	kfree(bp);
 }
 
-- 
cgit v1.2.3


From c0efc887dcadbdbfe171f028acfab9c7c00e9dde Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 19:12:36 -0700
Subject: inet: Pass inetpeer root into inet_getpeer*() interfaces.

Otherwise we reference potentially non-existing members when
ipv6 is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h | 10 +++++-----
 net/ipv4/inetpeer.c    |  9 +--------
 net/ipv4/ip_fragment.c |  2 +-
 net/ipv4/route.c       |  6 +++---
 net/ipv6/route.c       |  2 +-
 5 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 733edc641b76..b84b32fd5df1 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -83,11 +83,11 @@ static inline bool inet_metrics_new(const struct inet_peer *p)
 }
 
 /* can be called with or without local BH being disabled */
-struct inet_peer *inet_getpeer(struct net *net,
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 			       const struct inetpeer_addr *daddr,
 			       int create);
 
-static inline struct inet_peer *inet_getpeer_v4(struct net *net,
+static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
 						__be32 v4daddr,
 						int create)
 {
@@ -95,10 +95,10 @@ static inline struct inet_peer *inet_getpeer_v4(struct net *net,
 
 	daddr.addr.a4 = v4daddr;
 	daddr.family = AF_INET;
-	return inet_getpeer(net, &daddr, create);
+	return inet_getpeer(base, &daddr, create);
 }
 
-static inline struct inet_peer *inet_getpeer_v6(struct net *net,
+static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
 						const struct in6_addr *v6daddr,
 						int create)
 {
@@ -106,7 +106,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct net *net,
 
 	*(struct in6_addr *)daddr.addr.a6 = *v6daddr;
 	daddr.family = AF_INET6;
-	return inet_getpeer(net, &daddr, create);
+	return inet_getpeer(base, &daddr, create);
 }
 
 /* can be called from BH context or outside */
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9d89a381f0e1..e4cba56a5349 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -391,12 +391,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
 	call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
-static struct inet_peer_base *family_to_base(struct net *net,
-					     int family)
-{
-	return family == AF_INET ? net->ipv4.peers : net->ipv6.peers;
-}
-
 /* perform garbage collect on all items stacked during a lookup */
 static int inet_peer_gc(struct inet_peer_base *base,
 			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -434,12 +428,11 @@ static int inet_peer_gc(struct inet_peer_base *base,
 	return cnt;
 }
 
-struct inet_peer *inet_getpeer(struct net *net,
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 			       const struct inetpeer_addr *daddr,
 			       int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
-	struct inet_peer_base *base = family_to_base(net, daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
 	int invalidated, gccnt = 0;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 22c6bab9717a..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -184,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
 	qp->peer = sysctl_ipfrag_max_dist ?
-		inet_getpeer_v4(net, arg->iph->saddr, 1) : NULL;
+		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
 }
 
 static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cf78343940de..2aa663a6ae9e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1328,7 +1328,7 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 	struct net *net = dev_net(rt->dst.dev);
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v4(net, daddr, create);
+	peer = inet_getpeer_v4(net->ipv4.peers, daddr, create);
 
 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
@@ -1684,7 +1684,7 @@ unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
 	unsigned short est_mtu = 0;
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v4(net, iph->daddr, 1);
+	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 	if (peer) {
 		unsigned short mtu = new_mtu;
 
@@ -1929,7 +1929,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
 		create = 1;
 
-	rt->peer = peer = inet_getpeer_v4(net, rt->rt_dst, create);
+	rt->peer = peer = inet_getpeer_v4(net->ipv4.peers, rt->rt_dst, create);
 	if (peer) {
 		rt->rt_peer_genid = rt_peer_genid();
 		if (inet_metrics_new(peer))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9586c27e069c..8fc41d502bbd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -306,7 +306,7 @@ void rt6_bind_peer(struct rt6_info *rt, int create)
 	struct net *net = dev_net(rt->dst.dev);
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create);
+	peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, create);
 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
 	else
-- 
cgit v1.2.3


From 97bab73f987e2781129cd6f4b6379bf44d808cc6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 22:36:36 -0700
Subject: inet: Hide route peer accesses behind helpers.

We encode the pointer(s) into an unsigned long with one state bit.

The state bit is used so we can store the inetpeer tree root to use
when resolving the peer later.

Later the peer roots will be per-FIB table, and this change works to
facilitate that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h  | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/net/ip6_fib.h   | 32 +++++++++++++++++++++++++++-
 include/net/ip6_route.h |  6 +++---
 include/net/route.h     | 42 +++++++++++++++++++++++++++++++++----
 net/ipv4/route.c        | 56 +++++++++++++++++++++++++++++--------------------
 net/ipv4/xfrm4_policy.c | 10 ++++-----
 net/ipv6/route.c        | 42 +++++++++++++++++++++----------------
 net/ipv6/xfrm6_policy.c | 10 ++++-----
 8 files changed, 193 insertions(+), 59 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index b84b32fd5df1..d432489e7109 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -71,6 +71,60 @@ struct inet_peer_base {
 	int			total;
 };
 
+#define INETPEER_BASE_BIT	0x1UL
+
+static inline struct inet_peer *inetpeer_ptr(unsigned long val)
+{
+	BUG_ON(val & INETPEER_BASE_BIT);
+	return (struct inet_peer *) val;
+}
+
+static inline struct inet_peer_base *inetpeer_base_ptr(unsigned long val)
+{
+	if (!(val & INETPEER_BASE_BIT))
+		return NULL;
+	val &= ~INETPEER_BASE_BIT;
+	return (struct inet_peer_base *) val;
+}
+
+static inline bool inetpeer_ptr_is_peer(unsigned long val)
+{
+	return !(val & INETPEER_BASE_BIT);
+}
+
+static inline void __inetpeer_ptr_set_peer(unsigned long *val, struct inet_peer *peer)
+{
+	/* This implicitly clears INETPEER_BASE_BIT */
+	*val = (unsigned long) peer;
+}
+
+static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *peer)
+{
+	unsigned long val = (unsigned long) peer;
+	unsigned long orig = *ptr;
+
+	if (!(orig & INETPEER_BASE_BIT) || !val ||
+	    cmpxchg(ptr, orig, val) != orig)
+		return false;
+	return true;
+}
+
+static inline void inetpeer_init_ptr(unsigned long *ptr, struct inet_peer_base *base)
+{
+	*ptr = (unsigned long) base | INETPEER_BASE_BIT;
+}
+
+static inline void inetpeer_transfer_peer(unsigned long *to, unsigned long *from)
+{
+	unsigned long val = *from;
+
+	*to = val;
+	if (inetpeer_ptr_is_peer(val)) {
+		struct inet_peer *peer = inetpeer_ptr(val);
+		atomic_inc(&peer->refcnt);
+	}
+}
+
 extern void inet_peer_base_init(struct inet_peer_base *);
 
 void			inet_initpeers(void) __init;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 0ae759a6c76e..3ac5f155c690 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -107,7 +107,7 @@ struct rt6_info {
 	u32				rt6i_peer_genid;
 
 	struct inet6_dev		*rt6i_idev;
-	struct inet_peer		*rt6i_peer;
+	unsigned long			_rt6i_peer;
 
 #ifdef CONFIG_XFRM
 	u32				rt6i_flow_cache_genid;
@@ -118,6 +118,36 @@ struct rt6_info {
 	u8				rt6i_protocol;
 };
 
+static inline struct inet_peer *rt6_peer_ptr(struct rt6_info *rt)
+{
+	return inetpeer_ptr(rt->_rt6i_peer);
+}
+
+static inline bool rt6_has_peer(struct rt6_info *rt)
+{
+	return inetpeer_ptr_is_peer(rt->_rt6i_peer);
+}
+
+static inline void __rt6_set_peer(struct rt6_info *rt, struct inet_peer *peer)
+{
+	__inetpeer_ptr_set_peer(&rt->_rt6i_peer, peer);
+}
+
+static inline bool rt6_set_peer(struct rt6_info *rt, struct inet_peer *peer)
+{
+	return inetpeer_ptr_set_peer(&rt->_rt6i_peer, peer);
+}
+
+static inline void rt6_init_peer(struct rt6_info *rt, struct inet_peer_base *base)
+{
+	inetpeer_init_ptr(&rt->_rt6i_peer, base);
+}
+
+static inline void rt6_transfer_peer(struct rt6_info *rt, struct rt6_info *ort)
+{
+	inetpeer_transfer_peer(&rt->_rt6i_peer, &ort->_rt6i_peer);
+}
+
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
 	return ((struct rt6_info *)dst)->rt6i_idev;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 73d750288121..f88a85cf31c3 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -57,11 +57,11 @@ extern void rt6_bind_peer(struct rt6_info *rt, int create);
 
 static inline struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
 {
-	if (rt->rt6i_peer)
-		return rt->rt6i_peer;
+	if (rt6_has_peer(rt))
+		return rt6_peer_ptr(rt);
 
 	rt6_bind_peer(rt, create);
-	return rt->rt6i_peer;
+	return rt6_peer_ptr(rt);
 }
 
 static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt)
diff --git a/include/net/route.h b/include/net/route.h
index 433fc6c1d404..6340c37677fc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -67,10 +67,44 @@ struct rtable {
 	/* Miscellaneous cached information */
 	__be32			rt_spec_dst; /* RFC1122 specific destination */
 	u32			rt_peer_genid;
-	struct inet_peer	*peer; /* long-living peer info */
+	unsigned long		_peer; /* long-living peer info */
 	struct fib_info		*fi; /* for client ref to shared metrics */
 };
 
+static inline struct inet_peer *rt_peer_ptr(struct rtable *rt)
+{
+	return inetpeer_ptr(rt->_peer);
+}
+
+static inline bool rt_has_peer(struct rtable *rt)
+{
+	return inetpeer_ptr_is_peer(rt->_peer);
+}
+
+static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer)
+{
+	__inetpeer_ptr_set_peer(&rt->_peer, peer);
+}
+
+static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer)
+{
+	return inetpeer_ptr_set_peer(&rt->_peer, peer);
+}
+
+static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base)
+{
+	inetpeer_init_ptr(&rt->_peer, base);
+}
+
+static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort)
+{
+	rt->_peer = ort->_peer;
+	if (rt_has_peer(ort)) {
+		struct inet_peer *peer = rt_peer_ptr(ort);
+		atomic_inc(&peer->refcnt);
+	}
+}
+
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
 	return rt->rt_route_iif != 0;
@@ -298,11 +332,11 @@ extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
 
 static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
 {
-	if (rt->peer)
-		return rt->peer;
+	if (rt_has_peer(rt))
+		return rt_peer_ptr(rt);
 
 	rt_bind_peer(rt, daddr, create);
-	return rt->peer;
+	return rt_peer_ptr(rt);
 }
 
 static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2aa663a6ae9e..03e5b614370e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -677,7 +677,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		(rth->peer && rth->peer->pmtu_expires);
+		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -1325,12 +1325,16 @@ static u32 rt_peer_genid(void)
 
 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 {
-	struct net *net = dev_net(rt->dst.dev);
+	struct inet_peer_base *base;
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v4(net->ipv4.peers, daddr, create);
+	base = inetpeer_base_ptr(rt->_peer);
+	if (!base)
+		return;
+
+	peer = inet_getpeer_v4(base, daddr, create);
 
-	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
+	if (!rt_set_peer(rt, peer))
 		inet_putpeer(peer);
 	else
 		rt->rt_peer_genid = rt_peer_genid();
@@ -1533,8 +1537,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 						rt_genid(dev_net(dst->dev)));
 			rt_del(hash, rt);
 			ret = NULL;
-		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
-			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
+		} else if (rt_has_peer(rt)) {
+			struct inet_peer *peer = rt_peer_ptr(rt);
+			if (peer_pmtu_expired(peer))
+				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1796,14 +1802,13 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 static void ipv4_dst_destroy(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *) dst;
-	struct inet_peer *peer = rt->peer;
 
 	if (rt->fi) {
 		fib_info_put(rt->fi);
 		rt->fi = NULL;
 	}
-	if (peer) {
-		rt->peer = NULL;
+	if (rt_has_peer(rt)) {
+		struct inet_peer *peer = rt_peer_ptr(rt);
 		inet_putpeer(peer);
 	}
 }
@@ -1816,8 +1821,11 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
-		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+	if (rt && rt_has_peer(rt)) {
+		struct inet_peer *peer = rt_peer_ptr(rt);
+		if (peer_pmtu_cleaned(peer))
+			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
+	}
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1919,7 +1927,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 			    struct fib_info *fi)
 {
-	struct net *net = dev_net(rt->dst.dev);
+	struct inet_peer_base *base;
 	struct inet_peer *peer;
 	int create = 0;
 
@@ -1929,8 +1937,12 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
 		create = 1;
 
-	rt->peer = peer = inet_getpeer_v4(net->ipv4.peers, rt->rt_dst, create);
+	base = inetpeer_base_ptr(rt->_peer);
+	BUG_ON(!base);
+
+	peer = inet_getpeer_v4(base, rt->rt_dst, create);
 	if (peer) {
+		__rt_set_peer(rt, peer);
 		rt->rt_peer_genid = rt_peer_genid();
 		if (inet_metrics_new(peer))
 			memcpy(peer->metrics, fi->fib_metrics,
@@ -2046,7 +2058,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
-	rth->peer = NULL;
+	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
 	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -2174,7 +2186,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
-	rth->peer = NULL;
+	rt_init_peer(rth, dev_net(rth->dst.dev)->ipv4.peers);
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -2357,7 +2369,7 @@ local_input:
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
-	rth->peer = NULL;
+	rt_init_peer(rth, net->ipv4.peers);
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -2561,7 +2573,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_gateway = fl4->daddr;
 	rth->rt_spec_dst= fl4->saddr;
 	rth->rt_peer_genid = 0;
-	rth->peer = NULL;
+	rt_init_peer(rth, dev_net(dev_out)->ipv4.peers);
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2898,9 +2910,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->rt_spec_dst = ort->rt_spec_dst;
-		rt->peer = ort->peer;
-		if (rt->peer)
-			atomic_inc(&rt->peer->refcnt);
+		rt_transfer_peer(rt, ort);
 		rt->fi = ort->fi;
 		if (rt->fi)
 			atomic_inc(&rt->fi->fib_clntref);
@@ -2938,7 +2948,6 @@ static int rt_fill_info(struct net *net,
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
-	const struct inet_peer *peer = rt->peer;
 	u32 id = 0, ts = 0, tsage = 0, error;
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
@@ -2994,8 +3003,9 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 
 	error = rt->dst.error;
-	if (peer) {
-		inet_peer_refcheck(rt->peer);
+	if (rt_has_peer(rt)) {
+		const struct inet_peer *peer = rt_peer_ptr(rt);
+		inet_peer_refcheck(peer);
 		id = atomic_read(&peer->ip_id_count) & 0xffff;
 		if (peer->tcp_ts_stamp) {
 			ts = peer->tcp_ts;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 0d3426cb5c4f..8855d8268552 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,9 +90,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
 
-	xdst->u.rt.peer = rt->peer;
-	if (rt->peer)
-		atomic_inc(&rt->peer->refcnt);
+	rt_transfer_peer(&xdst->u.rt, rt);
 
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
@@ -212,8 +210,10 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 
 	dst_destroy_metrics_generic(dst);
 
-	if (likely(xdst->u.rt.peer))
-		inet_putpeer(xdst->u.rt.peer);
+	if (rt_has_peer(&xdst->u.rt)) {
+		struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt);
+		inet_putpeer(peer);
+	}
 
 	xfrm_dst_destroy(xdst);
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8fc41d502bbd..17a9b8687f29 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -258,16 +258,18 @@ static struct rt6_info ip6_blk_hole_entry_template = {
 #endif
 
 /* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
+static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 					     struct net_device *dev,
 					     int flags)
 {
-	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
+	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
+					0, 0, flags);
 
-	if (rt)
+	if (rt) {
 		memset(&rt->rt6i_table, 0,
 		       sizeof(*rt) - sizeof(struct dst_entry));
-
+		rt6_init_peer(rt, net->ipv6.peers);
+	}
 	return rt;
 }
 
@@ -275,7 +277,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct inet6_dev *idev = rt->rt6i_idev;
-	struct inet_peer *peer = rt->rt6i_peer;
 
 	if (!(rt->dst.flags & DST_HOST))
 		dst_destroy_metrics_generic(dst);
@@ -288,8 +289,8 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 	if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
 		dst_release(dst->from);
 
-	if (peer) {
-		rt->rt6i_peer = NULL;
+	if (rt6_has_peer(rt)) {
+		struct inet_peer *peer = rt6_peer_ptr(rt);
 		inet_putpeer(peer);
 	}
 }
@@ -303,11 +304,15 @@ static u32 rt6_peer_genid(void)
 
 void rt6_bind_peer(struct rt6_info *rt, int create)
 {
-	struct net *net = dev_net(rt->dst.dev);
+	struct inet_peer_base *base;
 	struct inet_peer *peer;
 
-	peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, create);
-	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
+	base = inetpeer_base_ptr(rt->_rt6i_peer);
+	if (!base)
+		return;
+
+	peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
+	if (!rt6_set_peer(rt, peer))
 		inet_putpeer(peer);
 	else
 		rt->rt6i_peer_genid = rt6_peer_genid();
@@ -950,6 +955,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 	if (rt) {
 		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
+		rt6_init_peer(rt, net->ipv6.peers);
 
 		new = &rt->dst;
 
@@ -994,7 +1000,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 
 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
-			if (!rt->rt6i_peer)
+			if (!rt6_has_peer(rt))
 				rt6_bind_peer(rt, 0);
 			rt->rt6i_peer_genid = rt6_peer_genid();
 		}
@@ -1108,7 +1114,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	if (unlikely(!idev))
 		return ERR_PTR(-ENODEV);
 
-	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
+	rt = ip6_dst_alloc(net, dev, 0);
 	if (unlikely(!rt)) {
 		in6_dev_put(idev);
 		dst = ERR_PTR(-ENOMEM);
@@ -1290,7 +1296,7 @@ int ip6_route_add(struct fib6_config *cfg)
 	if (!table)
 		goto out;
 
-	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
+	rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT);
 
 	if (!rt) {
 		err = -ENOMEM;
@@ -1812,8 +1818,7 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 				    const struct in6_addr *dest)
 {
 	struct net *net = dev_net(ort->dst.dev);
-	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
-					    ort->dst.dev, 0);
+	struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0);
 
 	if (rt) {
 		rt->dst.input = ort->dst.input;
@@ -2097,8 +2102,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 				    bool anycast)
 {
 	struct net *net = dev_net(idev->dev);
-	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
-					    net->loopback_dev, 0);
+	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0);
 	int err;
 
 	if (!rt) {
@@ -2519,7 +2523,9 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	peer = rt->rt6i_peer;
+	peer = NULL;
+	if (rt6_has_peer(rt))
+		peer = rt6_peer_ptr(rt);
 	ts = tsage = 0;
 	if (peer && peer->tcp_ts_stamp) {
 		ts = peer->tcp_ts;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 8625fba96db9..d7494845efbf 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -99,9 +99,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	if (!xdst->u.rt6.rt6i_idev)
 		return -ENODEV;
 
-	xdst->u.rt6.rt6i_peer = rt->rt6i_peer;
-	if (rt->rt6i_peer)
-		atomic_inc(&rt->rt6i_peer->refcnt);
+	rt6_transfer_peer(&xdst->u.rt6, rt);
 
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
@@ -223,8 +221,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
 	if (likely(xdst->u.rt6.rt6i_idev))
 		in6_dev_put(xdst->u.rt6.rt6i_idev);
 	dst_destroy_metrics_generic(dst);
-	if (likely(xdst->u.rt6.rt6i_peer))
-		inet_putpeer(xdst->u.rt6.rt6i_peer);
+	if (rt6_has_peer(&xdst->u.rt6)) {
+		struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6);
+		inet_putpeer(peer);
+	}
 	xfrm_dst_destroy(xdst);
 }
 
-- 
cgit v1.2.3


From 46517008e1168dc926cf2c47d529efc07eca85c0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 10 Jun 2012 00:04:12 -0700
Subject: ipv4: Kill ip_rt_frag_needed().

There is zero point to this function.

It's only real substance is to perform an extremely outdated BSD4.2
ICMP check, which we can safely remove.  If you really have a MTU
limited link being routed by a BSD4.2 derived system, here's a nickel
go buy yourself a real router.

The other actions of ip_rt_frag_needed(), checking and conditionally
updating the peer, are done by the per-protocol handlers of the ICMP
event.

TCP, UDP, et al. have a handler which will receive this event and
transmit it back into the associated route via dst_ops->update_pmtu().

This simplification is important, because it eliminates the one place
where we do not have a proper route context in which to make an
inetpeer lookup.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h  |  2 --
 net/ipv4/icmp.c      |  4 +---
 net/ipv4/route.c     | 61 ----------------------------------------------------
 net/rxrpc/ar-error.c |  4 ----
 4 files changed, 1 insertion(+), 70 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 6340c37677fc..cc693a5bb20d 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -215,8 +215,6 @@ static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 s
 	return ip_route_input_common(skb, dst, src, tos, devin, true);
 }
 
-extern unsigned short	ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
-					  unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 
 extern unsigned int		inet_addr_type(struct net *net, __be32 addr);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0c78ef1e5dde..e1caa1abe5d1 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -673,9 +673,7 @@ static void icmp_unreach(struct sk_buff *skb)
 				LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
 					       &iph->daddr);
 			} else {
-				info = ip_rt_frag_needed(net, iph,
-							 ntohs(icmph->un.frag.mtu),
-							 skb->dev);
+				info = ntohs(icmph->un.frag.mtu);
 				if (!info)
 					goto out;
 			}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 03e5b614370e..4f5834c4a667 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1664,67 +1664,6 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-/*
- *	The last two values are not from the RFC but
- *	are needed for AMPRnet AX.25 paths.
- */
-
-static const unsigned short mtu_plateau[] =
-{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
-
-static inline unsigned short guess_mtu(unsigned short old_mtu)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
-		if (old_mtu > mtu_plateau[i])
-			return mtu_plateau[i];
-	return 68;
-}
-
-unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
-				 unsigned short new_mtu,
-				 struct net_device *dev)
-{
-	unsigned short old_mtu = ntohs(iph->tot_len);
-	unsigned short est_mtu = 0;
-	struct inet_peer *peer;
-
-	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
-	if (peer) {
-		unsigned short mtu = new_mtu;
-
-		if (new_mtu < 68 || new_mtu >= old_mtu) {
-			/* BSD 4.2 derived systems incorrectly adjust
-			 * tot_len by the IP header length, and report
-			 * a zero MTU in the ICMP message.
-			 */
-			if (mtu == 0 &&
-			    old_mtu >= 68 + (iph->ihl << 2))
-				old_mtu -= iph->ihl << 2;
-			mtu = guess_mtu(old_mtu);
-		}
-
-		if (mtu < ip_rt_min_pmtu)
-			mtu = ip_rt_min_pmtu;
-		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
-			unsigned long pmtu_expires;
-
-			pmtu_expires = jiffies + ip_rt_mtu_expires;
-			if (!pmtu_expires)
-				pmtu_expires = 1UL;
-
-			est_mtu = mtu;
-			peer->pmtu_learned = mtu;
-			peer->pmtu_expires = pmtu_expires;
-			atomic_inc(&__rt_peer_genid);
-		}
-
-		inet_putpeer(peer);
-	}
-	return est_mtu ? : new_mtu;
-}
-
 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
 {
 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 5d6b572a6704..a9206087b4d7 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -81,10 +81,6 @@ void rxrpc_UDP_error_report(struct sock *sk)
 			_net("I/F MTU %u", mtu);
 		}
 
-		/* ip_rt_frag_needed() may have eaten the info */
-		if (mtu == 0)
-			mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
-
 		if (mtu == 0) {
 			/* they didn't give us a size, estimate one */
 			if (mtu > 1500) {
-- 
cgit v1.2.3


From b48c80ece973e9eddb042f6685b482b261ff0d47 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 10 Jun 2012 00:24:21 -0700
Subject: inet: Add family scope inetpeer flushes.

This implementation can deal with having many inetpeer roots, which is
a necessary prerequisite for per-FIB table rooted peer tables.

Each family (AF_INET, AF_INET6) has a sequence number which we bump
when we get a family invalidation request.

Each peer lookup cheaply checks whether the flush sequence of the
root we are using is out of date, and if so flushes it and updates
the sequence number.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |  2 ++
 net/ipv4/inetpeer.c    | 28 ++++++++++++++++++++++++++++
 net/ipv4/route.c       |  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index d432489e7109..e15c0862a686 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -68,6 +68,7 @@ struct inet_peer {
 struct inet_peer_base {
 	struct inet_peer __rcu	*root;
 	seqlock_t		lock;
+	u32			flush_seq;
 	int			total;
 };
 
@@ -168,6 +169,7 @@ extern void inet_putpeer(struct inet_peer *p);
 extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);
 
 extern void inetpeer_invalidate_tree(struct inet_peer_base *);
+extern void inetpeer_invalidate_family(int family);
 
 /*
  * temporary check to make sure we dont access rid, ip_id_count, tcp_ts,
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e4cba56a5349..cac02ad1425d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -86,10 +86,36 @@ void inet_peer_base_init(struct inet_peer_base *bp)
 {
 	bp->root = peer_avl_empty_rcu;
 	seqlock_init(&bp->lock);
+	bp->flush_seq = ~0U;
 	bp->total = 0;
 }
 EXPORT_SYMBOL_GPL(inet_peer_base_init);
 
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
+
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+	return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+	atomic_t *fp = inetpeer_seq_ptr(family);
+
+	if (unlikely(base->flush_seq != atomic_read(fp))) {
+		inetpeer_invalidate_tree(base);
+		base->flush_seq = atomic_read(fp);
+	}
+}
+
+void inetpeer_invalidate_family(int family)
+{
+	atomic_t *fp = inetpeer_seq_ptr(family);
+
+	atomic_inc(fp);
+}
+
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
 /* Exported for sysctl_net_ipv4.  */
@@ -437,6 +463,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 	unsigned int sequence;
 	int invalidated, gccnt = 0;
 
+	flush_check(base, daddr->family);
+
 	/* Attempt a lockless lookup first.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4f5834c4a667..456a9470fb54 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -935,7 +935,7 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	inetpeer_invalidate_tree(net->ipv4.peers);
+	inetpeer_invalidate_family(AF_INET);
 }
 
 /*
-- 
cgit v1.2.3


From 8e77327783c753689a1a766ab9d301b81c2529f1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 11 Jun 2012 00:01:52 -0700
Subject: inet: Add inetpeer tree roots to the FIB tables.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 include/net/ip_fib.h  | 12 +++++++-----
 net/ipv4/fib_trie.c   |  3 +++
 net/ipv6/ip6_fib.c    |  5 +++++
 4 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3ac5f155c690..a192f7807659 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -237,6 +237,7 @@ struct fib6_table {
 	u32			tb6_id;
 	rwlock_t		tb6_lock;
 	struct fib6_node	tb6_root;
+	struct inet_peer_base	tb6_peers;
 };
 
 #define RT6_TABLE_UNSPEC	RT_TABLE_UNSPEC
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 78df0866cc38..4b347c0ca094 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -19,6 +19,7 @@
 #include <net/flow.h>
 #include <linux/seq_file.h>
 #include <net/fib_rules.h>
+#include <net/inetpeer.h>
 
 struct fib_config {
 	u8			fc_dst_len;
@@ -157,11 +158,12 @@ extern __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 					 FIB_RES_SADDR(net, res))
 
 struct fib_table {
-	struct hlist_node tb_hlist;
-	u32		tb_id;
-	int		tb_default;
-	int		tb_num_default;
-	unsigned long	tb_data[0];
+	struct hlist_node	tb_hlist;
+	u32			tb_id;
+	int			tb_default;
+	int			tb_num_default;
+	struct inet_peer_base	tb_peers;
+	unsigned long		tb_data[0];
 };
 
 extern int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18cbc15b20d5..9b0f25930fbc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1843,6 +1843,8 @@ int fib_table_flush(struct fib_table *tb)
 	if (ll && hlist_empty(&ll->list))
 		trie_leaf_remove(t, ll);
 
+	inetpeer_invalidate_tree(&tb->tb_peers);
+
 	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }
@@ -1991,6 +1993,7 @@ struct fib_table *fib_trie_table(u32 id)
 	tb->tb_id = id;
 	tb->tb_default = -1;
 	tb->tb_num_default = 0;
+	inet_peer_base_init(&tb->tb_peers);
 
 	t = (struct trie *) tb->tb_data;
 	memset(t, 0, sizeof(*t));
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0c220a416626..7ef0743f06f0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -197,6 +197,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
 		table->tb6_id = id;
 		table->tb6_root.leaf = net->ipv6.ip6_null_entry;
 		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+		inet_peer_base_init(&table->tb6_peers);
 	}
 
 	return table;
@@ -1633,6 +1634,7 @@ static int __net_init fib6_net_init(struct net *net)
 	net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
@@ -1643,6 +1645,7 @@ static int __net_init fib6_net_init(struct net *net)
 	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
 #endif
 	fib6_tables_init(net);
 
@@ -1666,8 +1669,10 @@ static void fib6_net_exit(struct net *net)
 	del_timer_sync(&net->ipv6.ip6_fib_timer);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
 	kfree(net->ipv6.fib6_local_tbl);
 #endif
+	inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
 	kfree(net->ipv6.fib6_main_tbl);
 	kfree(net->ipv6.fib_table_hash);
 	kfree(net->ipv6.rt6_stats);
-- 
cgit v1.2.3


From 8b96d22d7a6ec999ae53ae86d829137503ceda65 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 11 Jun 2012 02:01:56 -0700
Subject: inet: Use FIB table peer roots in routes.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c |  8 ++++++--
 net/ipv6/route.c | 14 ++++++++------
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 456a9470fb54..4c33ce3000ed 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2125,7 +2125,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, dev_net(rth->dst.dev)->ipv4.peers);
+	rt_init_peer(rth, &res->table->tb_peers);
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -2512,7 +2512,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_gateway = fl4->daddr;
 	rth->rt_spec_dst= fl4->saddr;
 	rth->rt_peer_genid = 0;
-	rt_init_peer(rth, dev_net(dev_out)->ipv4.peers);
+	rt_init_peer(rth, (res->table ?
+			   &res->table->tb_peers :
+			   dev_net(dev_out)->ipv4.peers));
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2561,6 +2563,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 	int orig_oif;
 
 	res.fi		= NULL;
+	res.table	= NULL;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	res.r		= NULL;
 #endif
@@ -2666,6 +2669,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 
 	if (fib_lookup(net, fl4, &res)) {
 		res.fi = NULL;
+		res.table = NULL;
 		if (fl4->flowi4_oif) {
 			/* Apparently, routing tables are wrong. Assume,
 			   that the destination is on link.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 17a9b8687f29..d9ba4808f26a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -260,7 +260,8 @@ static struct rt6_info ip6_blk_hole_entry_template = {
 /* allocate dst with ip6_dst_ops */
 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 					     struct net_device *dev,
-					     int flags)
+					     int flags,
+					     struct fib6_table *table)
 {
 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 					0, 0, flags);
@@ -268,7 +269,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 	if (rt) {
 		memset(&rt->rt6i_table, 0,
 		       sizeof(*rt) - sizeof(struct dst_entry));
-		rt6_init_peer(rt, net->ipv6.peers);
+		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
 	}
 	return rt;
 }
@@ -1114,7 +1115,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	if (unlikely(!idev))
 		return ERR_PTR(-ENODEV);
 
-	rt = ip6_dst_alloc(net, dev, 0);
+	rt = ip6_dst_alloc(net, dev, 0, NULL);
 	if (unlikely(!rt)) {
 		in6_dev_put(idev);
 		dst = ERR_PTR(-ENOMEM);
@@ -1296,7 +1297,7 @@ int ip6_route_add(struct fib6_config *cfg)
 	if (!table)
 		goto out;
 
-	rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT);
+	rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
 
 	if (!rt) {
 		err = -ENOMEM;
@@ -1818,7 +1819,8 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
 				    const struct in6_addr *dest)
 {
 	struct net *net = dev_net(ort->dst.dev);
-	struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0);
+	struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
+					    ort->rt6i_table);
 
 	if (rt) {
 		rt->dst.input = ort->dst.input;
@@ -2102,7 +2104,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 				    bool anycast)
 {
 	struct net *net = dev_net(idev->dev);
-	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0);
+	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
 	int err;
 
 	if (!rt) {
-- 
cgit v1.2.3


From 7b34ca2ac7063f4ebf07f85fd75253ed84d5c648 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 11 Jun 2012 04:13:57 -0700
Subject: inet: Avoid potential NULL peer dereference.

We handle NULL in rt{,6}_set_peer but then our caller will try to pass
that NULL pointer into inet_putpeer() which isn't ready for it.

Fix this by moving the NULL check one level up, and then remove the
now unnecessary NULL check from inetpeer_ptr_set_peer().

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h |  2 +-
 net/ipv4/route.c       | 11 ++++++-----
 net/ipv6/route.c       | 10 ++++++----
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index e15c0862a686..c27c8f10ebdc 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -104,7 +104,7 @@ static inline bool inetpeer_ptr_set_peer(unsigned long *ptr, struct inet_peer *p
 	unsigned long val = (unsigned long) peer;
 	unsigned long orig = *ptr;
 
-	if (!(orig & INETPEER_BASE_BIT) || !val ||
+	if (!(orig & INETPEER_BASE_BIT) ||
 	    cmpxchg(ptr, orig, val) != orig)
 		return false;
 	return true;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c33ce3000ed..842510d50453 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1333,11 +1333,12 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 		return;
 
 	peer = inet_getpeer_v4(base, daddr, create);
-
-	if (!rt_set_peer(rt, peer))
-		inet_putpeer(peer);
-	else
-		rt->rt_peer_genid = rt_peer_genid();
+	if (peer) {
+		if (!rt_set_peer(rt, peer))
+			inet_putpeer(peer);
+		else
+			rt->rt_peer_genid = rt_peer_genid();
+	}
 }
 
 /*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d9ba4808f26a..58a3ec23da2f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -313,10 +313,12 @@ void rt6_bind_peer(struct rt6_info *rt, int create)
 		return;
 
 	peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
-	if (!rt6_set_peer(rt, peer))
-		inet_putpeer(peer);
-	else
-		rt->rt6i_peer_genid = rt6_peer_genid();
+	if (peer) {
+		if (!rt6_set_peer(rt, peer))
+			inet_putpeer(peer);
+		else
+			rt->rt6i_peer_genid = rt6_peer_genid();
+	}
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
-- 
cgit v1.2.3


From d0daebc3d622f95db181601cb0c4a0781f74f758 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 12 Jun 2012 00:44:01 +0000
Subject: ipv4: Add interface option to enable routing of 127.0.0.0/8

Routing of 127/8 is tradtionally forbidden, we consider
packets from that address block martian when routing and do
not process corresponding ARP requests.

This is a sane default but renders a huge address space
practically unuseable.

The RFC states that no address within the 127/8 block should
ever appear on any network anywhere but it does not forbid
the use of such addresses outside of the loopback device in
particular. For example to address a pool of virtual guests
behind a load balancer.

This patch adds a new interface option 'route_localnet'
enabling routing of the 127/8 address block and processing
of ARP requests on a specific interface.

Note that for the feature to work, the default local route
covering 127/8 dev lo needs to be removed.

Example:
  $ sysctl -w net.ipv4.conf.eth0.route_localnet=1
  $ ip route del 127.0.0.0/8 dev lo table local
  $ ip addr add 127.1.0.1/16 dev eth0
  $ ip route flush cache

V2: Fix invalid check to auto flush cache (thanks davem)

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  5 +++++
 include/linux/inetdevice.h             |  2 ++
 net/ipv4/arp.c                         |  3 ++-
 net/ipv4/devinet.c                     |  5 ++++-
 net/ipv4/route.c                       | 30 +++++++++++++++++++++---------
 5 files changed, 34 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6f896b94abdc..99d0e0504d6e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -862,6 +862,11 @@ accept_local - BOOLEAN
 	local interfaces over the wire and have them accepted properly.
 	default FALSE
 
+route_localnet - BOOLEAN
+	Do not consider loopback addresses as martian source or destination
+	while routing. This enables the use of 127/8 for local routing purposes.
+	default FALSE
+
 rp_filter - INTEGER
 	0 - No source validation.
 	1 - Strict mode as defined in RFC3704 Strict Reverse Path
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 597f4a9f3240..67f9ddacb70c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -38,6 +38,7 @@ enum
 	IPV4_DEVCONF_ACCEPT_LOCAL,
 	IPV4_DEVCONF_SRC_VMARK,
 	IPV4_DEVCONF_PROXY_ARP_PVLAN,
+	IPV4_DEVCONF_ROUTE_LOCALNET,
 	__IPV4_DEVCONF_MAX
 };
 
@@ -131,6 +132,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \
 					IN_DEV_ORCONF((in_dev), \
 						      PROMOTE_SECONDARIES)
+#define IN_DEV_ROUTE_LOCALNET(in_dev)	IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET)
 
 #define IN_DEV_RX_REDIRECTS(in_dev) \
 	((IN_DEV_FORWARD(in_dev) && \
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..2e560f0c757d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -790,7 +790,8 @@ static int arp_process(struct sk_buff *skb)
  *	Check for bad requests for 127.x.x.x and requests for multicast
  *	addresses.  If this is one such, delete it.
  */
-	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+	if (ipv4_is_multicast(tip) ||
+	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;
 
 /*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 
 		if (cnf == net->ipv4.devconf_dflt)
 			devinet_copy_dflt_conf(net, i);
-		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
+		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
 			if ((new_value == 0) && (old_value != 0))
 				rt_cache_flush(net, 0);
 	}
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table {
 					      "force_igmp_version"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
 					      "promote_secondaries"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+					      "route_localnet"),
 	},
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 842510d50453..655506af47ca 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1960,9 +1960,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		return -EINVAL;
 
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
-	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
+	    skb->protocol != htons(ETH_P_IP))
 		goto e_inval;
 
+	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+		if (ipv4_is_loopback(saddr))
+			goto e_inval;
+
 	if (ipv4_is_zeronet(saddr)) {
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
@@ -2203,8 +2207,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
-	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
-	    ipv4_is_loopback(saddr))
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
 		goto martian_source;
 
 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
@@ -2216,9 +2219,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_zeronet(saddr))
 		goto martian_source;
 
-	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
+	if (ipv4_is_zeronet(daddr))
 		goto martian_destination;
 
+	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
+		if (ipv4_is_loopback(daddr))
+			goto martian_destination;
+
+		if (ipv4_is_loopback(saddr))
+			goto martian_source;
+	}
+
 	/*
 	 *	Now we are ready to route packet.
 	 */
@@ -2457,9 +2468,14 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	u16 type = res->type;
 	struct rtable *rth;
 
-	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+	in_dev = __in_dev_get_rcu(dev_out);
+	if (!in_dev)
 		return ERR_PTR(-EINVAL);
 
+	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+			return ERR_PTR(-EINVAL);
+
 	if (ipv4_is_lbcast(fl4->daddr))
 		type = RTN_BROADCAST;
 	else if (ipv4_is_multicast(fl4->daddr))
@@ -2470,10 +2486,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
-	in_dev = __in_dev_get_rcu(dev_out);
-	if (!in_dev)
-		return ERR_PTR(-EINVAL);
-
 	if (type == RTN_BROADCAST) {
 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
 		fi = NULL;
-- 
cgit v1.2.3


From 95603e2293de556de7e82221649bfd7fd98b64a3 Mon Sep 17 00:00:00 2001
From: Michel Machado <michel@digirati.com.br>
Date: Tue, 12 Jun 2012 10:16:35 +0000
Subject: net-next: add dev_loopback_xmit() to avoid duplicate code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dev_loopback_xmit() in order to deduplicate functions
ip_dev_loopback_xmit() (in net/ipv4/ip_output.c) and
ip6_dev_loopback_xmit() (in net/ipv6/ip6_output.c).

I was about to reinvent the wheel when I noticed that
ip_dev_loopback_xmit() and ip6_dev_loopback_xmit() do exactly what I
need and are not IP-only functions, but they were not available to reuse
elsewhere.

ip6_dev_loopback_xmit() does not have line "skb_dst_force(skb);", but I
understand that this is harmless, and should be in dev_loopback_xmit().

Signed-off-by: Michel Machado <michel@digirati.com.br>
CC: "David S. Miller" <davem@davemloft.net>
CC: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
CC: James Morris <jmorris@namei.org>
CC: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jpirko@redhat.com>
CC: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
CC: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 17 +++++++++++++++++
 net/ipv4/ip_output.c      | 17 ++---------------
 net/ipv6/ip6_output.c     | 15 +--------------
 4 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a0b84e3b087c..2c2ecea28a1b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1627,6 +1627,7 @@ extern int		dev_alloc_name(struct net_device *dev, const char *name);
 extern int		dev_open(struct net_device *dev);
 extern int		dev_close(struct net_device *dev);
 extern void		dev_disable_lro(struct net_device *dev);
+extern int		dev_loopback_xmit(struct sk_buff *newskb);
 extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
 extern void		unregister_netdevice_queue(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index cd0981977f5c..c6e29ea65bd9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2475,6 +2475,23 @@ static void skb_update_prio(struct sk_buff *skb)
 static DEFINE_PER_CPU(int, xmit_recursion);
 #define RECURSION_LIMIT 10
 
+/**
+ *	dev_loopback_xmit - loop back @skb
+ *	@skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct sk_buff *skb)
+{
+	skb_reset_mac_header(skb);
+	__skb_pull(skb, skb_network_offset(skb));
+	skb->pkt_type = PACKET_LOOPBACK;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	WARN_ON(!skb_dst(skb));
+	skb_dst_force(skb);
+	netif_rx_ni(skb);
+	return 0;
+}
+EXPORT_SYMBOL(dev_loopback_xmit);
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b99ca4e154b9..0f3185a662c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
-	skb_reset_mac_header(newskb);
-	__skb_pull(newskb, skb_network_offset(newskb));
-	newskb->pkt_type = PACKET_LOOPBACK;
-	newskb->ip_summed = CHECKSUM_UNNECESSARY;
-	WARN_ON(!skb_dst(newskb));
-	skb_dst_force(newskb);
-	netif_rx_ni(newskb);
-	return 0;
-}
-
 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 {
 	int ttl = inet->uc_ttl;
@@ -281,7 +268,7 @@ int ip_mc_output(struct sk_buff *skb)
 			if (newskb)
 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 					newskb, NULL, newskb->dev,
-					ip_dev_loopback_xmit);
+					dev_loopback_xmit);
 		}
 
 		/* Multicasts with ttl 0 must not go beyond the host */
@@ -296,7 +283,7 @@ int ip_mc_output(struct sk_buff *skb)
 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 		if (newskb)
 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
-				NULL, newskb->dev, ip_dev_loopback_xmit);
+				NULL, newskb->dev, dev_loopback_xmit);
 	}
 
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 62fcf3e48aca..ee1bb450bfe4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -83,19 +83,6 @@ int ip6_local_out(struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip6_local_out);
 
-/* dev_loopback_xmit for use with netfilter. */
-static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
-{
-	skb_reset_mac_header(newskb);
-	__skb_pull(newskb, skb_network_offset(newskb));
-	newskb->pkt_type = PACKET_LOOPBACK;
-	newskb->ip_summed = CHECKSUM_UNNECESSARY;
-	WARN_ON(!skb_dst(newskb));
-
-	netif_rx_ni(newskb);
-	return 0;
-}
-
 static int ip6_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -121,7 +108,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
 			if (newskb)
 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 					newskb, NULL, newskb->dev,
-					ip6_dev_loopback_xmit);
+					dev_loopback_xmit);
 
 			if (ipv6_hdr(skb)->hop_limit == 0) {
 				IP6_INC_STATS(dev_net(dev), idev,
-- 
cgit v1.2.3


From 36393395536064e483b73d173f6afc103eadfbc4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 14 Jun 2012 22:21:46 -0700
Subject: ipv4: Handle PMTU in all ICMP error handlers.

With ip_rt_frag_needed() removed, we have to explicitly update PMTU
information in every ICMP error handler.

Create two helper functions to facilitate this.

1) ipv4_sk_update_pmtu()

   This updates the PMTU when we have a socket context to
   work with.

2) ipv4_update_pmtu()

   Raw version, used when no socket context is available.  For this
   interface, we essentially just pass in explicit arguments for
   the flow identity information we would have extracted from the
   socket.

   And you'll notice that ipv4_sk_update_pmtu() is simply implemented
   in terms of ipv4_update_pmtu()

Note that __ip_route_output_key() is used, rather than something like
ip_route_output_flow() or ip_route_output_key().  This is because we
absolutely do not want to end up with a route that does IPSEC
encapsulation and the like.  Instead, we only want the route that
would get us to the node described by the outermost IP header.

Reported-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  5 ++++-
 net/ipv4/ah4.c      |  1 +
 net/ipv4/esp4.c     |  1 +
 net/ipv4/ip_gre.c   | 14 ++++++++++----
 net/ipv4/ipcomp.c   |  1 +
 net/ipv4/ipip.c     | 15 +++++++++++----
 net/ipv4/ping.c     |  1 +
 net/ipv4/raw.c      |  3 +++
 net/ipv4/route.c    | 28 ++++++++++++++++++++++++++++
 net/ipv4/udp.c      |  1 +
 net/ipv6/sit.c      | 15 +++++++++++----
 11 files changed, 72 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index a36ae429ed5d..47eb25ac1f7f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -215,7 +215,10 @@ static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 s
 	return ip_route_input_common(skb, dst, src, tos, devin, true);
 }
 
-extern void		ip_rt_send_redirect(struct sk_buff *skb);
+extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+			     int oif, u32 mark, u8 protocol, int flow_flags);
+extern void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
+extern void ip_rt_send_redirect(struct sk_buff *skb);
 
 extern unsigned int		inet_addr_type(struct net *net, __be32 addr);
 extern unsigned int		inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..916d5ecaf6c6 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -408,6 +408,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 		return;
 	pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
 		 ntohl(ah->spi), ntohl(iph->daddr));
+	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..7b95b49a36ce 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -494,6 +494,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
 		 ntohl(esph->spi), ntohl(iph->daddr));
+	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f49047b79609..594cec35ac4d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
 			return;
-		case ICMP_FRAG_NEEDED:
-			/* Soft state for pmtu is maintained by IP core. */
-			return;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -538,7 +535,16 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 				flags & GRE_KEY ?
 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 				p[1]);
-	if (t == NULL || t->parms.iph.daddr == 0 ||
+	if (t == NULL)
+		goto out;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 t->parms.link, 0, IPPROTO_GRE, 0);
+		goto out;
+	}
+
+	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
 		goto out;
 
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 63b64c45a826..b91375482d84 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -42,6 +42,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
 		 spi, &iph->daddr);
+	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2d0f99bf61b3..715338a1b205 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
 			return 0;
-		case ICMP_FRAG_NEEDED:
-			/* Soft state for pmtu is maintained by IP core. */
-			return 0;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -369,7 +366,17 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 
 	rcu_read_lock();
 	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
-	if (t == NULL || t->parms.iph.daddr == 0)
+	if (t == NULL)
+		goto out;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 t->dev->ifindex, 0, IPPROTO_IPIP, 0);
+		err = 0;
+		goto out;
+	}
+
+	if (t->parms.iph.daddr == 0)
 		goto out;
 
 	err = 0;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2c00e8bf684d..340fcf29a966 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info)
 		break;
 	case ICMP_DEST_UNREACH:
 		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			ipv4_sk_update_pmtu(skb, sk, info);
 			if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b818f3e4..659ddfb10947 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -216,6 +216,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 	int err = 0;
 	int harderr = 0;
 
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		ipv4_sk_update_pmtu(skb, sk, info);
+
 	/* Report error on raw socket, if:
 	   1. User requested ip_recverr.
 	   2. Socket is connected (otherwise the error indication
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 655506af47ca..41df5297a412 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1711,6 +1711,34 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 	}
 }
 
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+		      int oif, u32 mark, u8 protocol, int flow_flags)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
+			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
+			   iph->daddr, iph->saddr, 0, 0);
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt)) {
+		ip_rt_update_pmtu(&rt->dst, mtu);
+		ip_rt_put(rt);
+	}
+}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
+
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
+				sk->sk_bound_dev_if, sk->sk_mark,
+				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+				inet_sk_flowi_flags(sk));
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
 static void ipv4_validate_peer(struct rtable *rt)
 {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eaca73644e79..db017efb76ea 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -615,6 +615,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 		break;
 	case ICMP_DEST_UNREACH:
 		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			ipv4_sk_update_pmtu(skb, sk, info);
 			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
 				err = EMSGSIZE;
 				harderr = 1;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 60415711563f..49aea94c9be3 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -527,9 +527,6 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
 			return 0;
-		case ICMP_FRAG_NEEDED:
-			/* Soft state for pmtu is maintained by IP core. */
-			return 0;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -551,7 +548,17 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 				skb->dev,
 				iph->daddr,
 				iph->saddr);
-	if (t == NULL || t->parms.iph.daddr == 0)
+	if (t == NULL)
+		goto out;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 t->dev->ifindex, 0, IPPROTO_IPV6, 0);
+		err = 0;
+		goto out;
+	}
+
+	if (t->parms.iph.daddr == 0)
 		goto out;
 
 	err = 0;
-- 
cgit v1.2.3


From 1afc56794e03229fa53cfa3c5012704d226e1dec Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 7 Jun 2012 12:11:50 +0200
Subject: netfilter: nf_ct_helper: implement variable length helper private
 data

This patch uses the new variable length conntrack extensions.

Instead of using union nf_conntrack_help that contain all the
helper private data information, we allocate variable length
area to store the private helper data.

This patch includes the modification of all existing helpers.
It also includes a couple of include header to avoid compilation
warnings.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_sip.h  |  2 ++
 include/net/netfilter/nf_conntrack.h        | 35 ++---------------------
 include/net/netfilter/nf_conntrack_helper.h | 15 +++++++++-
 net/ipv4/netfilter/nf_nat_amanda.c          |  4 +--
 net/ipv4/netfilter/nf_nat_h323.c            |  8 +++---
 net/ipv4/netfilter/nf_nat_pptp.c            |  6 ++--
 net/ipv4/netfilter/nf_nat_tftp.c            |  4 +--
 net/netfilter/nf_conntrack_core.c           |  3 +-
 net/netfilter/nf_conntrack_ftp.c            |  3 +-
 net/netfilter/nf_conntrack_h323_main.c      | 16 +++++++----
 net/netfilter/nf_conntrack_helper.c         | 11 +++++---
 net/netfilter/nf_conntrack_netlink.c        |  4 +--
 net/netfilter/nf_conntrack_pptp.c           | 17 +++++------
 net/netfilter/nf_conntrack_proto_gre.c      | 16 +++++------
 net/netfilter/nf_conntrack_sane.c           |  4 +--
 net/netfilter/nf_conntrack_sip.c            | 25 ++++++++--------
 net/netfilter/xt_CT.c                       | 44 +++++++++++++++++------------
 17 files changed, 111 insertions(+), 106 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h
index 0ce91d56a5f2..0dfc8b7210a3 100644
--- a/include/linux/netfilter/nf_conntrack_sip.h
+++ b/include/linux/netfilter/nf_conntrack_sip.h
@@ -2,6 +2,8 @@
 #define __NF_CONNTRACK_SIP_H__
 #ifdef __KERNEL__
 
+#include <net/netfilter/nf_conntrack_expect.h>
+
 #define SIP_PORT	5060
 #define SIP_TIMEOUT	3600
 
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index cce7f6a798bf..f1494feba79f 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -39,36 +39,6 @@ union nf_conntrack_expect_proto {
 	/* insert expect proto private data here */
 };
 
-/* Add protocol helper include file here */
-#include <linux/netfilter/nf_conntrack_ftp.h>
-#include <linux/netfilter/nf_conntrack_pptp.h>
-#include <linux/netfilter/nf_conntrack_h323.h>
-#include <linux/netfilter/nf_conntrack_sane.h>
-#include <linux/netfilter/nf_conntrack_sip.h>
-
-/* per conntrack: application helper private data */
-union nf_conntrack_help {
-	/* insert conntrack helper private data (master) here */
-#if defined(CONFIG_NF_CONNTRACK_FTP) || defined(CONFIG_NF_CONNTRACK_FTP_MODULE)
-	struct nf_ct_ftp_master ct_ftp_info;
-#endif
-#if defined(CONFIG_NF_CONNTRACK_PPTP) || \
-    defined(CONFIG_NF_CONNTRACK_PPTP_MODULE)
-	struct nf_ct_pptp_master ct_pptp_info;
-#endif
-#if defined(CONFIG_NF_CONNTRACK_H323) || \
-    defined(CONFIG_NF_CONNTRACK_H323_MODULE)
-	struct nf_ct_h323_master ct_h323_info;
-#endif
-#if defined(CONFIG_NF_CONNTRACK_SANE) || \
-    defined(CONFIG_NF_CONNTRACK_SANE_MODULE)
-	struct nf_ct_sane_master ct_sane_info;
-#endif
-#if defined(CONFIG_NF_CONNTRACK_SIP) || defined(CONFIG_NF_CONNTRACK_SIP_MODULE)
-	struct nf_ct_sip_master ct_sip_info;
-#endif
-};
-
 #include <linux/types.h>
 #include <linux/skbuff.h>
 #include <linux/timer.h>
@@ -89,12 +59,13 @@ struct nf_conn_help {
 	/* Helper. if any */
 	struct nf_conntrack_helper __rcu *helper;
 
-	union nf_conntrack_help help;
-
 	struct hlist_head expectations;
 
 	/* Current number of expected connections */
 	u8 expecting[NF_CT_MAX_EXPECT_CLASSES];
+
+	/* private helper information. */
+	char data[];
 };
 
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 5f5a4d9d4df5..061352f71a84 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -11,6 +11,7 @@
 #define _NF_CONNTRACK_HELPER_H
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_expect.h>
 
 struct module;
 
@@ -23,6 +24,9 @@ struct nf_conntrack_helper {
 	struct module *me;		/* pointer to self */
 	const struct nf_conntrack_expect_policy *expect_policy;
 
+	/* length of internal data, ie. sizeof(struct nf_ct_*_master) */
+	size_t data_len;
+
 	/* Tuple of things we will help (compared against server response) */
 	struct nf_conntrack_tuple tuple;
 
@@ -48,7 +52,7 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum);
 extern int nf_conntrack_helper_register(struct nf_conntrack_helper *);
 extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *);
 
-extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
+extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, struct nf_conntrack_helper *helper, gfp_t gfp);
 
 extern int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 				     gfp_t flags);
@@ -60,6 +64,15 @@ static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
 	return nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
 }
 
+static inline void *nfct_help_data(const struct nf_conn *ct)
+{
+	struct nf_conn_help *help;
+
+	help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+
+	return (void *)help->data;
+}
+
 extern int nf_conntrack_helper_init(struct net *net);
 extern void nf_conntrack_helper_fini(struct net *net);
 
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 7b22382ff0e9..3c04d24e2976 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -13,10 +13,10 @@
 #include <linux/skbuff.h>
 #include <linux/udp.h>
 
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
 #include <linux/netfilter/nf_conntrack_amanda.h>
 
 MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index cad29c121318..c6784a18c1c4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
 			unsigned char **data,
 			TransportAddress *taddr, int count)
 {
-	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	const struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	int i;
 	__be16 port;
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 			struct nf_conntrack_expect *rtp_exp,
 			struct nf_conntrack_expect *rtcp_exp)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	int i;
 	u_int16_t nated_port;
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 		    TransportAddress *taddr, __be16 port,
 		    struct nf_conntrack_expect *exp)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	u_int16_t nated_port = ntohs(port);
 
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 		    unsigned char **data, TransportAddress *taddr, int idx,
 		    __be16 port, struct nf_conntrack_expect *exp)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	u_int16_t nated_port = ntohs(port);
 	union nf_inet_addr addr;
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index c273d58980ae..388140881ebe 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	const struct nf_nat_pptp *nat_pptp_info;
 	struct nf_nat_ipv4_range range;
 
-	ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
+	ct_pptp_info = nfct_help_data(master);
 	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
 
 	/* And here goes the grand finale of corrosion... */
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
 	__be16 new_callid;
 	unsigned int cid_off;
 
-	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info;
+	ct_pptp_info = nfct_help_data(ct);
 	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
 
 	new_callid = ct_pptp_info->pns_call_id;
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
 	struct nf_ct_pptp_master *ct_pptp_info;
 	struct nf_nat_pptp *nat_pptp_info;
 
-	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info;
+	ct_pptp_info = nfct_help_data(ct);
 	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
 
 	/* save original PAC call ID in nat_info */
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index a2901bf829c0..9dbb8d284f99 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -8,10 +8,10 @@
 #include <linux/module.h>
 #include <linux/udp.h>
 
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
 #include <linux/netfilter/nf_conntrack_tftp.h>
 
 MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1ee2082b81b5..cf4875565d67 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -819,7 +819,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		__set_bit(IPS_EXPECTED_BIT, &ct->status);
 		ct->master = exp->master;
 		if (exp->helper) {
-			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+			help = nf_ct_helper_ext_add(ct, exp->helper,
+						    GFP_ATOMIC);
 			if (help)
 				rcu_assign_pointer(help->helper, exp->helper);
 		}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 44e47c9e14fb..4bb771d1f57a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -358,7 +358,7 @@ static int help(struct sk_buff *skb,
 	u32 seq;
 	int dir = CTINFO2DIR(ctinfo);
 	unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
-	struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info;
+	struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
 	struct nf_conntrack_expect *exp;
 	union nf_inet_addr *daddr;
 	struct nf_conntrack_man cmd = {};
@@ -554,6 +554,7 @@ static int __init nf_conntrack_ftp_init(void)
 		ftp[i][0].tuple.src.l3num = PF_INET;
 		ftp[i][1].tuple.src.l3num = PF_INET6;
 		for (j = 0; j < 2; j++) {
+			ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
 			ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
 			ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
 			ftp[i][j].expect_policy = &ftp_exp_policy;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 46d69d7f1bb4..ed2199280527 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -114,7 +114,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
 			 struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			 unsigned char **data, int *datalen, int *dataoff)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	const struct tcphdr *th;
 	struct tcphdr _tcph;
@@ -618,6 +618,7 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = {
 static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
 	.name			= "H.245",
 	.me			= THIS_MODULE,
+	.data_len		= sizeof(struct nf_ct_h323_master),
 	.tuple.src.l3num	= AF_UNSPEC,
 	.tuple.dst.protonum	= IPPROTO_UDP,
 	.help			= h245_help,
@@ -1170,6 +1171,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
 	{
 		.name			= "Q.931",
 		.me			= THIS_MODULE,
+		.data_len		= sizeof(struct nf_ct_h323_master),
 		.tuple.src.l3num	= AF_INET,
 		.tuple.src.u.tcp.port	= cpu_to_be16(Q931_PORT),
 		.tuple.dst.protonum	= IPPROTO_TCP,
@@ -1245,7 +1247,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
 		       unsigned char **data,
 		       TransportAddress *taddr, int count)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	int ret = 0;
 	int i;
@@ -1360,7 +1362,7 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned char **data, RegistrationRequest *rrq)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int ret;
 	typeof(set_ras_addr_hook) set_ras_addr;
 
@@ -1395,7 +1397,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned char **data, RegistrationConfirm *rcf)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
 	struct nf_conntrack_expect *exp;
@@ -1444,7 +1446,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned char **data, UnregistrationRequest *urq)
 {
-	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	int ret;
 	typeof(set_sig_addr_hook) set_sig_addr;
@@ -1476,7 +1478,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
 		       enum ip_conntrack_info ctinfo,
 		       unsigned char **data, AdmissionRequest *arq)
 {
-	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	const struct nf_ct_h323_master *info = nfct_help_data(ct);
 	int dir = CTINFO2DIR(ctinfo);
 	__be16 port;
 	union nf_inet_addr addr;
@@ -1743,6 +1745,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
 	{
 		.name			= "RAS",
 		.me			= THIS_MODULE,
+		.data_len		= sizeof(struct nf_ct_h323_master),
 		.tuple.src.l3num	= AF_INET,
 		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),
 		.tuple.dst.protonum	= IPPROTO_UDP,
@@ -1752,6 +1755,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
 	{
 		.name			= "RAS",
 		.me			= THIS_MODULE,
+		.data_len		= sizeof(struct nf_ct_h323_master),
 		.tuple.src.l3num	= AF_INET6,
 		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),
 		.tuple.dst.protonum	= IPPROTO_UDP,
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 4fa2ff961f5a..9c18ecb0ab81 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -161,11 +161,14 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
 
-struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
+struct nf_conn_help *
+nf_ct_helper_ext_add(struct nf_conn *ct,
+		     struct nf_conntrack_helper *helper, gfp_t gfp)
 {
 	struct nf_conn_help *help;
 
-	help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp);
+	help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER,
+				    helper->data_len, gfp);
 	if (help)
 		INIT_HLIST_HEAD(&help->expectations);
 	else
@@ -218,13 +221,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	}
 
 	if (help == NULL) {
-		help = nf_ct_helper_ext_add(ct, flags);
+		help = nf_ct_helper_ext_add(ct, helper, flags);
 		if (help == NULL) {
 			ret = -ENOMEM;
 			goto out;
 		}
 	} else {
-		memset(&help->help, 0, sizeof(help->help));
+		memset(help->data, 0, helper->data_len);
 	}
 
 	rcu_assign_pointer(help->helper, helper);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6f4b00a8fc73..a08892048b46 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1218,7 +1218,7 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
 		if (help->helper)
 			return -EBUSY;
 		/* need to zero data of old helper */
-		memset(&help->help, 0, sizeof(help->help));
+		memset(help->data, 0, help->helper->data_len);
 	} else {
 		/* we cannot set a helper for an existing conntrack */
 		return -EOPNOTSUPP;
@@ -1440,7 +1440,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 		} else {
 			struct nf_conn_help *help;
 
-			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+			help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
 			if (help == NULL) {
 				err = -ENOMEM;
 				goto err2;
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 31d56b23b9e9..6fed9ec35248 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -174,7 +174,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
 static void pptp_destroy_siblings(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
-	const struct nf_conn_help *help = nfct_help(ct);
+	const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
 	struct nf_conntrack_tuple t;
 
 	nf_ct_gre_keymap_destroy(ct);
@@ -182,16 +182,16 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 	/* try original (pns->pac) tuple */
 	memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
 	t.dst.protonum = IPPROTO_GRE;
-	t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id;
-	t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id;
+	t.src.u.gre.key = ct_pptp_info->pns_call_id;
+	t.dst.u.gre.key = ct_pptp_info->pac_call_id;
 	if (!destroy_sibling_or_exp(net, ct, &t))
 		pr_debug("failed to timeout original pns->pac ct/exp\n");
 
 	/* try reply (pac->pns) tuple */
 	memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
 	t.dst.protonum = IPPROTO_GRE;
-	t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id;
-	t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id;
+	t.src.u.gre.key = ct_pptp_info->pac_call_id;
+	t.dst.u.gre.key = ct_pptp_info->pns_call_id;
 	if (!destroy_sibling_or_exp(net, ct, &t))
 		pr_debug("failed to timeout reply pac->pns ct/exp\n");
 }
@@ -269,7 +269,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
 		 struct nf_conn *ct,
 		 enum ip_conntrack_info ctinfo)
 {
-	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	struct nf_ct_pptp_master *info = nfct_help_data(ct);
 	u_int16_t msg;
 	__be16 cid = 0, pcid = 0;
 	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
@@ -396,7 +396,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
 		  struct nf_conn *ct,
 		  enum ip_conntrack_info ctinfo)
 {
-	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	struct nf_ct_pptp_master *info = nfct_help_data(ct);
 	u_int16_t msg;
 	__be16 cid = 0, pcid = 0;
 	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
@@ -506,7 +506,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
 
 {
 	int dir = CTINFO2DIR(ctinfo);
-	const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	const struct nf_ct_pptp_master *info = nfct_help_data(ct);
 	const struct tcphdr *tcph;
 	struct tcphdr _tcph;
 	const struct pptp_pkt_hdr *pptph;
@@ -592,6 +592,7 @@ static const struct nf_conntrack_expect_policy pptp_exp_policy = {
 static struct nf_conntrack_helper pptp __read_mostly = {
 	.name			= "pptp",
 	.me			= THIS_MODULE,
+	.data_len		= sizeof(struct nf_ct_pptp_master),
 	.tuple.src.l3num	= AF_INET,
 	.tuple.src.u.tcp.port	= cpu_to_be16(PPTP_CONTROL_PORT),
 	.tuple.dst.protonum	= IPPROTO_TCP,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 25ba5a2f5edc..5cac41c2fa09 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -117,10 +117,10 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
 {
 	struct net *net = nf_ct_net(ct);
 	struct netns_proto_gre *net_gre = gre_pernet(net);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
 	struct nf_ct_gre_keymap **kmp, *km;
 
-	kmp = &help->help.ct_pptp_info.keymap[dir];
+	kmp = &ct_pptp_info->keymap[dir];
 	if (*kmp) {
 		/* check whether it's a retransmission */
 		read_lock_bh(&net_gre->keymap_lock);
@@ -158,19 +158,19 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	struct netns_proto_gre *net_gre = gre_pernet(net);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
 	enum ip_conntrack_dir dir;
 
 	pr_debug("entering for ct %p\n", ct);
 
 	write_lock_bh(&net_gre->keymap_lock);
 	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
-		if (help->help.ct_pptp_info.keymap[dir]) {
+		if (ct_pptp_info->keymap[dir]) {
 			pr_debug("removing %p from list\n",
-				 help->help.ct_pptp_info.keymap[dir]);
-			list_del(&help->help.ct_pptp_info.keymap[dir]->list);
-			kfree(help->help.ct_pptp_info.keymap[dir]);
-			help->help.ct_pptp_info.keymap[dir] = NULL;
+				 ct_pptp_info->keymap[dir]);
+			list_del(&ct_pptp_info->keymap[dir]->list);
+			kfree(ct_pptp_info->keymap[dir]);
+			ct_pptp_info->keymap[dir] = NULL;
 		}
 	}
 	write_unlock_bh(&net_gre->keymap_lock);
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index ec3fc18c4ef6..295429f39088 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -69,13 +69,12 @@ static int help(struct sk_buff *skb,
 	void *sb_ptr;
 	int ret = NF_ACCEPT;
 	int dir = CTINFO2DIR(ctinfo);
-	struct nf_ct_sane_master *ct_sane_info;
+	struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple *tuple;
 	struct sane_request *req;
 	struct sane_reply_net_start *reply;
 
-	ct_sane_info = &nfct_help(ct)->help.ct_sane_info;
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED &&
 	    ctinfo != IP_CT_ESTABLISHED_REPLY)
@@ -203,6 +202,7 @@ static int __init nf_conntrack_sane_init(void)
 		sane[i][0].tuple.src.l3num = PF_INET;
 		sane[i][1].tuple.src.l3num = PF_INET6;
 		for (j = 0; j < 2; j++) {
+			sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
 			sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
 			sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
 			sane[i][j].expect_policy = &sane_exp_policy;
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index dfd3ff382243..758a1bacc126 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1075,12 +1075,12 @@ static int process_invite_response(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, dataoff, dptr, datalen, cseq);
-	else if (help->help.ct_sip_info.invite_cseq == cseq)
+	else if (ct_sip_info->invite_cseq == cseq)
 		flush_expectations(ct, true);
 	return NF_ACCEPT;
 }
@@ -1091,12 +1091,12 @@ static int process_update_response(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, dataoff, dptr, datalen, cseq);
-	else if (help->help.ct_sip_info.invite_cseq == cseq)
+	else if (ct_sip_info->invite_cseq == cseq)
 		flush_expectations(ct, true);
 	return NF_ACCEPT;
 }
@@ -1107,12 +1107,12 @@ static int process_prack_response(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 
 	if ((code >= 100 && code <= 199) ||
 	    (code >= 200 && code <= 299))
 		return process_sdp(skb, dataoff, dptr, datalen, cseq);
-	else if (help->help.ct_sip_info.invite_cseq == cseq)
+	else if (ct_sip_info->invite_cseq == cseq)
 		flush_expectations(ct, true);
 	return NF_ACCEPT;
 }
@@ -1123,13 +1123,13 @@ static int process_invite_request(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 	unsigned int ret;
 
 	flush_expectations(ct, true);
 	ret = process_sdp(skb, dataoff, dptr, datalen, cseq);
 	if (ret == NF_ACCEPT)
-		help->help.ct_sip_info.invite_cseq = cseq;
+		ct_sip_info->invite_cseq = cseq;
 	return ret;
 }
 
@@ -1154,7 +1154,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	unsigned int matchoff, matchlen;
 	struct nf_conntrack_expect *exp;
@@ -1235,7 +1235,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
 
 store_cseq:
 	if (ret == NF_ACCEPT)
-		help->help.ct_sip_info.register_cseq = cseq;
+		ct_sip_info->register_cseq = cseq;
 	return ret;
 }
 
@@ -1245,7 +1245,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	union nf_inet_addr addr;
 	__be16 port;
@@ -1262,7 +1262,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
 	 * responses, so we store the sequence number of the last valid
 	 * request and compare it here.
 	 */
-	if (help->help.ct_sip_info.register_cseq != cseq)
+	if (ct_sip_info->register_cseq != cseq)
 		return NF_ACCEPT;
 
 	if (code >= 100 && code <= 199)
@@ -1578,6 +1578,7 @@ static int __init nf_conntrack_sip_init(void)
 		sip[i][3].help = sip_help_tcp;
 
 		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+			sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
 			sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
 			sip[i][j].expect_policy = sip_exp_policy;
 			sip[i][j].expect_class_max = SIP_EXPECT_MAX;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index a51de9b052be..116018560c60 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -112,6 +112,8 @@ static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
 		goto err3;
 
 	if (info->helper[0]) {
+		struct nf_conntrack_helper *helper;
+
 		ret = -ENOENT;
 		proto = xt_ct_find_proto(par);
 		if (!proto) {
@@ -120,19 +122,21 @@ static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
 			goto err3;
 		}
 
-		ret = -ENOMEM;
-		help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
-		if (help == NULL)
-			goto err3;
-
 		ret = -ENOENT;
-		help->helper = nf_conntrack_helper_try_module_get(info->helper,
-								  par->family,
-								  proto);
-		if (help->helper == NULL) {
+		helper = nf_conntrack_helper_try_module_get(info->helper,
+							    par->family,
+							    proto);
+		if (helper == NULL) {
 			pr_info("No such helper \"%s\"\n", info->helper);
 			goto err3;
 		}
+
+		ret = -ENOMEM;
+		help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+		if (help == NULL)
+			goto err3;
+
+		help->helper = helper;
 	}
 
 	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
@@ -202,6 +206,8 @@ static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
 		goto err3;
 
 	if (info->helper[0]) {
+		struct nf_conntrack_helper *helper;
+
 		ret = -ENOENT;
 		proto = xt_ct_find_proto(par);
 		if (!proto) {
@@ -210,19 +216,21 @@ static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
 			goto err3;
 		}
 
-		ret = -ENOMEM;
-		help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
-		if (help == NULL)
-			goto err3;
-
 		ret = -ENOENT;
-		help->helper = nf_conntrack_helper_try_module_get(info->helper,
-								  par->family,
-								  proto);
-		if (help->helper == NULL) {
+		helper = nf_conntrack_helper_try_module_get(info->helper,
+							    par->family,
+							    proto);
+		if (helper == NULL) {
 			pr_info("No such helper \"%s\"\n", info->helper);
 			goto err3;
 		}
+
+		ret = -ENOMEM;
+		help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+		if (help == NULL)
+			goto err3;
+
+		help->helper = helper;
 	}
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-- 
cgit v1.2.3


From 8c88f87cb27ad09086940bdd3e6955e5325ec89a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 7 Jun 2012 13:31:25 +0200
Subject: netfilter: nfnetlink_queue: add NAT TCP sequence adjustment if packet
 mangled

User-space programs that receive traffic via NFQUEUE may mangle packets.
If NAT is enabled, this usually puzzles sequence tracking, leading to
traffic disruptions.

With this patch, nfnl_queue will make the corresponding NAT TCP sequence
adjustment if:

1) The packet has been mangled,
2) the NFQA_CFG_F_CONNTRACK flag has been set, and
3) NAT is detected.

There are some records on the Internet complaning about this issue:
http://stackoverflow.com/questions/260757/packet-mangling-utilities-besides-iptables

By now, we only support TCP since we have no helpers for DCCP or SCTP.
Better to add this if we ever have some helper over those layer 4 protocols.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h             |  2 ++
 include/net/netfilter/nf_nat_helper.h |  4 ++++
 net/ipv4/netfilter/nf_nat_helper.c    | 13 +++++++++++++
 net/netfilter/nf_conntrack_netlink.c  |  4 ++++
 net/netfilter/nfnetlink_queue.c       | 19 +++++++++++--------
 5 files changed, 34 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index ba65bfbd7f74..dca19e61b30a 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -401,6 +401,8 @@ struct nfq_ct_hook {
 	size_t (*build_size)(const struct nf_conn *ct);
 	int (*build)(struct sk_buff *skb, struct nf_conn *ct);
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
+	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
+			   u32 ctinfo, int off);
 };
 extern struct nfq_ct_hook *nfq_ct_hook;
 #else
diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h
index 02bb6c29dc3d..7d8fb7b46c44 100644
--- a/include/net/netfilter/nf_nat_helper.h
+++ b/include/net/netfilter/nf_nat_helper.h
@@ -54,4 +54,8 @@ extern void nf_nat_follow_master(struct nf_conn *ct,
 extern s16 nf_nat_get_offset(const struct nf_conn *ct,
 			     enum ip_conntrack_dir dir,
 			     u32 seq);
+
+extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+				  u32 dir, int off);
+
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index af65958f6308..2e59ad0b90ca 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 }
 EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
 
+void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+			   u32 ctinfo, int off)
+{
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP)
+		return;
+
+	th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
+	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
+
 static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
 			int datalen, __sum16 *check, int oldlen)
 {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d304d5917950..8be0ab9b4758 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -46,6 +46,7 @@
 #ifdef CONFIG_NF_NAT_NEEDED
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_helper.h>
 #endif
 
 #include <linux/netfilter/nfnetlink.h>
@@ -1751,6 +1752,9 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build_size	= ctnetlink_nfqueue_build_size,
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
+#ifdef CONFIG_NF_NAT_NEEDED
+	.seq_adjust	= nf_nat_tcp_seq_adjust,
+#endif
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE */
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 647923ae9230..ff82c7933dfd 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -502,12 +502,10 @@ err_out:
 }
 
 static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e)
+nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
 {
 	struct sk_buff *nskb;
-	int diff;
 
-	diff = data_len - e->skb->len;
 	if (diff < 0) {
 		if (pskb_trim(e->skb, data_len))
 			return -ENOMEM;
@@ -767,6 +765,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	unsigned int verdict;
 	struct nf_queue_entry *entry;
 	struct nfq_ct_hook *nfq_ct;
+	enum ip_conntrack_info uninitialized_var(ctinfo);
+	struct nf_conn *ct = NULL;
 
 	queue = instance_lookup(queue_num);
 	if (!queue)
@@ -789,20 +789,23 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	nfq_ct = rcu_dereference(nfq_ct_hook);
 	if (nfq_ct != NULL &&
 	    (queue->flags & NFQA_CFG_F_CONNTRACK) && nfqa[NFQA_CT]) {
-		enum ip_conntrack_info ctinfo;
-		struct nf_conn *ct;
-
 		ct = nf_ct_get(entry->skb, &ctinfo);
 		if (ct && !nf_ct_is_untracked(ct))
 			nfq_ct->parse(nfqa[NFQA_CT], ct);
 	}
-	rcu_read_unlock();
 
 	if (nfqa[NFQA_PAYLOAD]) {
+		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
+		int diff = payload_len - entry->skb->len;
+
 		if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
-				 nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0)
+				 payload_len, entry, diff) < 0)
 			verdict = NF_DROP;
+
+		if (ct && (ct->status & IPS_NAT_MASK) && diff)
+			nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
 	}
+	rcu_read_unlock();
 
 	if (nfqa[NFQA_MARK])
 		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
-- 
cgit v1.2.3


From 12f7a505331e6b2754684b509f2ac8f0011ce644 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 13 May 2012 21:44:54 +0200
Subject: netfilter: add user-space connection tracking helper infrastructure

There are good reasons to supports helpers in user-space instead:

* Rapid connection tracking helper development, as developing code
  in user-space is usually faster.

* Reliability: A buggy helper does not crash the kernel. Moreover,
  we can monitor the helper process and restart it in case of problems.

* Security: Avoid complex string matching and mangling in kernel-space
  running in privileged mode. Going further, we can even think about
  running user-space helpers as a non-root process.

* Extensibility: It allows the development of very specific helpers (most
  likely non-standard proprietary protocols) that are very likely not to be
  accepted for mainline inclusion in the form of kernel-space connection
  tracking helpers.

This patch adds the infrastructure to allow the implementation of
user-space conntrack helpers by means of the new nfnetlink subsystem
`nfnetlink_cthelper' and the existing queueing infrastructure
(nfnetlink_queue).

I had to add the new hook NF_IP6_PRI_CONNTRACK_HELPER to register
ipv[4|6]_helper which results from splitting ipv[4|6]_confirm into
two pieces. This change is required not to break NAT sequence
adjustment and conntrack confirmation for traffic that is enqueued
to our user-space conntrack helpers.

Basic operation, in a few steps:

1) Register user-space helper by means of `nfct':

 nfct helper add ftp inet tcp

 [ It must be a valid existing helper supported by conntrack-tools ]

2) Add rules to enable the FTP user-space helper which is
   used to track traffic going to TCP port 21.

For locally generated packets:

 iptables -I OUTPUT -t raw -p tcp --dport 21 -j CT --helper ftp

For non-locally generated packets:

 iptables -I PREROUTING -t raw -p tcp --dport 21 -j CT --helper ftp

3) Run the test conntrackd in helper mode (see example files under
   doc/helper/conntrackd.conf

 conntrackd

4) Generate FTP traffic going, if everything is OK, then conntrackd
   should create expectations (you can check that with `conntrack':

 conntrack -E expect

    [NEW] 301 proto=6 src=192.168.1.136 dst=130.89.148.12 sport=0 dport=54037 mask-src=255.255.255.255 mask-dst=255.255.255.255 sport=0 dport=65535 master-src=192.168.1.136 master-dst=130.89.148.12 sport=57127 dport=21 class=0 helper=ftp
[DESTROY] 301 proto=6 src=192.168.1.136 dst=130.89.148.12 sport=0 dport=54037 mask-src=255.255.255.255 mask-dst=255.255.255.255 sport=0 dport=65535 master-src=192.168.1.136 master-dst=130.89.148.12 sport=57127 dport=21 class=0 helper=ftp

This confirms that our test helper is receiving packets including the
conntrack information, and adding expectations in kernel-space.

The user-space helper can also store its private tracking information
in the conntrack structure in the kernel via the CTA_HELP_INFO. The
kernel will consider this a binary blob whose layout is unknown. This
information will be included in the information that is transfered
to user-space via glue code that integrates nfnetlink_queue and
ctnetlink.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/Kbuild                 |   1 +
 include/linux/netfilter/nfnetlink.h            |   3 +-
 include/linux/netfilter/nfnetlink_cthelper.h   |  55 ++
 include/linux/netfilter_ipv4.h                 |   1 +
 include/linux/netfilter_ipv6.h                 |   1 +
 include/net/netfilter/nf_conntrack_helper.h    |  11 +
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  48 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  43 +-
 net/netfilter/Kconfig                          |   8 +
 net/netfilter/Makefile                         |   1 +
 net/netfilter/nf_conntrack_helper.c            |  21 +-
 net/netfilter/nfnetlink_cthelper.c             | 672 +++++++++++++++++++++++++
 12 files changed, 839 insertions(+), 26 deletions(-)
 create mode 100644 include/linux/netfilter/nfnetlink_cthelper.h
 create mode 100644 net/netfilter/nfnetlink_cthelper.c

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index 1697036336b6..874ae8f2706b 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -10,6 +10,7 @@ header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
 header-y += nfnetlink_compat.h
 header-y += nfnetlink_conntrack.h
+header-y += nfnetlink_cthelper.h
 header-y += nfnetlink_cttimeout.h
 header-y += nfnetlink_log.h
 header-y += nfnetlink_queue.h
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index a1048c1587d1..18341cdb2443 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -50,7 +50,8 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_IPSET		6
 #define NFNL_SUBSYS_ACCT		7
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
-#define NFNL_SUBSYS_COUNT		9
+#define NFNL_SUBSYS_CTHELPER		9
+#define NFNL_SUBSYS_COUNT		10
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/netfilter/nfnetlink_cthelper.h b/include/linux/netfilter/nfnetlink_cthelper.h
new file mode 100644
index 000000000000..33659f6fad3e
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_cthelper.h
@@ -0,0 +1,55 @@
+#ifndef _NFNL_CTHELPER_H_
+#define _NFNL_CTHELPER_H_
+
+#define NFCT_HELPER_STATUS_DISABLED	0
+#define NFCT_HELPER_STATUS_ENABLED	1
+
+enum nfnl_acct_msg_types {
+	NFNL_MSG_CTHELPER_NEW,
+	NFNL_MSG_CTHELPER_GET,
+	NFNL_MSG_CTHELPER_DEL,
+	NFNL_MSG_CTHELPER_MAX
+};
+
+enum nfnl_cthelper_type {
+	NFCTH_UNSPEC,
+	NFCTH_NAME,
+	NFCTH_TUPLE,
+	NFCTH_QUEUE_NUM,
+	NFCTH_POLICY,
+	NFCTH_PRIV_DATA_LEN,
+	NFCTH_STATUS,
+	__NFCTH_MAX
+};
+#define NFCTH_MAX (__NFCTH_MAX - 1)
+
+enum nfnl_cthelper_policy_type {
+	NFCTH_POLICY_SET_UNSPEC,
+	NFCTH_POLICY_SET_NUM,
+	NFCTH_POLICY_SET,
+	NFCTH_POLICY_SET1	= NFCTH_POLICY_SET,
+	NFCTH_POLICY_SET2,
+	NFCTH_POLICY_SET3,
+	NFCTH_POLICY_SET4,
+	__NFCTH_POLICY_SET_MAX
+};
+#define NFCTH_POLICY_SET_MAX (__NFCTH_POLICY_SET_MAX - 1)
+
+enum nfnl_cthelper_pol_type {
+	NFCTH_POLICY_UNSPEC,
+	NFCTH_POLICY_NAME,
+	NFCTH_POLICY_EXPECT_MAX,
+	NFCTH_POLICY_EXPECT_TIMEOUT,
+	__NFCTH_POLICY_MAX
+};
+#define NFCTH_POLICY_MAX (__NFCTH_POLICY_MAX - 1)
+
+enum nfnl_cthelper_tuple_type {
+	NFCTH_TUPLE_UNSPEC,
+	NFCTH_TUPLE_L3PROTONUM,
+	NFCTH_TUPLE_L4PROTONUM,
+	__NFCTH_TUPLE_MAX,
+};
+#define NFCTH_TUPLE_MAX (__NFCTH_TUPLE_MAX - 1)
+
+#endif /* _NFNL_CTHELPER_H */
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index fa0946c549d3..e2b12801378d 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -66,6 +66,7 @@ enum nf_ip_hook_priorities {
 	NF_IP_PRI_SECURITY = 50,
 	NF_IP_PRI_NAT_SRC = 100,
 	NF_IP_PRI_SELINUX_LAST = 225,
+	NF_IP_PRI_CONNTRACK_HELPER = 300,
 	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
 	NF_IP_PRI_LAST = INT_MAX,
 };
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 57c025127f1d..7c8a513ce7a3 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -71,6 +71,7 @@ enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_SECURITY = 50,
 	NF_IP6_PRI_NAT_SRC = 100,
 	NF_IP6_PRI_SELINUX_LAST = 225,
+	NF_IP6_PRI_CONNTRACK_HELPER = 300,
 	NF_IP6_PRI_LAST = INT_MAX,
 };
 
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 84b24c3a3834..9aad956d1008 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -15,6 +15,11 @@
 
 struct module;
 
+enum nf_ct_helper_flags {
+	NF_CT_HELPER_F_USERSPACE	= (1 << 0),
+	NF_CT_HELPER_F_CONFIGURED	= (1 << 1),
+};
+
 #define NF_CT_HELPER_NAME_LEN	16
 
 struct nf_conntrack_helper {
@@ -42,6 +47,9 @@ struct nf_conntrack_helper {
 	int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct);
 	int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct);
 	unsigned int expect_class_max;
+
+	unsigned int flags;
+	unsigned int queue_num;		/* For user-space helpers. */
 };
 
 extern struct nf_conntrack_helper *
@@ -96,4 +104,7 @@ nf_ct_helper_expectfn_find_by_name(const char *name);
 struct nf_ct_helper_expectfn *
 nf_ct_helper_expectfn_find_by_symbol(const void *symbol);
 
+extern struct hlist_head *nf_ct_helper_hash;
+extern unsigned int nf_ct_helper_hsize;
+
 #endif /*_NF_CONNTRACK_HELPER_H*/
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index d79b961a8009..e7ff2dcab6ce 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv4_confirm(unsigned int hooknum,
-				 struct sk_buff *skb,
-				 const struct net_device *in,
-				 const struct net_device *out,
-				 int (*okfn)(struct sk_buff *))
+static unsigned int ipv4_helper(unsigned int hooknum,
+				struct sk_buff *skb,
+				const struct net_device *in,
+				const struct net_device *out,
+				int (*okfn)(struct sk_buff *))
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		goto out;
+		return NF_ACCEPT;
 
 	help = nfct_help(ct);
 	if (!help)
-		goto out;
+		return NF_ACCEPT;
 
 	/* rcu_read_lock()ed by nf_hook_slow */
 	helper = rcu_dereference(help->helper);
 	if (!helper)
-		goto out;
+		return NF_ACCEPT;
 
 	ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
 			   ct, ctinfo);
-	if (ret != NF_ACCEPT) {
+	if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
 		nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
 			      "nf_ct_%s: dropping packet", helper->name);
-		return ret;
 	}
+	return ret;
+}
+
+static unsigned int ipv4_confirm(unsigned int hooknum,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		goto out;
 
 	/* adjust seqs for loopback traffic only in outgoing direction */
 	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -184,6 +198,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP_PRI_CONNTRACK,
 	},
+	{
+		.hook		= ipv4_helper,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
+	},
 	{
 		.hook		= ipv4_confirm,
 		.owner		= THIS_MODULE,
@@ -191,6 +212,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
 	},
+	{
+		.hook		= ipv4_helper,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
+	},
 	{
 		.hook		= ipv4_confirm,
 		.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index fca10da80ea7..4794f96cf2e0 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -143,11 +143,11 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv6_confirm(unsigned int hooknum,
-				 struct sk_buff *skb,
-				 const struct net_device *in,
-				 const struct net_device *out,
-				 int (*okfn)(struct sk_buff *))
+static unsigned int ipv6_helper(unsigned int hooknum,
+				struct sk_buff *skb,
+				const struct net_device *in,
+				const struct net_device *out,
+				int (*okfn)(struct sk_buff *))
 {
 	struct nf_conn *ct;
 	const struct nf_conn_help *help;
@@ -161,15 +161,15 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		goto out;
+		return NF_ACCEPT;
 
 	help = nfct_help(ct);
 	if (!help)
-		goto out;
+		return NF_ACCEPT;
 	/* rcu_read_lock()ed by nf_hook_slow */
 	helper = rcu_dereference(help->helper);
 	if (!helper)
-		goto out;
+		return NF_ACCEPT;
 
 	protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum,
 					 skb->len - extoff);
@@ -179,12 +179,19 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 	}
 
 	ret = helper->help(skb, protoff, ct, ctinfo);
-	if (ret != NF_ACCEPT) {
+	if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
 		nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL,
 			      "nf_ct_%s: dropping packet", helper->name);
-		return ret;
 	}
-out:
+	return ret;
+}
+
+static unsigned int ipv6_confirm(unsigned int hooknum,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *))
+{
 	/* We've seen it coming out the other side: confirm it */
 	return nf_conntrack_confirm(skb);
 }
@@ -253,6 +260,13 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP6_PRI_CONNTRACK,
 	},
+	{
+		.hook		= ipv6_helper,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
+	},
 	{
 		.hook		= ipv6_confirm,
 		.owner		= THIS_MODULE,
@@ -260,6 +274,13 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP6_PRI_LAST,
 	},
+	{
+		.hook		= ipv6_helper,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
+	},
 	{
 		.hook		= ipv6_confirm,
 		.owner		= THIS_MODULE,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 209c1ed43368..aae6c628991d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -12,6 +12,14 @@ tristate "Netfilter NFACCT over NFNETLINK interface"
 	  If this option is enabled, the kernel will include support
 	  for extended accounting via NFNETLINK.
 
+config NETFILTER_NETLINK_CTHELPER
+tristate "Netfilter CTHELPER over NFNETLINK interface"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for user-space connection tracking helpers via NFNETLINK.
+
 config NETFILTER_NETLINK_QUEUE
 	tristate "Netfilter NFQUEUE over NFNETLINK interface"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4e7960cc7b97..2f3bc0f647ba 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
+obj-$(CONFIG_NETFILTER_NETLINK_CTHELPER) += nfnetlink_cthelper.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
 
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 9c18ecb0ab81..2918ec2e4509 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -30,8 +30,10 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 
 static DEFINE_MUTEX(nf_ct_helper_mutex);
-static struct hlist_head *nf_ct_helper_hash __read_mostly;
-static unsigned int nf_ct_helper_hsize __read_mostly;
+struct hlist_head *nf_ct_helper_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hash);
+unsigned int nf_ct_helper_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
 static unsigned int nf_ct_helper_count __read_mostly;
 
 static bool nf_ct_auto_assign_helper __read_mostly = true;
@@ -322,6 +324,9 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
 
 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 {
+	int ret = 0;
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *n;
 	unsigned int h = helper_hash(&me->tuple);
 
 	BUG_ON(me->expect_policy == NULL);
@@ -329,11 +334,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 	BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
 
 	mutex_lock(&nf_ct_helper_mutex);
+	hlist_for_each_entry(cur, n, &nf_ct_helper_hash[h], hnode) {
+		if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
+		    cur->tuple.src.l3num == me->tuple.src.l3num &&
+		    cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
 	hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
 	nf_ct_helper_count++;
+out:
 	mutex_unlock(&nf_ct_helper_mutex);
-
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
 
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
new file mode 100644
index 000000000000..d6836193d479
--- /dev/null
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -0,0 +1,672 @@
+/*
+ * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ *
+ * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nfnetlink_cthelper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
+
+static int
+nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	const struct nf_conn_help *help;
+	struct nf_conntrack_helper *helper;
+
+	help = nfct_help(ct);
+	if (help == NULL)
+		return NF_DROP;
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	helper = rcu_dereference(help->helper);
+	if (helper == NULL)
+		return NF_DROP;
+
+	/* This is an user-space helper not yet configured, skip. */
+	if ((helper->flags &
+	    (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
+	     NF_CT_HELPER_F_USERSPACE)
+		return NF_ACCEPT;
+
+	/* If the user-space helper is not available, don't block traffic. */
+	return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS;
+}
+
+static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = {
+	[NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, },
+	[NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, },
+};
+
+static int
+nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
+			  const struct nlattr *attr)
+{
+	struct nlattr *tb[NFCTH_TUPLE_MAX+1];
+
+	nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol);
+
+	if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM])
+		return -EINVAL;
+
+	tuple->src.l3num = ntohs(nla_get_u16(tb[NFCTH_TUPLE_L3PROTONUM]));
+	tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]);
+
+	return 0;
+}
+
+static int
+nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+	const struct nf_conn_help *help = nfct_help(ct);
+
+	if (help->helper->data_len == 0)
+		return -EINVAL;
+
+	memcpy(&help->data, nla_data(attr), help->helper->data_len);
+	return 0;
+}
+
+static int
+nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	const struct nf_conn_help *help = nfct_help(ct);
+
+	if (help->helper->data_len &&
+	    nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -ENOSPC;
+}
+
+static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = {
+	[NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING,
+				.len = NF_CT_HELPER_NAME_LEN-1 },
+	[NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, },
+	[NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
+			    const struct nlattr *attr)
+{
+	struct nlattr *tb[NFCTH_POLICY_MAX+1];
+
+	nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol);
+
+	if (!tb[NFCTH_POLICY_NAME] ||
+	    !tb[NFCTH_POLICY_EXPECT_MAX] ||
+	    !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+		return -EINVAL;
+
+	strncpy(expect_policy->name,
+		nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+	expect_policy->max_expected =
+		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+	expect_policy->timeout =
+		ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+	return 0;
+}
+
+static const struct nla_policy
+nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = {
+	[NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
+				  const struct nlattr *attr)
+{
+	int i, ret;
+	struct nf_conntrack_expect_policy *expect_policy;
+	struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+
+	nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+					nfnl_cthelper_expect_policy_set);
+
+	if (!tb[NFCTH_POLICY_SET_NUM])
+		return -EINVAL;
+
+	helper->expect_class_max =
+		ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+
+	if (helper->expect_class_max != 0 &&
+	    helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+		return -EOVERFLOW;
+
+	expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
+				helper->expect_class_max, GFP_KERNEL);
+	if (expect_policy == NULL)
+		return -ENOMEM;
+
+	for (i=0; i<helper->expect_class_max; i++) {
+		if (!tb[NFCTH_POLICY_SET+i])
+			goto err;
+
+		ret = nfnl_cthelper_expect_policy(&expect_policy[i],
+						  tb[NFCTH_POLICY_SET+i]);
+		if (ret < 0)
+			goto err;
+	}
+	helper->expect_policy = expect_policy;
+	return 0;
+err:
+	kfree(expect_policy);
+	return -EINVAL;
+}
+
+static int
+nfnl_cthelper_create(const struct nlattr * const tb[],
+		     struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_helper *helper;
+	int ret;
+
+	if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
+		return -EINVAL;
+
+	helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL);
+	if (helper == NULL)
+		return -ENOMEM;
+
+	ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
+	if (ret < 0)
+		goto err;
+
+	strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+	helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+	helper->flags |= NF_CT_HELPER_F_USERSPACE;
+	memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
+
+	helper->me = THIS_MODULE;
+	helper->help = nfnl_userspace_cthelper;
+	helper->from_nlattr = nfnl_cthelper_from_nlattr;
+	helper->to_nlattr = nfnl_cthelper_to_nlattr;
+
+	/* Default to queue number zero, this can be updated at any time. */
+	if (tb[NFCTH_QUEUE_NUM])
+		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+	if (tb[NFCTH_STATUS]) {
+		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+		switch(status) {
+		case NFCT_HELPER_STATUS_ENABLED:
+			helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+			break;
+		case NFCT_HELPER_STATUS_DISABLED:
+			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+			break;
+		}
+	}
+
+	ret = nf_conntrack_helper_register(helper);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(helper);
+	return ret;
+}
+
+static int
+nfnl_cthelper_update(const struct nlattr * const tb[],
+		     struct nf_conntrack_helper *helper)
+{
+	int ret;
+
+	if (tb[NFCTH_PRIV_DATA_LEN])
+		return -EBUSY;
+
+	if (tb[NFCTH_POLICY]) {
+		ret = nfnl_cthelper_parse_expect_policy(helper,
+							tb[NFCTH_POLICY]);
+		if (ret < 0)
+			return ret;
+	}
+	if (tb[NFCTH_QUEUE_NUM])
+		helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+	if (tb[NFCTH_STATUS]) {
+		int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+		switch(status) {
+		case NFCT_HELPER_STATUS_ENABLED:
+			helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+			break;
+		case NFCT_HELPER_STATUS_DISABLED:
+			helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+			break;
+		}
+	}
+	return 0;
+}
+
+static int
+nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	const char *helper_name;
+	struct nf_conntrack_helper *cur, *helper = NULL;
+	struct nf_conntrack_tuple tuple;
+	struct hlist_node *n;
+	int ret = 0, i;
+
+	if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
+		return -EINVAL;
+
+	helper_name = nla_data(tb[NFCTH_NAME]);
+
+	ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+	if (ret < 0)
+		return ret;
+
+	rcu_read_lock();
+	for (i = 0; i < nf_ct_helper_hsize && !helper; i++) {
+		hlist_for_each_entry_rcu(cur, n, &nf_ct_helper_hash[i], hnode) {
+
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			if (strncmp(cur->name, helper_name,
+					NF_CT_HELPER_NAME_LEN) != 0)
+				continue;
+
+			if ((tuple.src.l3num != cur->tuple.src.l3num ||
+			     tuple.dst.protonum != cur->tuple.dst.protonum))
+				continue;
+
+			if (nlh->nlmsg_flags & NLM_F_EXCL) {
+				ret = -EEXIST;
+				goto err;
+			}
+			helper = cur;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (helper == NULL)
+		ret = nfnl_cthelper_create(tb, &tuple);
+	else
+		ret = nfnl_cthelper_update(tb, helper);
+
+	return ret;
+err:
+	rcu_read_unlock();
+	return ret;
+}
+
+static int
+nfnl_cthelper_dump_tuple(struct sk_buff *skb,
+			 struct nf_conntrack_helper *helper)
+{
+	struct nlattr *nest_parms;
+
+	nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED);
+	if (nest_parms == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM,
+			 htons(helper->tuple.src.l3num)))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest_parms);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int
+nfnl_cthelper_dump_policy(struct sk_buff *skb,
+			struct nf_conntrack_helper *helper)
+{
+	int i;
+	struct nlattr *nest_parms1, *nest_parms2;
+
+	nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED);
+	if (nest_parms1 == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
+			 htonl(helper->expect_class_max)))
+		goto nla_put_failure;
+
+	for (i=0; i<helper->expect_class_max; i++) {
+		nest_parms2 = nla_nest_start(skb,
+				(NFCTH_POLICY_SET+i) | NLA_F_NESTED);
+		if (nest_parms2 == NULL)
+			goto nla_put_failure;
+
+		if (nla_put_string(skb, NFCTH_POLICY_NAME,
+				   helper->expect_policy[i].name))
+			goto nla_put_failure;
+
+		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX,
+				 htonl(helper->expect_policy[i].max_expected)))
+			goto nla_put_failure;
+
+		if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT,
+				 htonl(helper->expect_policy[i].timeout)))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, nest_parms2);
+	}
+	nla_nest_end(skb, nest_parms1);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int
+nfnl_cthelper_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type,
+			int event, struct nf_conntrack_helper *helper)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = pid ? NLM_F_MULTI : 0;
+	int status;
+
+	event |= NFNL_SUBSYS_CTHELPER << 8;
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, NFCTH_NAME, helper->name))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num)))
+		goto nla_put_failure;
+
+	if (nfnl_cthelper_dump_tuple(skb, helper) < 0)
+		goto nla_put_failure;
+
+	if (nfnl_cthelper_dump_policy(skb, helper) < 0)
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len)))
+		goto nla_put_failure;
+
+	if (helper->flags & NF_CT_HELPER_F_CONFIGURED)
+		status = NFCT_HELPER_STATUS_ENABLED;
+	else
+		status = NFCT_HELPER_STATUS_DISABLED;
+
+	if (nla_put_be32(skb, NFCTH_STATUS, htonl(status)))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+static int
+nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_conntrack_helper *cur, *last;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	last = (struct nf_conntrack_helper *)cb->args[1];
+	for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) {
+restart:
+		hlist_for_each_entry_rcu(cur, n,
+				&nf_ct_helper_hash[cb->args[0]], hnode) {
+
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			if (cb->args[1]) {
+				if (cur != last)
+					continue;
+				cb->args[1] = 0;
+			}
+			if (nfnl_cthelper_fill_info(skb,
+					    NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+					    NFNL_MSG_CTHELPER_NEW, cur) < 0) {
+				cb->args[1] = (unsigned long)cur;
+				goto out;
+			}
+		}
+	}
+	if (cb->args[1]) {
+		cb->args[1] = 0;
+		goto restart;
+	}
+out:
+	rcu_read_unlock();
+	return skb->len;
+}
+
+static int
+nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret = -ENOENT, i;
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *n;
+	struct sk_buff *skb2;
+	char *helper_name = NULL;
+	struct nf_conntrack_tuple tuple;
+	bool tuple_set = false;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nfnl_cthelper_dump_table,
+		};
+		return netlink_dump_start(nfnl, skb, nlh, &c);
+	}
+
+	if (tb[NFCTH_NAME])
+		helper_name = nla_data(tb[NFCTH_NAME]);
+
+	if (tb[NFCTH_TUPLE]) {
+		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+		if (ret < 0)
+			return ret;
+
+		tuple_set = true;
+	}
+
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_rcu(cur, n, &nf_ct_helper_hash[i], hnode) {
+
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			if (helper_name && strncmp(cur->name, helper_name,
+						NF_CT_HELPER_NAME_LEN) != 0) {
+				continue;
+			}
+			if (tuple_set &&
+			    (tuple.src.l3num != cur->tuple.src.l3num ||
+			     tuple.dst.protonum != cur->tuple.dst.protonum))
+				continue;
+
+			skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+			if (skb2 == NULL) {
+				ret = -ENOMEM;
+				break;
+			}
+
+			ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).pid,
+						nlh->nlmsg_seq,
+						NFNL_MSG_TYPE(nlh->nlmsg_type),
+						NFNL_MSG_CTHELPER_NEW, cur);
+			if (ret <= 0) {
+				kfree_skb(skb2);
+				break;
+			}
+
+			ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).pid,
+						MSG_DONTWAIT);
+			if (ret > 0)
+				ret = 0;
+
+			/* this avoids a loop in nfnetlink. */
+			return ret == -EAGAIN ? -ENOBUFS : ret;
+		}
+	}
+	return ret;
+}
+
+static int
+nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	char *helper_name = NULL;
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *n, *tmp;
+	struct nf_conntrack_tuple tuple;
+	bool tuple_set = false, found = false;
+	int i, j = 0, ret;
+
+	if (tb[NFCTH_NAME])
+		helper_name = nla_data(tb[NFCTH_NAME]);
+
+	if (tb[NFCTH_TUPLE]) {
+		ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+		if (ret < 0)
+			return ret;
+
+		tuple_set = true;
+	}
+
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_safe(cur, n, tmp, &nf_ct_helper_hash[i],
+								hnode) {
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			j++;
+
+			if (helper_name && strncmp(cur->name, helper_name,
+						NF_CT_HELPER_NAME_LEN) != 0) {
+				continue;
+			}
+			if (tuple_set &&
+			    (tuple.src.l3num != cur->tuple.src.l3num ||
+			     tuple.dst.protonum != cur->tuple.dst.protonum))
+				continue;
+
+			found = true;
+			nf_conntrack_helper_unregister(cur);
+		}
+	}
+	/* Make sure we return success if we flush and there is no helpers */
+	return (found || j == 0) ? 0 : -ENOENT;
+}
+
+static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
+	[NFCTH_NAME] = { .type = NLA_NUL_STRING,
+			 .len = NF_CT_HELPER_NAME_LEN-1 },
+	[NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+};
+
+static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
+	[NFNL_MSG_CTHELPER_NEW]		= { .call = nfnl_cthelper_new,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+	[NFNL_MSG_CTHELPER_GET]		= { .call = nfnl_cthelper_get,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+	[NFNL_MSG_CTHELPER_DEL]		= { .call = nfnl_cthelper_del,
+					    .attr_count = NFCTH_MAX,
+					    .policy = nfnl_cthelper_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
+	.name				= "cthelper",
+	.subsys_id			= NFNL_SUBSYS_CTHELPER,
+	.cb_count			= NFNL_MSG_CTHELPER_MAX,
+	.cb				= nfnl_cthelper_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER);
+
+static int __init nfnl_cthelper_init(void)
+{
+	int ret;
+
+	ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys);
+	if (ret < 0) {
+		pr_err("nfnl_cthelper: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+	return 0;
+err_out:
+	return ret;
+}
+
+static void __exit nfnl_cthelper_exit(void)
+{
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *n, *tmp;
+	int i;
+
+	nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
+
+	for (i=0; i<nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry_safe(cur, n, tmp, &nf_ct_helper_hash[i],
+									hnode) {
+			/* skip non-userspace conntrack helpers. */
+			if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+				continue;
+
+			nf_conntrack_helper_unregister(cur);
+		}
+	}
+}
+
+module_init(nfnl_cthelper_init);
+module_exit(nfnl_cthelper_exit);
-- 
cgit v1.2.3


From 6fac262526ee91ee66210b8919a4297dcf7d544e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 17 Jun 2012 19:47:34 -0700
Subject: ipv4: Cap ADVMSS metric in the FIB rather than the routing cache.

It makes no sense to execute this limit test every time we create a
routing cache entry.

We can't simply error out on these things since we've silently
accepted and truncated them forever.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 7 ++++++-
 net/ipv4/route.c         | 2 --
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..415f8230fc88 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -779,9 +779,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			int type = nla_type(nla);
 
 			if (type) {
+				u32 val;
+
 				if (type > RTAX_MAX)
 					goto err_inval;
-				fi->fib_metrics[type - 1] = nla_get_u32(nla);
+				val = nla_get_u32(nla);
+				if (type == RTAX_ADVMSS && val > 65535 - 40)
+					val = 65535 - 40;
+				fi->fib_metrics[type - 1] = val;
 			}
 		}
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 41df5297a412..a91f6d33804c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1951,8 +1951,6 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 
 	if (dst_mtu(dst) > IP_MAX_MTU)
 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
-	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
-		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-- 
cgit v1.2.3


From f9242b6b28d61295f2bf7e8adfb1060b382e5381 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 19 Jun 2012 18:56:21 -0700
Subject: inet: Sanitize inet{,6} protocol demux.

Don't pretend that inet_protos[] and inet6_protos[] are hashes, thay
are just a straight arrays.  Remove all unnecessary hash masking.

Document MAX_INET_PROTOS.

Use RAW_HTABLE_SIZE when appropriate.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |  7 +++++--
 net/ipv4/af_inet.c     | 26 ++++++++++++--------------
 net/ipv4/icmp.c        |  9 ++++-----
 net/ipv4/ip_input.c    |  5 ++---
 net/ipv4/protocol.c    |  8 +++-----
 net/ipv6/icmp.c        |  7 ++-----
 net/ipv6/ip6_input.c   |  9 +++------
 net/ipv6/protocol.c    |  8 +++-----
 net/ipv6/raw.c         |  4 ++--
 9 files changed, 36 insertions(+), 47 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 875f4895b033..a1b1b530c338 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -29,8 +29,11 @@
 #include <linux/ipv6.h>
 #endif
 
-#define MAX_INET_PROTOS	256		/* Must be a power of 2		*/
-
+/* This is one larger than the largest protocol value that can be
+ * found in an ipv4 or ipv6 header.  Since in both cases the protocol
+ * value is presented in a __u8, this is defined to be 256.
+ */
+#define MAX_INET_PROTOS		256
 
 /* This is used to register protocols. */
 struct net_protocol {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4e8e00a2c91..85a3b1763136 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -242,20 +242,18 @@ void build_ehash_secret(void)
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
-static inline int inet_netns_ok(struct net *net, int protocol)
+static inline int inet_netns_ok(struct net *net, __u8 protocol)
 {
-	int hash;
 	const struct net_protocol *ipprot;
 
 	if (net_eq(net, &init_net))
 		return 1;
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-	ipprot = rcu_dereference(inet_protos[hash]);
-
-	if (ipprot == NULL)
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot == NULL) {
 		/* raw IP is OK */
 		return 1;
+	}
 	return ipprot->netns_ok;
 }
 
@@ -1216,8 +1214,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	const struct iphdr *iph;
 	const struct net_protocol *ops;
+	const struct iphdr *iph;
 	int proto;
 	int ihl;
 	int err = -EINVAL;
@@ -1236,7 +1234,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;
 
 	rcu_read_lock();
@@ -1253,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	struct iphdr *iph;
 	const struct net_protocol *ops;
+	struct iphdr *iph;
 	int proto;
 	int ihl;
 	int id;
@@ -1286,7 +1284,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	skb_reset_transport_header(skb);
 	iph = ip_hdr(skb);
 	id = ntohs(iph->id);
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
 	rcu_read_lock();
@@ -1340,7 +1338,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	proto = iph->protocol;
 
 	rcu_read_lock();
 	ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1396,11 @@ out:
 
 static int inet_gro_complete(struct sk_buff *skb)
 {
-	const struct net_protocol *ops;
+	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 	struct iphdr *iph = ip_hdr(skb);
-	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	const struct net_protocol *ops;
+	int proto = iph->protocol;
 	int err = -ENOSYS;
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));
 
 	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e1caa1abe5d1..49a74cc79dc8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -637,12 +637,12 @@ EXPORT_SYMBOL(icmp_send);
 
 static void icmp_unreach(struct sk_buff *skb)
 {
+	const struct net_protocol *ipprot;
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
-	int hash, protocol;
-	const struct net_protocol *ipprot;
-	u32 info = 0;
 	struct net *net;
+	u32 info = 0;
+	int protocol;
 
 	net = dev_net(skb_dst(skb)->dev);
 
@@ -731,9 +731,8 @@ static void icmp_unreach(struct sk_buff *skb)
 	 */
 	raw_icmp_error(skb, protocol, info);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet_protos[hash]);
+	ipprot = rcu_dereference(inet_protos[protocol]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
 	rcu_read_unlock();
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144ca330..c4fe1d271131 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 	rcu_read_lock();
 	{
 		int protocol = ip_hdr(skb)->protocol;
-		int hash, raw;
 		const struct net_protocol *ipprot;
+		int raw;
 
 	resubmit:
 		raw = raw_local_deliver(skb, protocol);
 
-		hash = protocol & (MAX_INET_PROTOS - 1);
-		ipprot = rcu_dereference(inet_protos[hash]);
+		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot != NULL) {
 			int ret;
 
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b2..8918eff1426d 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+	ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 5247d5c211f9..c7da1422cbde 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -600,9 +600,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 {
 	const struct inet6_protocol *ipprot;
 	int inner_offset;
-	int hash;
-	u8 nexthdr;
 	__be16 frag_off;
+	u8 nexthdr;
 
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		return;
@@ -629,10 +628,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 	   --ANK (980726)
 	 */
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-
 	rcu_read_lock();
-	ipprot = rcu_dereference(inet6_protos[hash]);
+	ipprot = rcu_dereference(inet6_protos[nexthdr]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
 	rcu_read_unlock();
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 21a15dfe4a9e..5ab923e51af3 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -168,13 +168,12 @@ drop:
 
 static int ip6_input_finish(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	const struct inet6_protocol *ipprot;
+	struct inet6_dev *idev;
 	unsigned int nhoff;
 	int nexthdr;
 	bool raw;
-	u8 hash;
-	struct inet6_dev *idev;
-	struct net *net = dev_net(skb_dst(skb)->dev);
 
 	/*
 	 *	Parse extension headers
@@ -189,9 +188,7 @@ resubmit:
 	nexthdr = skb_network_header(skb)[nhoff];
 
 	raw = raw6_local_deliver(skb, nexthdr);
-
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
-	if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
+	if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) {
 		int ret;
 
 		if (ipprot->flags & INET6_PROTO_FINAL) {
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 9a7978fdc02a..053082dfc93e 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -29,9 +29,7 @@ const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int hash = protocol & (MAX_INET_PROTOS - 1);
-
-	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 			NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet6_add_protocol);
@@ -42,9 +40,9 @@ EXPORT_SYMBOL(inet6_add_protocol);
 
 int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret;
 
-	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
 		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 43b0042f15f4..b5c1dcb27737 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -165,7 +165,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	saddr = &ipv6_hdr(skb)->saddr;
 	daddr = saddr + 1;
 
-	hash = nexthdr & (MAX_INET_PROTOS - 1);
+	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
 
 	read_lock(&raw_v6_hashinfo.lock);
 	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
@@ -229,7 +229,7 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct sock *raw_sk;
 
-	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]);
+	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
 	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
 		raw_sk = NULL;
 
-- 
cgit v1.2.3


From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 19 Jun 2012 21:22:05 -0700
Subject: ipv4: Early TCP socket demux.

Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established then it's identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to established state, we cache the receive packet's input
route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h |  4 ++--
 include/net/protocol.h        |  1 +
 include/net/sock.h            |  2 ++
 include/net/tcp.h             |  1 +
 net/core/sock.c               |  5 +++++
 net/ipv4/af_inet.c            | 18 +++++++++--------
 net/ipv4/ip_input.c           | 39 ++++++++++++++++++++++++------------
 net/ipv4/tcp_input.c          | 16 ++++++++++++++-
 net/ipv4/tcp_ipv4.c           | 46 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c      |  2 ++
 10 files changed, 110 insertions(+), 24 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f76b03..54be0287eb98 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const __be16 sport,
 					     const __be16 dport)
 {
-	struct sock *sk;
+	struct sock *sk = skb_steal_sock(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
+	if (sk)
 		return sk;
 	else
 		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a1b1b530c338..967b926cbfb1 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,6 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a4521699563..87b424ae750a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f342259a..6660ffc4963d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71fda6ec..929bdcc2383b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 85a3b1763136..07a02f6e9696 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c4fe1d271131..93b092c9a394 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int err;
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[protocol]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8b..8416f8a68e65 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int res;
 
+	if (sk->sk_rx_dst) {
+		struct dst_entry *dst = sk->sk_rx_dst;
+		if (unlikely(dst->obsolete)) {
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+	}
+	if (unlikely(sk->sk_rx_dst == NULL))
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
-	if (skb != NULL)
+	if (skb != NULL) {
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
 		security_inet_conn_established(sk, skb);
+	}
 
 	/* Make sure socket is routed, for correct metrics.  */
 	icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca17135e..13857df1dae1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb015317c9f7..72b7c63b1a39 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.
-- 
cgit v1.2.3


From da55737467c1c3bc02271039c088171d82e0796f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Jun 2012 04:02:10 +0000
Subject: inetpeer: inetpeer_invalidate_tree() cleanup

No need to use cmpxchg() in inetpeer_invalidate_tree() since we hold
base lock.

Also use correct rcu annotations to remove sparse errors
(CONFIG_SPARSE_RCU_POINTER=y)

net/ipv4/inetpeer.c:144:19: error: incompatible types in comparison
expression (different address spaces)
net/ipv4/inetpeer.c:149:20: error: incompatible types in comparison
expression (different address spaces)
net/ipv4/inetpeer.c:595:10: error: incompatible types in comparison
expression (different address spaces)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index cac02ad1425d..da90a8cab614 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -126,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min
 
 static void inetpeer_gc_worker(struct work_struct *work)
 {
-	struct inet_peer *p, *n;
+	struct inet_peer *p, *n, *c;
 	LIST_HEAD(list);
 
 	spin_lock_bh(&gc_lock);
@@ -138,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
 
 	list_for_each_entry_safe(p, n, &list, gc_list) {
 
-		if(need_resched())
+		if (need_resched())
 			cond_resched();
 
-		if (p->avl_left != peer_avl_empty) {
-			list_add_tail(&p->avl_left->gc_list, &list);
-			p->avl_left = peer_avl_empty;
+		c = rcu_dereference_protected(p->avl_left, 1);
+		if (c != peer_avl_empty) {
+			list_add_tail(&c->gc_list, &list);
+			p->avl_left = peer_avl_empty_rcu;
 		}
 
-		if (p->avl_right != peer_avl_empty) {
-			list_add_tail(&p->avl_right->gc_list, &list);
-			p->avl_right = peer_avl_empty;
+		c = rcu_dereference_protected(p->avl_right, 1);
+		if (c != peer_avl_empty) {
+			list_add_tail(&c->gc_list, &list);
+			p->avl_right = peer_avl_empty_rcu;
 		}
 
 		n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -587,23 +589,17 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
 
 void inetpeer_invalidate_tree(struct inet_peer_base *base)
 {
-	struct inet_peer *old, *new, *prev;
+	struct inet_peer *root;
 
 	write_seqlock_bh(&base->lock);
 
-	old = base->root;
-	if (old == peer_avl_empty_rcu)
-		goto out;
-
-	new = peer_avl_empty_rcu;
-
-	prev = cmpxchg(&base->root, old, new);
-	if (prev == old) {
+	root = rcu_deref_locked(base->root, base);
+	if (root != peer_avl_empty) {
+		base->root = peer_avl_empty_rcu;
 		base->total = 0;
-		call_rcu(&prev->gc_rcu, inetpeer_inval_rcu);
+		call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
 	}
 
-out:
 	write_sequnlock_bh(&base->lock);
 }
 EXPORT_SYMBOL(inetpeer_invalidate_tree);
-- 
cgit v1.2.3


From fd62e09b946522ec3578412826a81bead06fadf7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 21 Jun 2012 14:58:10 -0700
Subject: tcp: Validate route interface in early demux.

Otherwise we might violate reverse path filtering.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13857df1dae1..21e22a00481a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1676,6 +1676,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
+	struct net_device *dev;
 	struct sock *sk;
 	int err;
 
@@ -1695,10 +1696,11 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
 		goto out_err;
 
+	dev = skb->dev;
 	sk = __inet_lookup_established(net, &tcp_hashinfo,
 				       iph->saddr, th->source,
 				       iph->daddr, th->dest,
-				       skb->dev->ifindex);
+				       dev->ifindex);
 	if (sk) {
 		skb->sk = sk;
 		skb->destructor = sock_edemux;
@@ -1707,8 +1709,12 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 			if (dst)
 				dst = dst_check(dst, 0);
 			if (dst) {
-				skb_dst_set_noref(skb, dst);
-				err = 0;
+				struct rtable *rt = (struct rtable *) dst;
+
+				if (rt->rt_iif == dev->ifindex) {
+					skb_dst_set_noref(skb, dst);
+					err = 0;
+				}
 			}
 		}
 	}
-- 
cgit v1.2.3


From d584a61a931e6cbfef0dd811c4ae0250ec5987f4 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 20 Jun 2012 20:52:31 +0200
Subject: netfilter: nfnetlink_queue: fix compilation with CONFIG_NF_NAT=m and
 CONFIG_NF_CT_NETLINK=y

  LD      init/built-in.o
net/built-in.o:(.data+0x4408): undefined reference to `nf_nat_tcp_seq_adjust'
make: *** [vmlinux] Error 1

This patch adds a new pointer hook (nfq_ct_nat_hook) similar to other existing
in Netfilter to solve our complicated configuration dependencies.

Reported-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h            | 6 +++++-
 net/ipv4/netfilter/nf_nat_core.c     | 6 ++++++
 net/netfilter/core.c                 | 3 +++
 net/netfilter/nf_conntrack_netlink.c | 3 ---
 net/netfilter/nfnetlink_queue_ct.c   | 8 ++++----
 5 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 38b96a54f9a5..c613cf0d7884 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -401,10 +401,14 @@ struct nfq_ct_hook {
 	size_t (*build_size)(const struct nf_conn *ct);
 	int (*build)(struct sk_buff *skb, struct nf_conn *ct);
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
+};
+extern struct nfq_ct_hook __rcu *nfq_ct_hook;
+
+struct nfq_ct_nat_hook {
 	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
 			   u32 ctinfo, int off);
 };
-extern struct nfq_ct_hook __rcu *nfq_ct_hook;
+extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook;
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index abb52adf5acd..44b082fd48ab 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
 	.expectfn	= nf_nat_follow_master,
 };
 
+static struct nfq_ct_nat_hook nfq_ct_nat = {
+	.seq_adjust	= nf_nat_tcp_seq_adjust,
+};
+
 static int __init nf_nat_init(void)
 {
 	size_t i;
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void)
 			   nfnetlink_parse_nat_setup);
 	BUG_ON(nf_ct_nat_offset != NULL);
 	RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
+	RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
 	return 0;
 
  cleanup_extend:
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void)
 	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
 	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
+	RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
 	synchronize_net();
 }
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 4cd10ed2d6e6..0bc6b60db4df 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -268,6 +268,9 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
 struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nfq_ct_hook);
 
+struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
+
 #endif /* CONFIG_NF_CONNTRACK */
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 31d1d8f3a6ce..8bb47339b770 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1757,9 +1757,6 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build_size	= ctnetlink_nfqueue_build_size,
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
-#ifdef CONFIG_NF_NAT_NEEDED
-	.seq_adjust	= nf_nat_tcp_seq_adjust,
-#endif
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
index 68ef550066f5..01247b730e66 100644
--- a/net/netfilter/nfnetlink_queue_ct.c
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -86,12 +86,12 @@ nla_put_failure:
 void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo, int diff)
 {
-	struct nfq_ct_hook *nfq_ct;
+	struct nfq_ct_nat_hook *nfq_nat_ct;
 
-	nfq_ct = rcu_dereference(nfq_ct_hook);
-	if (nfq_ct == NULL)
+	nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook);
+	if (nfq_nat_ct == NULL)
 		return;
 
 	if ((ct->status & IPS_NAT_MASK) && diff)
-		nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
+		nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff);
 }
-- 
cgit v1.2.3


From 6648bd7e0e62c0c8c03b15e00c9e7015e232feff Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 21 Jun 2012 13:58:31 +0000
Subject: ipv4: Add sysctl knob to control early socket demux

This change is meant to add a control for disabling early socket demux.
The main motivation behind this patch is to provide an option to disable
the feature as it adds an additional cost to routing that reduces overall
throughput by up to 5%.  For example one of my systems went from 12.1Mpps
to 11.6 after the early socket demux was added.  It looks like the reason
for the regression is that we are now having to perform two lookups, first
the one for an established socket, and then the one for the routing table.

By adding this patch and toggling the value for ip_early_demux to 0 I am
able to get back to the 12.1Mpps I was previously seeing.

[ Move local variables in ip_rcv_finish() down into the basic
  block in which they are actually used.  -DaveM ]

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |  1 +
 include/net/ip.h           |  3 +++
 kernel/sysctl_binary.c     |  2 ++
 net/ipv4/ip_input.c        | 22 +++++++++++++---------
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 5 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index c34b4c82b0dc..20825e5f433f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -425,6 +425,7 @@ enum
 	NET_TCP_ALLOWED_CONG_CONTROL=123,
 	NET_TCP_MAX_SSTHRESH=124,
 	NET_TCP_FRTO_RESPONSE=125,
+	NET_IPV4_EARLY_DEMUX=126,
 };
 
 enum {
diff --git a/include/net/ip.h b/include/net/ip.h
index 83e0619f59d0..50841bd6f10e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -210,6 +210,9 @@ extern int inet_peer_threshold;
 extern int inet_peer_minttl;
 extern int inet_peer_maxttl;
 
+/* From ip_input.c */
+extern int sysctl_ip_early_demux;
+
 /* From ip_output.c */
 extern int sysctl_ip_dynaddr;
 
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..6a3cf8253aae 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = {
 	{ CTL_INT,	NET_IPV4_IPFRAG_SECRET_INTERVAL,	"ipfrag_secret_interval" },
 	/* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
 
+	{ CTL_INT,	NET_IPV4_EARLY_DEMUX,			"ip_early_demux" },
+
 	{ CTL_INT,	2088 /* NET_IPQ_QMAX */,		"ip_queue_maxlen" },
 
 	/* NET_TCP_DEFAULT_WIN_SCALE unused */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 93b092c9a394..bca25179cdb9 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -313,6 +313,8 @@ drop:
 	return true;
 }
 
+int sysctl_ip_early_demux __read_mostly = 1;
+
 static int ip_rcv_finish(struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
@@ -323,16 +325,18 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		const struct net_protocol *ipprot;
-		int protocol = iph->protocol;
-		int err;
+		int err = -ENOENT;
 
-		rcu_read_lock();
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		err = -ENOENT;
-		if (ipprot && ipprot->early_demux)
-			err = ipprot->early_demux(skb);
-		rcu_read_unlock();
+		if (sysctl_ip_early_demux) {
+			const struct net_protocol *ipprot;
+			int protocol = iph->protocol;
+
+			rcu_read_lock();
+			ipprot = rcu_dereference(inet_protos[protocol]);
+			if (ipprot && ipprot->early_demux)
+				err = ipprot->early_demux(skb);
+			rcu_read_unlock();
+		}
 
 		if (err) {
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef32956ed655..12aa0c5867c4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip_early_demux",
+		.data		= &sysctl_ip_early_demux,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "ip_dynaddr",
 		.data		= &sysctl_ip_dynaddr,
-- 
cgit v1.2.3


From 7586eceb0abc0ea1c2b023e3e5d4dfd4ff40930a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 20 Jun 2012 05:02:19 +0000
Subject: ipv4: tcp: dont cache output dst for syncookies

Don't cache output dst for syncookies, as this adds pressure on IP route
cache and rcu subsystem for no gain.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h                 |  1 +
 include/net/inet_connection_sock.h |  3 ++-
 net/dccp/ipv4.c                    |  2 +-
 net/ipv4/inet_connection_sock.c    |  8 ++++++--
 net/ipv4/route.c                   |  5 ++++-
 net/ipv4/tcp_ipv4.c                | 12 +++++++-----
 6 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/flow.h b/include/net/flow.h
index 6c469dbdb917..bd524f598561 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -22,6 +22,7 @@ struct flowi_common {
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_PRECOW_METRICS	0x02
 #define FLOWI_FLAG_CAN_SLEEP		0x04
+#define FLOWI_FLAG_RT_NOCACHE		0x08
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e1b7734c456f..af3c743a40e4 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -251,7 +251,8 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 extern struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    struct flowi4 *fl4,
-					    const struct request_sock *req);
+					    const struct request_sock *req,
+					    bool nocache);
 extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk,
 						   struct sock *newsk,
 						   const struct request_sock *req);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 07f5579ca756..3eb76b5f221a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -504,7 +504,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 	struct dst_entry *dst;
 	struct flowi4 fl4;
 
-	dst = inet_csk_route_req(sk, &fl4, req);
+	dst = inet_csk_route_req(sk, &fl4, req, false);
 	if (dst == NULL)
 		goto out;
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..034ddbe42adf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,17 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
 				     struct flowi4 *fl4,
-				     const struct request_sock *req)
+				     const struct request_sock *req,
+				     bool nocache)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct ip_options_rcu *opt = inet_rsk(req)->opt;
 	struct net *net = sock_net(sk);
+	int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
 
+	if (nocache)
+		flags |= FLOWI_FLAG_RT_NOCACHE;
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol,
-			   inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
+			   flags,
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a91f6d33804c..8d62d85e68dc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1156,7 +1156,7 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
-	if (!rt_caching(dev_net(rt->dst.dev))) {
+	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
 		/*
 		 * If we're not caching, just tell the caller we
 		 * were successful and don't touch the route.  The
@@ -2582,6 +2582,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
 
+	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
+		rth->dst.flags |= DST_NOCACHE;
+
 	return rth;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 21e22a00481a..b52934f5334e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
 			      struct request_values *rvp,
-			      u16 queue_mapping)
+			      u16 queue_mapping,
+			      bool nocache)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct flowi4 fl4;
@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	struct sk_buff * skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
 		return -1;
 
 	skb = tcp_make_synack(sk, dst, req, rvp);
@@ -855,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 			      struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v4_send_synack(sk, NULL, req, rvp, 0);
+	return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 }
 
 /*
@@ -1388,7 +1389,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
 		    fl4.daddr == saddr &&
 		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
 			inet_peer_refcheck(peer);
@@ -1424,7 +1425,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	if (tcp_v4_send_synack(sk, dst, req,
 			       (struct request_values *)&tmp_ext,
-			       skb_get_queue_mapping(skb)) ||
+			       skb_get_queue_mapping(skb),
+			       want_cookie) ||
 	    want_cookie)
 		goto drop_and_free;
 
-- 
cgit v1.2.3


From 7011d0851b80a1a229acfda37ce08aad903b12d1 Mon Sep 17 00:00:00 2001
From: Vijay Subramanian <subramanian.vijay@gmail.com>
Date: Sat, 23 Jun 2012 17:38:10 +0000
Subject: tcp: Fix bug in tcp socket early demux

The dest port for the call to __inet_lookup_established() in TCP early demux
code is passed with the wrong endian-ness. This causes the lookup to fail
leading to early demux not being used.

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b52934f5334e..1781dc650b9d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1701,7 +1701,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 	dev = skb->dev;
 	sk = __inet_lookup_established(net, &tcp_hashinfo,
 				       iph->saddr, th->source,
-				       iph->daddr, th->dest,
+				       iph->daddr, ntohs(th->dest),
 				       dev->ifindex);
 	if (sk) {
 		skb->sk = sk;
-- 
cgit v1.2.3


From df67e6c9a6ca59ca96bdd46a500ae9dd596f427c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Jun 2012 00:10:09 -0700
Subject: ipv4: Remove unnecessary code from rt_check_expire().

IPv4 routing cache entries no longer use dst->expires, because the
metrics, PMTU, and redirect information are stored in the inetpeer
cache.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8d62d85e68dc..846961c6cbe1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -870,34 +870,22 @@ static void rt_check_expire(void)
 		while ((rth = rcu_dereference_protected(*rthp,
 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth)) {
+			if (rt_is_expired(rth) ||
+			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 				*rthp = rth->dst.rt_next;
 				rt_free(rth);
 				continue;
 			}
-			if (rth->dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					/*
-					 * We only count entries on
-					 * a chain with equal hash inputs once
-					 * so that entries for different QOS
-					 * levels, and other non-hash input
-					 * attributes don't unfairly skew
-					 * the length computation
-					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
-				goto nofree;
 
-			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
+			/* We only count entries on a chain with equal
+			 * hash inputs once so that entries for
+			 * different QOS levels, and other non-hash
+			 * input attributes don't unfairly skew the
+			 * length computation
+			 */
+			tmo >>= 1;
+			rthp = &rth->dst.rt_next;
+			length += has_noalias(rt_hash_table[i].chain, rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
 		sum += length;
-- 
cgit v1.2.3


From 251da4130115b29403a57096fa0988249f31fc55 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Jun 2012 16:27:09 -0700
Subject: ipv4: Cache ip_error() routes even when not forwarding.

And account for the fact that, when we are not forwarding, we should
bump statistic counters rather than emit an ICMP response.

RP-filter rejected lookups are still not cached.

Since -EHOSTUNREACH and -ENETUNREACH can now no longer be seen in
ip_rcv_finish(), remove those checks.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c |  8 +-------
 net/ipv4/route.c    | 30 +++++++++++++++++++-----------
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index bca25179cdb9..2a39204de5bc 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -342,13 +342,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 						   iph->tos, skb->dev);
 			if (unlikely(err)) {
-				if (err == -EHOSTUNREACH)
-					IP_INC_STATS_BH(dev_net(skb->dev),
-							IPSTATS_MIB_INADDRERRORS);
-				else if (err == -ENETUNREACH)
-					IP_INC_STATS_BH(dev_net(skb->dev),
-							IPSTATS_MIB_INNOROUTES);
-				else if (err == -EXDEV)
+				if (err == -EXDEV)
 					NET_INC_STATS_BH(dev_net(skb->dev),
 							 LINUX_MIB_IPRPFILTER);
 				goto drop;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 846961c6cbe1..81533e3a23d1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1609,12 +1609,28 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 
 static int ip_error(struct sk_buff *skb)
 {
+	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	struct rtable *rt = skb_rtable(skb);
 	struct inet_peer *peer;
 	unsigned long now;
+	struct net *net;
 	bool send;
 	int code;
 
+	net = dev_net(rt->dst.dev);
+	if (!IN_DEV_FORWARD(in_dev)) {
+		switch (rt->dst.error) {
+		case EHOSTUNREACH:
+			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+			break;
+
+		case ENETUNREACH:
+			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+			break;
+		}
+		goto out;
+	}
+
 	switch (rt->dst.error) {
 	case EINVAL:
 	default:
@@ -1624,8 +1640,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	case ENETUNREACH:
 		code = ICMP_NET_UNREACH;
-		IP_INC_STATS_BH(dev_net(rt->dst.dev),
-				IPSTATS_MIB_INNOROUTES);
+		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 		break;
 	case EACCES:
 		code = ICMP_PKT_FILTERED;
@@ -2255,11 +2270,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
 	err = fib_lookup(net, &fl4, &res);
-	if (err != 0) {
-		if (!IN_DEV_FORWARD(in_dev))
-			goto e_hostunreach;
+	if (err != 0)
 		goto no_route;
-	}
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 
@@ -2279,7 +2291,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	}
 
 	if (!IN_DEV_FORWARD(in_dev))
-		goto e_hostunreach;
+		goto no_route;
 	if (res.type != RTN_UNICAST)
 		goto martian_destination;
 
@@ -2367,10 +2379,6 @@ martian_destination:
 				     &daddr, &saddr, dev->name);
 #endif
 
-e_hostunreach:
-	err = -EHOSTUNREACH;
-	goto out;
-
 e_inval:
 	err = -EINVAL;
 	goto out;
-- 
cgit v1.2.3


From d106352d9f527fe336749ad89de7e07e5af91a68 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Jun 2012 21:28:54 -0700
Subject: inet_diag: Move away from NLMSG_PUT().

And use nlmsg_data() while we're here too, and remove useless
casts.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 46d1e7199a8c..27640e734cfd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -87,10 +87,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), 0);
+	if (!nlh) {
+		nlmsg_trim(skb, b);
+		return -EMSGSIZE;
+	}
 	nlh->nlmsg_flags = nlmsg_flags;
 
-	r = NLMSG_DATA(nlh);
+	r = nlmsg_data(nlh);
 	BUG_ON(sk->sk_state == TCP_TIME_WAIT);
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
@@ -186,7 +190,6 @@ out:
 	return skb->len;
 
 rtattr_failure:
-nlmsg_failure:
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
 }
@@ -209,10 +212,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	long tmo;
 	struct inet_diag_msg *r;
 	const unsigned char *previous_tail = skb_tail_pointer(skb);
-	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
-					 unlh->nlmsg_type, sizeof(*r));
+	struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq,
+					 unlh->nlmsg_type, sizeof(*r), 0);
+
+	if (!nlh) {
+		nlmsg_trim(skb, previous_tail);
+		return -EMSGSIZE;
+	}
 
-	r = NLMSG_DATA(nlh);
+	r = nlmsg_data(nlh);
 	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
 
 	nlh->nlmsg_flags = nlmsg_flags;
@@ -247,9 +255,6 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 #endif
 	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
 	return skb->len;
-nlmsg_failure:
-	nlmsg_trim(skb, previous_tail);
-	return -EMSGSIZE;
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -597,9 +602,13 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), 0);
+	if (!nlh) {
+		nlmsg_trim(skb, b);
+		return -1;
+	}
 	nlh->nlmsg_flags = NLM_F_MULTI;
-	r = NLMSG_DATA(nlh);
+	r = nlmsg_data(nlh);
 
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = TCP_SYN_RECV;
@@ -631,10 +640,6 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 
 	return skb->len;
-
-nlmsg_failure:
-	nlmsg_trim(skb, b);
-	return -1;
 }
 
 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -892,7 +897,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nlmsg_attrlen(cb->nlh, hdrlen))
 		bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
 
-	return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc);
+	return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
 }
 
 static inline int inet_diag_type2proto(int type)
@@ -909,7 +914,7 @@ static inline int inet_diag_type2proto(int type)
 
 static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct inet_diag_req *rc = NLMSG_DATA(cb->nlh);
+	struct inet_diag_req *rc = nlmsg_data(cb->nlh);
 	struct inet_diag_req_v2 req;
 	struct nlattr *bc = NULL;
 	int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +934,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
 static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
 			       const struct nlmsghdr *nlh)
 {
-	struct inet_diag_req *rc = NLMSG_DATA(nlh);
+	struct inet_diag_req *rc = nlmsg_data(nlh);
 	struct inet_diag_req_v2 req;
 
 	req.sdiag_family = rc->idiag_family;
@@ -996,7 +1001,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		}
 	}
 
-	return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
+	return inet_diag_get_exact(skb, h, nlmsg_data(h));
 }
 
 static const struct sock_diag_handler inet_diag_handler = {
-- 
cgit v1.2.3


From c2bd4baf410dafeba6aff8ca1cae94344551b0a3 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Jun 2012 21:30:49 -0700
Subject: netfilter: ipt_ULOG: Move away from NLMSG_PUT().

And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_ULOG.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d20165..99b3f53f16a7 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
 
 	pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
 
-	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
-	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
-			sizeof(*pm)+copy_len);
+	nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+			sizeof(*pm)+copy_len, 0);
+	if (!nlh) {
+		pr_debug("error during nlmsg_put\n");
+		goto out_unlock;
+	}
 	ub->qlen++;
 
-	pm = NLMSG_DATA(nlh);
+	pm = nlmsg_data(nlh);
 
 	/* We might not have a timestamp, get one */
 	if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
 			nlh->nlmsg_type = NLMSG_DONE;
 		ulog_send(groupnum);
 	}
-
+out_unlock:
 	spin_unlock_bh(&ulog_lock);
 
 	return;
 
-nlmsg_failure:
-	pr_debug("error during NLMSG_PUT\n");
 alloc_failure:
 	pr_debug("Error building netlink message\n");
 	spin_unlock_bh(&ulog_lock);
-- 
cgit v1.2.3


From f1caad274515ffd9841ac57ce9a7b5fc35bbf689 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 21 Jun 2012 04:36:39 +0000
Subject: netfilter: nf_conntrack: prepare l4proto->init_net cleanup

l4proto->init contain quite redundant code. We can simplify this
by adding a new parameter l3proto.

This patch prepares that code simplification.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h   | 2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +-
 net/netfilter/nf_conntrack_proto.c             | 5 +++--
 net/netfilter/nf_conntrack_proto_dccp.c        | 2 +-
 net/netfilter/nf_conntrack_proto_generic.c     | 2 +-
 net/netfilter/nf_conntrack_proto_gre.c         | 2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        | 4 ++--
 net/netfilter/nf_conntrack_proto_tcp.c         | 4 ++--
 net/netfilter/nf_conntrack_proto_udp.c         | 4 ++--
 net/netfilter/nf_conntrack_proto_udplite.c     | 2 +-
 11 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 81c52b5205f2..5dd60f2d02a1 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -97,7 +97,7 @@ struct nf_conntrack_l4proto {
 #endif
 	int	*net_id;
 	/* Init l4proto pernet data */
-	int (*init_net)(struct net *net);
+	int (*init_net)(struct net *net, u_int16_t proto);
 
 	/* Protocol name */
 	const char *name;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 041923cb67ad..76f7a2f657fe 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -337,7 +337,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int icmp_init_net(struct net *net)
+static int icmp_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_icmp_net *in = icmp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)in;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 63ed0121836c..807ae09df0ca 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -333,7 +333,7 @@ static struct ctl_table icmpv6_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int icmpv6_init_net(struct net *net)
+static int icmpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_icmp_net *in = icmpv6_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)in;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 9bd88aa3c74f..6f4b6f3deee5 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -461,7 +461,7 @@ int nf_conntrack_l4proto_register(struct net *net,
 	int ret = 0;
 
 	if (l4proto->init_net) {
-		ret = l4proto->init_net(net);
+		ret = l4proto->init_net(net, l4proto->l3proto);
 		if (ret < 0)
 			return ret;
 	}
@@ -515,7 +515,8 @@ int nf_conntrack_proto_init(struct net *net)
 {
 	unsigned int i;
 	int err;
-	err = nf_conntrack_l4proto_generic.init_net(net);
+	err = nf_conntrack_l4proto_generic.init_net(net,
+					nf_conntrack_l4proto_generic.l3proto);
 	if (err < 0)
 		return err;
 	err = nf_ct_l4proto_register_sysctl(net,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index c33f76af913f..52da8f0293b5 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -815,7 +815,7 @@ static struct ctl_table dccp_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int dccp_init_net(struct net *net)
+static int dccp_init_net(struct net *net, u_int16_t proto)
 {
 	struct dccp_net *dn = dccp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)dn;
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index bb0e74fe0fae..d1ed7b44e079 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -135,7 +135,7 @@ static struct ctl_table generic_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int generic_init_net(struct net *net)
+static int generic_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_generic_net *gn = generic_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)gn;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 5cac41c2fa09..b09b7af7f6f8 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -348,7 +348,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
 };
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
-static int gre_init_net(struct net *net)
+static int gre_init_net(struct net *net, u_int16_t proto)
 {
 	struct netns_proto_gre *net_gre = gre_pernet(net);
 	int i;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 8fb0582ad397..1e7836cead74 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -767,7 +767,7 @@ static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int sctpv4_init_net(struct net *net)
+static int sctpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int ret;
 	struct sctp_net *sn = sctp_pernet(net);
@@ -793,7 +793,7 @@ static int sctpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int sctpv6_init_net(struct net *net)
+static int sctpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct sctp_net *sn = sctp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)sn;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 99caa1304477..6db9d3c44820 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1593,7 +1593,7 @@ static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int tcpv4_init_net(struct net *net)
+static int tcpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	int ret = 0;
@@ -1631,7 +1631,7 @@ static int tcpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int tcpv6_init_net(struct net *net)
+static int tcpv6_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	struct nf_tcp_net *tn = tcp_pernet(net);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index a83cf93545cd..2b978e6fd1c2 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -283,7 +283,7 @@ static void udp_init_net_data(struct nf_udp_net *un)
 	}
 }
 
-static int udpv4_init_net(struct net *net)
+static int udpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int ret;
 	struct nf_udp_net *un = udp_pernet(net);
@@ -307,7 +307,7 @@ static int udpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int udpv6_init_net(struct net *net)
+static int udpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_udp_net *un = udp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)un;
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index b32e700f8dde..d33e51158039 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -234,7 +234,7 @@ static struct ctl_table udplite_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int udplite_init_net(struct net *net)
+static int udplite_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	struct udplite_net *un = udplite_pernet(net);
-- 
cgit v1.2.3


From a9082b45ad3c7284db974a108e7c1f1af7387d7b Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 21 Jun 2012 04:36:49 +0000
Subject: netfilter: nf_ct_icmp: add icmp_kmemdup[_compat]_sysctl_table
 function

Split sysctl function into smaller chucks to cleanup code and prepare
patches to reduce ifdef pollution.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 41 ++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 76f7a2f657fe..9c2095c5571f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -337,34 +337,57 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int icmp_init_net(struct net *net, u_int16_t proto)
+static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+				     struct nf_icmp_net *in)
 {
-	struct nf_icmp_net *in = icmp_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)in;
-	in->timeout = nf_ct_icmp_timeout;
-
 #ifdef CONFIG_SYSCTL
 	pn->ctl_table = kmemdup(icmp_sysctl_table,
 				sizeof(icmp_sysctl_table),
 				GFP_KERNEL);
 	if (!pn->ctl_table)
 		return -ENOMEM;
+
 	pn->ctl_table[0].data = &in->timeout;
+#endif
+	return 0;
+}
+
+static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+					    struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
 	pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
 				       sizeof(icmp_compat_sysctl_table),
 				       GFP_KERNEL);
-	if (!pn->ctl_compat_table) {
-		kfree(pn->ctl_table);
-		pn->ctl_table = NULL;
+	if (!pn->ctl_compat_table)
 		return -ENOMEM;
-	}
+
 	pn->ctl_compat_table[0].data = &in->timeout;
 #endif
 #endif
 	return 0;
 }
 
+static int icmp_init_net(struct net *net, u_int16_t proto)
+{
+	int ret;
+	struct nf_icmp_net *in = icmp_pernet(net);
+	struct nf_proto_net *pn = &in->pn;
+
+	in->timeout = nf_ct_icmp_timeout;
+
+	ret = icmp_kmemdup_compat_sysctl_table(pn, in);
+	if (ret < 0)
+		return ret;
+
+	ret = icmp_kmemdup_sysctl_table(pn, in);
+	if (ret < 0)
+		nf_ct_kfree_compat_sysctl_table(pn);
+
+	return ret;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 {
 	.l3proto		= PF_INET,
-- 
cgit v1.2.3


From c074da2810c118b3812f32d6754bd9ead2f169e7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Jun 2012 23:14:15 +0000
Subject: ipv4: tcp: dont cache unconfirmed intput dst

DDOS synflood attacks hit badly IP route cache.

On typical machines, this cache is allowed to hold up to 8 Millions dst
entries, 256 bytes for each, for a total of 2GB of memory.

rt_garbage_collect() triggers and tries to cleanup things.

Eventually route cache is disabled but machine is under fire and might
OOM and crash.

This patch exploits the new TCP early demux, to set a nocache
boolean in case incoming TCP frame is for a not yet ESTABLISHED or
TIMEWAIT socket.

This 'nocache' boolean is then used in case dst entry is not found in
route cache, to create an unhashed dst entry (DST_NOCACHE)

SYN-cookie-ACK sent use a similar mechanism (ipv4: tcp: dont cache
output dst for syncookies), so after this patch, a machine is able to
absorb a DDOS synflood attack without polluting its IP route cache.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h | 2 +-
 include/net/route.h    | 8 ++++----
 include/net/tcp.h      | 2 +-
 net/ipv4/arp.c         | 2 +-
 net/ipv4/ip_fragment.c | 2 +-
 net/ipv4/ip_input.c    | 5 +++--
 net/ipv4/route.c       | 8 +++++---
 net/ipv4/tcp_ipv4.c    | 4 +++-
 net/ipv4/xfrm4_input.c | 2 +-
 9 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926cbfb1..7cfc8f76914d 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
+	int			(*early_demux)(struct sk_buff *skb, bool *nocache);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/route.h b/include/net/route.h
index 47eb25ac1f7f..6361f9335774 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 }
 
 extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
+				 u8 tos, struct net_device *devin, bool noref, bool nocache);
 
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
+	return ip_route_input_common(skb, dst, src, tos, devin, false, false);
 }
 
 static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
+				       u8 tos, struct net_device *devin, bool nocache)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
+	return ip_route_input_common(skb, dst, src, tos, devin, true, nocache);
 }
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc4963d..917ed2e55e8c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb);
+extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757d..6a9795944369 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..978d55f256ea 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg)
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
 		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+					   iph->tos, head->dev, false);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2a39204de5bc..7be54c8dcbe2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -326,6 +326,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 */
 	if (skb_dst(skb) == NULL) {
 		int err = -ENOENT;
+		bool nocache = false;
 
 		if (sysctl_ip_early_demux) {
 			const struct net_protocol *ipprot;
@@ -334,13 +335,13 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			rcu_read_lock();
 			ipprot = rcu_dereference(inet_protos[protocol]);
 			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb);
+				err = ipprot->early_demux(skb, &nocache);
 			rcu_read_unlock();
 		}
 
 		if (err) {
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev);
+						   iph->tos, skb->dev, nocache);
 			if (unlikely(err)) {
 				if (err == -EXDEV)
 					NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 81533e3a23d1..fdc7900f9d7a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev)
+			       u8 tos, struct net_device *dev, bool nocache)
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2353,6 +2353,8 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+	if (nocache)
+		rth->dst.flags |= DST_NOCACHE;
 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
 	err = 0;
@@ -2395,7 +2397,7 @@ martian_source_keep_err:
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+			   u8 tos, struct net_device *dev, bool noref, bool nocache)
 {
 	struct rtable	*rth;
 	unsigned int	hash;
@@ -2471,7 +2473,7 @@ skip_cache:
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1781dc650b9d..33aabd4fc20f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,7 +1673,7 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -1719,6 +1719,8 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 				}
 			}
 		}
+	} else {
+		*no_dst_cache = true;
 	}
 
 out_err:
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..eee636b191b9 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+					 iph->tos, skb->dev, false))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.3


From 6e277ed59a45544786f4c4643a69527138c24fc1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 26 Jun 2012 23:36:12 +0000
Subject: inet_diag: Do not use RTA_PUT() macros

Also, no need to trim on nlmsg_put() failure, nothing has been added
yet.  We also want to use nlmsg_end(), nlmsg_new() and nlmsg_free().

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 112 ++++++++++++++++++++++++---------------------------
 1 file changed, 53 insertions(+), 59 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 27640e734cfd..38064a285cca 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
 	u16 userlocks;
 };
 
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
-	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
 static DEFINE_MUTEX(inet_diag_table_mutex);
 
 static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,28 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	const struct inet_sock *inet = inet_sk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr  *nlh;
+	struct nlattr *attr;
 	void *info = NULL;
-	struct inet_diag_meminfo  *minfo = NULL;
-	unsigned char	 *b = skb_tail_pointer(skb);
 	const struct inet_diag_handler *handler;
 	int ext = req->idiag_ext;
 
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), 0);
-	if (!nlh) {
-		nlmsg_trim(skb, b);
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
 		return -EMSGSIZE;
-	}
-	nlh->nlmsg_flags = nlmsg_flags;
 
 	r = nlmsg_data(nlh);
 	BUG_ON(sk->sk_state == TCP_TIME_WAIT);
 
-	if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
-		minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = sk->sk_state;
 	r->idiag_timer = 0;
@@ -117,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	 * hence this needs to be included regardless of socket family.
 	 */
 	if (ext & (1 << (INET_DIAG_TOS - 1)))
-		RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos);
+		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+			goto errout;
 
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -125,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 		*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
 		*(struct in6_addr *)r->id.idiag_dst = np->daddr;
+
 		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
-			RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass);
+			if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
+				goto errout;
 	}
 #endif
 
 	r->idiag_uid = sock_i_uid(sk);
 	r->idiag_inode = sock_i_ino(sk);
 
-	if (minfo) {
-		minfo->idiag_rmem = sk_rmem_alloc_get(sk);
-		minfo->idiag_wmem = sk->sk_wmem_queued;
-		minfo->idiag_fmem = sk->sk_forward_alloc;
-		minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+		struct inet_diag_meminfo minfo = {
+			.idiag_rmem = sk_rmem_alloc_get(sk),
+			.idiag_wmem = sk->sk_wmem_queued,
+			.idiag_fmem = sk->sk_forward_alloc,
+			.idiag_tmem = sk_wmem_alloc_get(sk),
+		};
+
+		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+			goto errout;
 	}
 
 	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
 		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
-			goto rtattr_failure;
+			goto errout;
 
 	if (icsk == NULL) {
 		handler->idiag_get_info(sk, r, NULL);
@@ -169,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #undef EXPIRES_IN_MS
 
-	if (ext & (1 << (INET_DIAG_INFO - 1)))
-		info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info));
+	if (ext & (1 << (INET_DIAG_INFO - 1))) {
+		attr = nla_reserve(skb, INET_DIAG_INFO,
+				   sizeof(struct tcp_info));
+		if (!attr)
+			goto errout;
 
-	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
-		const size_t len = strlen(icsk->icsk_ca_ops->name);
-
-		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
-		       icsk->icsk_ca_ops->name);
+		info = nla_data(attr);
 	}
 
+	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+		if (nla_put_string(skb, INET_DIAG_CONG,
+				   icsk->icsk_ca_ops->name) < 0)
+			goto errout;
+
 	handler->idiag_get_info(sk, r, info);
 
 	if (sk->sk_state < TCP_TIME_WAIT &&
@@ -186,11 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		icsk->icsk_ca_ops->get_info(sk, ext, skb);
 
 out:
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-	return skb->len;
+	return nlmsg_end(skb, nlh);
 
-rtattr_failure:
-	nlmsg_trim(skb, b);
+errout:
+	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
 }
 EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -211,20 +213,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 {
 	long tmo;
 	struct inet_diag_msg *r;
-	const unsigned char *previous_tail = skb_tail_pointer(skb);
-	struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq,
-					 unlh->nlmsg_type, sizeof(*r), 0);
+	struct nlmsghdr *nlh;
 
-	if (!nlh) {
-		nlmsg_trim(skb, previous_tail);
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
 		return -EMSGSIZE;
-	}
 
 	r = nlmsg_data(nlh);
 	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
 
-	nlh->nlmsg_flags = nlmsg_flags;
-
 	tmo = tw->tw_ttd - jiffies;
 	if (tmo < 0)
 		tmo = 0;
@@ -253,8 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 		*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
 	}
 #endif
-	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
-	return skb->len;
+
+	return nlmsg_end(skb, nlh);
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -303,20 +301,20 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	if (err)
 		goto out;
 
-	err = -ENOMEM;
-	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
-				     sizeof(struct inet_diag_meminfo) +
-				     sizeof(struct tcp_info) + 64)),
-			GFP_KERNEL);
-	if (!rep)
+	rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+			sizeof(struct inet_diag_meminfo) +
+			sizeof(struct tcp_info) + 64, GFP_KERNEL);
+	if (!rep) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	err = sk_diag_fill(sk, rep, req,
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
-		kfree_skb(rep);
+		nlmsg_free(rep);
 		goto out;
 	}
 	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
@@ -597,19 +595,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *inet = inet_sk(sk);
-	unsigned char *b = skb_tail_pointer(skb);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r), 0);
-	if (!nlh) {
-		nlmsg_trim(skb, b);
-		return -1;
-	}
-	nlh->nlmsg_flags = NLM_F_MULTI;
-	r = nlmsg_data(nlh);
+	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+			NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
 
+	r = nlmsg_data(nlh);
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = TCP_SYN_RECV;
 	r->idiag_timer = 1;
@@ -637,9 +632,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 		*(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
 	}
 #endif
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 
-	return skb->len;
+	return nlmsg_end(skb, nlh);
 }
 
 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
-- 
cgit v1.2.3


From 92a395e52f0c97f38ff675d7c698404bfc812dc2 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 26 Jun 2012 23:36:13 +0000
Subject: ipmr: Do not use RTA_PUT() macros

Also fix a needless skb tailroom check for a 4 bytes area
after after each rtnexthop block.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c94bbc6f2ba3..b4ac39f11d19 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2006,37 +2006,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 {
 	int ct;
 	struct rtnexthop *nhp;
-	u8 *b = skb_tail_pointer(skb);
-	struct rtattr *mp_head;
+	struct nlattr *mp_attr;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
 	if (c->mfc_parent >= MAXVIFS)
 		return -ENOENT;
 
-	if (VIF_EXISTS(mrt, c->mfc_parent))
-		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
+	if (VIF_EXISTS(mrt, c->mfc_parent) &&
+	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+		return -EMSGSIZE;
 
-	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
+	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
+		return -EMSGSIZE;
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
 		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
-			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
-				goto rtattr_failure;
-			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
+				nla_nest_cancel(skb, mp_attr);
+				return -EMSGSIZE;
+			}
+
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
 			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
-	mp_head->rta_type = RTA_MULTIPATH;
-	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+
+	nla_nest_end(skb, mp_attr);
+
 	rtm->rtm_type = RTN_MULTICAST;
 	return 1;
-
-rtattr_failure:
-	nlmsg_trim(skb, b);
-	return -EMSGSIZE;
 }
 
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
-- 
cgit v1.2.3


From 22911fc581f6a241e2897a7a8603e97344a6ec82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Jun 2012 00:23:44 +0000
Subject: net: skb_free_datagram_locked() doesnt drop all packets

dropwatch wrongly diagnose all received UDP packets as drops.

This patch removes trace_kfree_skb() done in skb_free_datagram_locked().

Locations calling skb_free_datagram_locked() should do it on their own.

As a result, drops are accounted on the right function.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c  |  1 -
 net/ipv4/udp.c       |  5 ++++-
 net/ipv6/udp.c       |  8 +++++---
 net/sunrpc/svcsock.c | 12 ++++++------
 4 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index ae6acf6a3dea..0337e2b76862 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -248,7 +248,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
-	trace_kfree_skb(skb, skb_free_datagram_locked);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db017efb76ea..ee37d47d472e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,6 +108,7 @@
 #include <net/xfrm.h>
 #include <trace/events/udp.h>
 #include <linux/static_key.h>
+#include <trace/events/skb.h>
 #include "udp_impl.h"
 
 struct udp_table udp_table __read_mostly;
@@ -1220,8 +1221,10 @@ try_again:
 			goto csum_copy_err;
 	}
 
-	if (err)
+	if (unlikely(err)) {
+		trace_kfree_skb(skb, udp_recvmsg);
 		goto out_free;
+	}
 
 	if (!peeked)
 		UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 051ad481973f..1ecd10249488 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -48,6 +48,7 @@
 
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <trace/events/skb.h>
 #include "udp_impl.h"
 
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
@@ -385,15 +386,16 @@ try_again:
 
 	if (skb_csum_unnecessary(skb))
 		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied       );
+					      msg->msg_iov, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
-	if (err)
+	if (unlikely(err)) {
+		trace_kfree_skb(skb, udpv6_recvmsg);
 		goto out_free;
-
+	}
 	if (!peeked) {
 		if (is_udp4)
 			UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a6de09de5d21..18bc130255a7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -43,6 +43,7 @@
 #include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <trace/events/skb.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/clnt.h>
@@ -619,6 +620,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 	if (!svc_udp_get_dest_address(rqstp, cmh)) {
 		net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
 				     cmh->cmsg_level, cmh->cmsg_type);
+out_free:
+		trace_kfree_skb(skb, svc_udp_recvfrom);
 		skb_free_datagram_locked(svsk->sk_sk, skb);
 		return 0;
 	}
@@ -630,8 +633,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
 			local_bh_enable();
 			/* checksum error */
-			skb_free_datagram_locked(svsk->sk_sk, skb);
-			return 0;
+			goto out_free;
 		}
 		local_bh_enable();
 		skb_free_datagram_locked(svsk->sk_sk, skb);
@@ -640,10 +642,8 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 		rqstp->rq_arg.head[0].iov_base = skb->data +
 			sizeof(struct udphdr);
 		rqstp->rq_arg.head[0].iov_len = len;
-		if (skb_checksum_complete(skb)) {
-			skb_free_datagram_locked(svsk->sk_sk, skb);
-			return 0;
-		}
+		if (skb_checksum_complete(skb))
+			goto out_free;
 		rqstp->rq_xprt_ctxt = skb;
 	}
 
-- 
cgit v1.2.3


From c10237e077cef50e925f052e49f3b4fead9d71f9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 27 Jun 2012 17:05:06 -0700
Subject: Revert "ipv4: tcp: dont cache unconfirmed intput dst"

This reverts commit c074da2810c118b3812f32d6754bd9ead2f169e7.

This change has several unwanted side effects:

1) Sockets will cache the DST_NOCACHE route in sk->sk_rx_dst and we'll
   thus never create a real cached route.

2) All TCP traffic will use DST_NOCACHE and never use the routing
   cache at all.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h | 2 +-
 include/net/route.h    | 8 ++++----
 include/net/tcp.h      | 2 +-
 net/ipv4/arp.c         | 2 +-
 net/ipv4/ip_fragment.c | 2 +-
 net/ipv4/ip_input.c    | 5 ++---
 net/ipv4/route.c       | 8 +++-----
 net/ipv4/tcp_ipv4.c    | 4 +---
 net/ipv4/xfrm4_input.c | 2 +-
 9 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 7cfc8f76914d..967b926cbfb1 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb, bool *nocache);
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/route.h b/include/net/route.h
index 6361f9335774..47eb25ac1f7f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 }
 
 extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref, bool nocache);
+				 u8 tos, struct net_device *devin, bool noref);
 
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, false, false);
+	return ip_route_input_common(skb, dst, src, tos, devin, false);
 }
 
 static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin, bool nocache)
+				       u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, true, nocache);
+	return ip_route_input_common(skb, dst, src, tos, devin, true);
 }
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 917ed2e55e8c..6660ffc4963d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache);
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6a9795944369..2e560f0c757d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 978d55f256ea..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg)
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
 		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev, false);
+					   iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7be54c8dcbe2..2a39204de5bc 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -326,7 +326,6 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 */
 	if (skb_dst(skb) == NULL) {
 		int err = -ENOENT;
-		bool nocache = false;
 
 		if (sysctl_ip_early_demux) {
 			const struct net_protocol *ipprot;
@@ -335,13 +334,13 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			rcu_read_lock();
 			ipprot = rcu_dereference(inet_protos[protocol]);
 			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb, &nocache);
+				err = ipprot->early_demux(skb);
 			rcu_read_unlock();
 		}
 
 		if (err) {
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev, nocache);
+						   iph->tos, skb->dev);
 			if (unlikely(err)) {
 				if (err == -EXDEV)
 					NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fdc7900f9d7a..81533e3a23d1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev, bool nocache)
+			       u8 tos, struct net_device *dev)
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2353,8 +2353,6 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	if (nocache)
-		rth->dst.flags |= DST_NOCACHE;
 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
 	err = 0;
@@ -2397,7 +2395,7 @@ martian_source_keep_err:
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref, bool nocache)
+			   u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable	*rth;
 	unsigned int	hash;
@@ -2473,7 +2471,7 @@ skip_cache:
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 33aabd4fc20f..1781dc650b9d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,7 +1673,7 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
+int tcp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -1719,8 +1719,6 @@ int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
 				}
 			}
 		}
-	} else {
-		*no_dst_cache = true;
 	}
 
 out_err:
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index eee636b191b9..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev, false))
+					 iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.3


From 160eb5a6b14ca2eab5c598bdbbb24c24624bad34 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 27 Jun 2012 22:01:22 -0700
Subject: ipv4: Kill early demux method return value.

It's completely unnecessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |  2 +-
 include/net/tcp.h      |  2 +-
 net/ipv4/ip_input.c    | 42 +++++++++++++++++++-----------------------
 net/ipv4/tcp_ipv4.c    | 19 ++++++-------------
 4 files changed, 27 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926cbfb1..057f2d315567 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
+	void			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc4963d..53fb7d814170 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb);
+extern void tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2a39204de5bc..b27d4440f523 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -320,33 +320,29 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
+	if (sysctl_ip_early_demux && !skb_dst(skb)) {
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[protocol]);
+		if (ipprot && ipprot->early_demux)
+			ipprot->early_demux(skb);
+		rcu_read_unlock();
+	}
+
 	/*
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */
-	if (skb_dst(skb) == NULL) {
-		int err = -ENOENT;
-
-		if (sysctl_ip_early_demux) {
-			const struct net_protocol *ipprot;
-			int protocol = iph->protocol;
-
-			rcu_read_lock();
-			ipprot = rcu_dereference(inet_protos[protocol]);
-			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb);
-			rcu_read_unlock();
-		}
-
-		if (err) {
-			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev);
-			if (unlikely(err)) {
-				if (err == -EXDEV)
-					NET_INC_STATS_BH(dev_net(skb->dev),
-							 LINUX_MIB_IPRPFILTER);
-				goto drop;
-			}
+	if (!skb_dst(skb)) {
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
+		if (unlikely(err)) {
+			if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
+			goto drop;
 		}
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1781dc650b9d..b4ae1c199f3e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,30 +1673,28 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb)
+void tcp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct net_device *dev;
 	struct sock *sk;
-	int err;
 
-	err = -ENOENT;
 	if (skb->pkt_type != PACKET_HOST)
-		goto out_err;
+		return;
 
 	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
-		goto out_err;
+		return;
 
 	iph = ip_hdr(skb);
 	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
 
 	if (th->doff < sizeof(struct tcphdr) / 4)
-		goto out_err;
+		return;
 
 	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
-		goto out_err;
+		return;
 
 	dev = skb->dev;
 	sk = __inet_lookup_established(net, &tcp_hashinfo,
@@ -1713,16 +1711,11 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 			if (dst) {
 				struct rtable *rt = (struct rtable *) dst;
 
-				if (rt->rt_iif == dev->ifindex) {
+				if (rt->rt_iif == dev->ifindex)
 					skb_dst_set_noref(skb, dst);
-					err = 0;
-				}
 			}
 		}
 	}
-
-out_err:
-	return err;
 }
 
 /*
-- 
cgit v1.2.3


From 70e7341673a47fb1525cfc7d6651cc98b5348928 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 03:21:41 -0700
Subject: ipv4: Show that ip_send_reply() is purely unicast routine.

Rename it to ip_send_unicast_reply() and add explicit 'saddr'
argument.

This removed one of the few users of rt->rt_spec_dst.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h     | 5 +++--
 net/ipv4/ip_output.c | 9 +++++----
 net/ipv4/tcp_ipv4.c  | 8 ++++----
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 50841bd6f10e..ec5cfde85e9a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -158,8 +158,9 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
-		   const struct ip_reply_arg *arg, unsigned int len);
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+			   __be32 saddr, const struct ip_reply_arg *arg,
+			   unsigned int len);
 
 struct ipv4_config {
 	int	log_martians;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0f3185a662c3..2630900e480a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1459,13 +1459,14 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *	Generic function to send a packet as reply to another packet.
- *	Used to send TCP resets so far. ICMP should use this function too.
+ *	Used to send TCP resets so far.
  *
  *	Should run single threaded per socket because it uses the sock
  *     	structure to pass arguments.
  */
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
-		   const struct ip_reply_arg *arg, unsigned int len)
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+			   __be32 saddr, const struct ip_reply_arg *arg,
+			   unsigned int len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_options_data replyopts;
@@ -1491,7 +1492,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
 			   ip_reply_arg_flowi_flags(arg),
-			   daddr, rt->rt_spec_dst,
+			   daddr, saddr,
 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b4ae1c199f3e..64568fa21d05 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 
 	net = dev_net(skb_dst(skb)->dev);
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
-		      &arg, arg.iov[0].iov_len);
+	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
-		      &arg, arg.iov[0].iov_len);
+	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 }
-- 
cgit v1.2.3


From 35ebf65e851c6d9731abc6362b189858eb59f4d3 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 03:59:11 -0700
Subject: ipv4: Create and use fib_compute_spec_dst() helper.

The specific destination is the host we direct unicast replies to.
Usually this is the original packet source address, but if we are
responding to a multicast or broadcast packet we have to use something
different.

Specifically we must use the source address we would use if we were to
send a packet to the unicast source of the original packet.

The routing cache precomputes this value, but we want to remove that
precomputation because it creates a hard dependency on the expensive
rpfilter source address validation which we'd like to make cheaper.

There are only three places where this matters:

1) ICMP replies.

2) pktinfo CMSG

3) IP options

Now there will be no real users of rt->rt_spec_dst and we can simply
remove it altogether.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  1 +
 net/ipv4/fib_frontend.c | 29 +++++++++++++++++++++++++++++
 net/ipv4/icmp.c         |  6 ++++--
 net/ipv4/ip_options.c   | 22 +++++++++++-----------
 net/ipv4/ip_sockglue.c  |  7 ++++---
 5 files changed, 49 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4b347c0ca094..1687b3de54ff 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -230,6 +230,7 @@ extern struct fib_table *fib_get_table(struct net *net, u32 id);
 /* Exported by fib_frontend.c */
 extern const struct nla_policy rtm_ipv4_policy[];
 extern void		ip_fib_init(void);
+extern __be32 fib_compute_spec_dst(struct sk_buff *skb);
 extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       u8 tos, int oif, struct net_device *dev,
 			       __be32 *spec_dst, u32 *itag);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..451939b60c54 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -180,6 +180,35 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct flowi4 fl4;
+	struct net *net;
+
+	if (skb->pkt_type != PACKET_BROADCAST &&
+	    skb->pkt_type != PACKET_MULTICAST)
+		return ip_hdr(skb)->daddr;
+
+	in_dev = __in_dev_get_rcu(dev);
+	BUG_ON(!in_dev);
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = 0;
+	fl4.daddr = ip_hdr(skb)->saddr;
+	fl4.saddr = ip_hdr(skb)->daddr;
+	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+
+	net = dev_net(dev);
+	if (!fib_lookup(net, &fl4, &res))
+		return FIB_RES_PREFSRC(net, res);
+	else
+		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 49a74cc79dc8..4bce5a2830aa 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <net/ip_fib.h>
 
 /*
  *	Build xmit assembly blocks
@@ -333,7 +334,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct flowi4 fl4;
 	struct sock *sk;
 	struct inet_sock *inet;
-	__be32 daddr;
+	__be32 daddr, saddr;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -347,6 +348,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 
 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = ip_hdr(skb)->saddr;
+	saddr = fib_compute_spec_dst(skb);
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
 	if (icmp_param->replyopts.opt.opt.optlen) {
@@ -356,7 +358,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	}
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
-	fl4.saddr = rt->rt_spec_dst;
+	fl4.saddr = saddr;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 708b99494e23..766dfe56885a 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
 #include <net/icmp.h>
 #include <net/route.h>
 #include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
 
 /*
  * Write options to IP header, record destination address to
@@ -104,7 +105,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 	sptr = skb_network_header(skb);
 	dptr = dopt->__data;
 
-	daddr = skb_rtable(skb)->rt_spec_dst;
+	daddr = fib_compute_spec_dst(skb);
 
 	if (sopt->rr) {
 		optlen  = sptr[sopt->rr+1];
@@ -250,15 +251,14 @@ void ip_options_fragment(struct sk_buff *skb)
 int ip_options_compile(struct net *net,
 		       struct ip_options *opt, struct sk_buff *skb)
 {
-	int l;
-	unsigned char *iph;
-	unsigned char *optptr;
-	int optlen;
+	__be32 spec_dst = (__force __be32) 0;
 	unsigned char *pp_ptr = NULL;
-	struct rtable *rt = NULL;
+	unsigned char *optptr;
+	unsigned char *iph;
+	int optlen, l;
 
 	if (skb != NULL) {
-		rt = skb_rtable(skb);
+		spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -330,8 +330,8 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 2;
 					goto error;
 				}
-				if (rt) {
-					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+				if (skb) {
+					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
 				optptr[2] += 4;
@@ -372,8 +372,8 @@ int ip_options_compile(struct net *net,
 						goto error;
 					}
 					opt->ts = optptr - iph;
-					if (rt)  {
-						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+					if (skb)  {
+						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}
 					opt->ts_needaddr = 1;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0d11f234d615..de29f46f68b0 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/transp_v6.h>
 #endif
+#include <net/ip_fib.h>
 
 #include <linux/errqueue.h>
 #include <asm/uaccess.h>
@@ -1019,8 +1020,8 @@ e_inval:
  * @sk: socket
  * @skb: buffer
  *
- * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst
- * in skb->cb[] before dst drop.
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
  * This way, receiver doesnt make cache line misses to read rtable.
  */
 void ipv4_pktinfo_prepare(struct sk_buff *skb)
@@ -1030,7 +1031,7 @@ void ipv4_pktinfo_prepare(struct sk_buff *skb)
 
 	if (rt) {
 		pktinfo->ipi_ifindex = rt->rt_iif;
-		pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst;
+		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
 		pktinfo->ipi_ifindex = 0;
 		pktinfo->ipi_spec_dst.s_addr = 0;
-- 
cgit v1.2.3


From 41347dcdd81988b8e60853257b2875285cc17a4e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 04:05:27 -0700
Subject: ipv4: Kill rt->rt_spec_dst, no longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 +-
 include/net/route.h     |  1 -
 net/ipv4/fib_frontend.c |  9 ++-------
 net/ipv4/route.c        | 38 +++++++++-----------------------------
 net/ipv4/xfrm4_policy.c |  1 -
 5 files changed, 12 insertions(+), 39 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1687b3de54ff..9e6c26d4ba4c 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -233,7 +233,7 @@ extern void		ip_fib_init(void);
 extern __be32 fib_compute_spec_dst(struct sk_buff *skb);
 extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       u8 tos, int oif, struct net_device *dev,
-			       __be32 *spec_dst, u32 *itag);
+			       u32 *itag);
 extern void fib_select_default(struct fib_result *res);
 
 /* Exported by fib_semantics.c */
diff --git a/include/net/route.h b/include/net/route.h
index 47eb25ac1f7f..211e2665139b 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -65,7 +65,6 @@ struct rtable {
 	__be32			rt_gateway;
 
 	/* Miscellaneous cached information */
-	__be32			rt_spec_dst; /* RFC1122 specific destination */
 	u32			rt_peer_genid;
 	unsigned long		_peer; /* long-living peer info */
 	struct fib_info		*fi; /* for client ref to shared metrics */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 451939b60c54..63b11ca54d95 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -218,8 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
  * called with rcu_read_lock()
  */
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
-			int oif, struct net_device *dev, __be32 *spec_dst,
-			u32 *itag)
+			int oif, struct net_device *dev, u32 *itag)
 {
 	struct in_device *in_dev;
 	struct flowi4 fl4;
@@ -258,7 +257,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 		if (res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
-	*spec_dst = FIB_RES_PREFSRC(net, res);
 	fib_combine_itag(itag, &res);
 	dev_match = false;
 
@@ -287,17 +285,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 
 	ret = 0;
 	if (fib_lookup(net, &fl4, &res) == 0) {
-		if (res.type == RTN_UNICAST) {
-			*spec_dst = FIB_RES_PREFSRC(net, res);
+		if (res.type == RTN_UNICAST)
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-		}
 	}
 	return ret;
 
 last_resort:
 	if (rpf)
 		goto e_rpf;
-	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 81533e3a23d1..83d56a016625 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -440,7 +440,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			r->rt_key_tos,
 			-1,
 			HHUptod,
-			r->rt_spec_dst, &len);
+			0, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
 	}
@@ -1978,7 +1978,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
 	unsigned int hash;
 	struct rtable *rth;
-	__be32 spec_dst;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
 	int err;
@@ -1999,10 +1998,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_zeronet(saddr)) {
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
-		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	} else {
-		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
-					  &itag);
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag);
 		if (err < 0)
 			goto e_err;
 	}
@@ -2029,7 +2026,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_oif	= 0;
 	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
-	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
 	rth->fi = NULL;
@@ -2093,7 +2089,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	int err;
 	struct in_device *out_dev;
 	unsigned int flags = 0;
-	__be32 spec_dst;
 	u32 itag;
 
 	/* get a working reference to the output device */
@@ -2105,7 +2100,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 
 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
-				  in_dev->dev, &spec_dst, &itag);
+				  in_dev->dev, &itag);
 	if (err < 0) {
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
@@ -2157,7 +2152,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_oif 	= 0;
 	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
-	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, &res->table->tb_peers);
 	rth->fi = NULL;
@@ -2223,7 +2217,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	u32		itag = 0;
 	struct rtable	*rth;
 	unsigned int	hash;
-	__be32		spec_dst;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
 
@@ -2281,12 +2274,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
 					  net->loopback_dev->ifindex,
-					  dev, &spec_dst, &itag);
+					  dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
-		spec_dst = daddr;
 		goto local_input;
 	}
 
@@ -2302,11 +2294,8 @@ brd_input:
 	if (skb->protocol != htons(ETH_P_IP))
 		goto e_inval;
 
-	if (ipv4_is_zeronet(saddr))
-		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	else {
-		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
-					  &itag);
+	if (!ipv4_is_zeronet(saddr)) {
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
 		if (err)
@@ -2344,7 +2333,6 @@ local_input:
 	rth->rt_oif	= 0;
 	rth->rt_mark    = skb->mark;
 	rth->rt_gateway	= daddr;
-	rth->rt_spec_dst= spec_dst;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, net->ipv4.peers);
 	rth->fi = NULL;
@@ -2362,7 +2350,6 @@ local_input:
 
 no_route:
 	RT_CACHE_STAT_INC(in_no_route);
-	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	res.type = RTN_UNREACHABLE;
 	if (err == -ESRCH)
 		err = -ENETUNREACH;
@@ -2545,7 +2532,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_oif	= orig_oif;
 	rth->rt_mark    = fl4->flowi4_mark;
 	rth->rt_gateway = fl4->daddr;
-	rth->rt_spec_dst= fl4->saddr;
 	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, (res->table ?
 			   &res->table->tb_peers :
@@ -2554,12 +2540,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
-	if (flags & RTCF_LOCAL) {
+	if (flags & RTCF_LOCAL)
 		rth->dst.input = ip_local_deliver;
-		rth->rt_spec_dst = fl4->daddr;
-	}
 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
-		rth->rt_spec_dst = fl4->saddr;
 		if (flags & RTCF_LOCAL &&
 		    !(dev_out->flags & IFF_LOOPBACK)) {
 			rth->dst.output = ip_mc_output;
@@ -2890,7 +2873,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_dst = ort->rt_dst;
 		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
-		rt->rt_spec_dst = ort->rt_spec_dst;
 		rt_transfer_peer(rt, ort);
 		rt->fi = ort->fi;
 		if (rt->fi)
@@ -2965,10 +2947,8 @@ static int rt_fill_info(struct net *net,
 	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
 		goto nla_put_failure;
 #endif
-	if (rt_is_input_route(rt)) {
-		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
-			goto nla_put_failure;
-	} else if (rt->rt_src != rt->rt_key_src) {
+	if (!rt_is_input_route(rt) &&
+	    rt->rt_src != rt->rt_key_src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8855d8268552..9815ea0bca7f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,7 +100,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
-	xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
 
 	return 0;
 }
-- 
cgit v1.2.3


From a207a4b2e8067cbc7f33924e7f2c0fa4ef43b459 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 18:33:24 -0700
Subject: ipv4: Fix bugs in fib_compute_spec_dst().

Based upon feedback from Julian Anastasov.

1) Use route flags to determine multicast/broadcast, not the
   packet flags.

2) Leave saddr unspecified in flow key.

3) Adjust how we invoke inet_select_addr().  Pass ip_hdr(skb)->saddr as
   second arg, and if it was zeronet use link scope.

4) Use loopback as input interface in flow key.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 63b11ca54d95..1d13217e01ff 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -185,28 +185,36 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev;
 	struct fib_result res;
+	struct rtable *rt;
 	struct flowi4 fl4;
 	struct net *net;
+	int scope;
 
-	if (skb->pkt_type != PACKET_BROADCAST &&
-	    skb->pkt_type != PACKET_MULTICAST)
+	rt = skb_rtable(skb);
+	if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
 		return ip_hdr(skb)->daddr;
 
 	in_dev = __in_dev_get_rcu(dev);
 	BUG_ON(!in_dev);
-	fl4.flowi4_oif = 0;
-	fl4.flowi4_iif = 0;
-	fl4.daddr = ip_hdr(skb)->saddr;
-	fl4.saddr = ip_hdr(skb)->daddr;
-	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
-	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
-	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
 
 	net = dev_net(dev);
-	if (!fib_lookup(net, &fl4, &res))
-		return FIB_RES_PREFSRC(net, res);
-	else
-		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+
+	scope = RT_SCOPE_UNIVERSE;
+	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+		fl4.flowi4_oif = 0;
+		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.daddr = ip_hdr(skb)->saddr;
+		fl4.saddr = 0;
+		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+		fl4.flowi4_scope = scope;
+		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+		if (!fib_lookup(net, &fl4, &res))
+			return FIB_RES_PREFSRC(net, res);
+	} else {
+		scope = RT_SCOPE_LINK;
+	}
+
+	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
-- 
cgit v1.2.3


From 9e56e3800ea42e78b7c816bdd2d87d047be80541 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 18:54:02 -0700
Subject: ipv4: Adjust in_dev handling in fib_validate_source()

Checking for in_dev being NULL is pointless.

In fact, all of our callers have in_dev precomputed already,
so just pass it in and remove the NULL checking.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c | 27 ++++++++++-----------------
 net/ipv4/route.c        | 10 ++++++----
 3 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9e6c26d4ba4c..619f68a7185c 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -233,7 +233,7 @@ extern void		ip_fib_init(void);
 extern __be32 fib_compute_spec_dst(struct sk_buff *skb);
 extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       u8 tos, int oif, struct net_device *dev,
-			       u32 *itag);
+			       struct in_device *idev, u32 *itag);
 extern void fib_select_default(struct fib_result *res);
 
 /* Exported by fib_semantics.c */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d13217e01ff..c84cff52021e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -226,15 +226,14 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
  * called with rcu_read_lock()
  */
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
-			int oif, struct net_device *dev, u32 *itag)
+			int oif, struct net_device *dev, struct in_device *idev,
+			u32 *itag)
 {
-	struct in_device *in_dev;
-	struct flowi4 fl4;
+	int ret, no_addr, rpf, accept_local;
 	struct fib_result res;
-	int no_addr, rpf, accept_local;
-	bool dev_match;
-	int ret;
+	struct flowi4 fl4;
 	struct net *net;
+	bool dev_match;
 
 	fl4.flowi4_oif = 0;
 	fl4.flowi4_iif = oif;
@@ -244,19 +243,13 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
 	no_addr = rpf = accept_local = 0;
-	in_dev = __in_dev_get_rcu(dev);
-	if (in_dev) {
-		no_addr = in_dev->ifa_list == NULL;
-
-		/* Ignore rp_filter for packets protected by IPsec. */
-		rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
+	no_addr = idev->ifa_list == NULL;
 
-		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
-		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-	}
+	/* Ignore rp_filter for packets protected by IPsec. */
+	rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 
-	if (in_dev == NULL)
-		goto e_inval;
+	accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 
 	net = dev_net(dev);
 	if (fib_lookup(net, &fl4, &res))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 83d56a016625..919d69e60bab 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1999,7 +1999,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 	} else {
-		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag);
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+					  in_dev, &itag);
 		if (err < 0)
 			goto e_err;
 	}
@@ -2100,7 +2101,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 
 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
-				  in_dev->dev, &itag);
+				  in_dev->dev, in_dev, &itag);
 	if (err < 0) {
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
@@ -2274,7 +2275,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
 					  net->loopback_dev->ifindex,
-					  dev, &itag);
+					  dev, in_dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
 		if (err)
@@ -2295,7 +2296,8 @@ brd_input:
 		goto e_inval;
 
 	if (!ipv4_is_zeronet(saddr)) {
-		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag);
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+					  in_dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
 		if (err)
-- 
cgit v1.2.3


From 3085a4b7d33eb3111244173d1383256e94d249a5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Jun 2012 22:17:39 -0700
Subject: ipv4: Remove extraneous assignment of dst->tclassid.

We already set it several lines above.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 919d69e60bab..6a5afc715558 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2327,9 +2327,6 @@ local_input:
 	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
-#ifdef CONFIG_IP_ROUTE_CLASSID
-	rth->dst.tclassid = itag;
-#endif
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-- 
cgit v1.2.3


From 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 29 Jun 2012 01:32:45 -0700
Subject: ipv4: Elide fib_validate_source() completely when possible.

If rpfilter is off (or the SKB has an IPSEC path) and there are not
tclassid users, we don't have to do anything at all when
fib_validate_source() is invoked besides setting the itag to zero.

We monitor tclassid uses with a counter (modified only under RTNL and
marked __read_mostly) and we protect the fib_validate_source() real
work with a test against this counter and whether rpfilter is to be
done.

Having a way to know whether we need no tclassid processing or not
also opens the door for future optimized rpfilter algorithms that do
not perform full FIB lookups.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h  |  1 +
 include/net/ip_fib.h     |  5 +++++
 net/core/fib_rules.c     |  4 ++++
 net/ipv4/fib_frontend.c  | 32 ++++++++++++++++++++++++--------
 net/ipv4/fib_rules.c     | 16 +++++++++++++++-
 net/ipv4/fib_semantics.c | 10 ++++++++++
 6 files changed, 59 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 075f1e3a0fed..e361f4882426 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -52,6 +52,7 @@ struct fib_rules_ops {
 					     struct sk_buff *,
 					     struct fib_rule_hdr *,
 					     struct nlattr **);
+	void			(*delete)(struct fib_rule *);
 	int			(*compare)(struct fib_rule *,
 					   struct fib_rule_hdr *,
 					   struct nlattr **);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 619f68a7185c..3dc7c96bbeab 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -235,6 +235,11 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       u8 tos, int oif, struct net_device *dev,
 			       struct in_device *idev, u32 *itag);
 extern void fib_select_default(struct fib_result *res);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+extern int fib_num_tclassid_users;
+#else
+#define fib_num_tclassid_users 0
+#endif
 
 /* Exported by fib_semantics.c */
 extern int ip_fib_check_default(__be32 gw, struct net_device *dev);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 72cceb79d0d4..ab7db83236c9 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 
 	list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
 		list_del_rcu(&rule->list);
+		if (ops->delete)
+			ops->delete(rule);
 		fib_rule_put(rule);
 	}
 }
@@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
 				   NETLINK_CB(skb).pid);
+		if (ops->delete)
+			ops->delete(rule);
 		fib_rule_put(rule);
 		flush_route_cache(ops);
 		rules_ops_put(ops);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c84cff52021e..ae528d1b293a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
 #include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
+#include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -217,6 +218,10 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
+#ifdef CONFIG_IP_ROUTE_CLASSID
+int fib_num_tclassid_users __read_mostly;
+#endif
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -225,11 +230,11 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
  * - check, that packet arrived from expected physical interface.
  * called with rcu_read_lock()
  */
-int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
-			int oif, struct net_device *dev, struct in_device *idev,
-			u32 *itag)
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+				 u8 tos, int oif, struct net_device *dev,
+				 int rpf, struct in_device *idev, u32 *itag)
 {
-	int ret, no_addr, rpf, accept_local;
+	int ret, no_addr, accept_local;
 	struct fib_result res;
 	struct flowi4 fl4;
 	struct net *net;
@@ -242,12 +247,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
 	fl4.flowi4_tos = tos;
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
-	no_addr = rpf = accept_local = 0;
+	no_addr = accept_local = 0;
 	no_addr = idev->ifa_list == NULL;
 
-	/* Ignore rp_filter for packets protected by IPsec. */
-	rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
-
 	accept_local = IN_DEV_ACCEPT_LOCAL(idev);
 	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 
@@ -303,6 +305,20 @@ e_rpf:
 	return -EXDEV;
 }
 
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+			u8 tos, int oif, struct net_device *dev,
+			struct in_device *idev, u32 *itag)
+{
+	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+	if (!r && !fib_num_tclassid_users) {
+		*itag = 0;
+		return 0;
+	}
+	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
 {
 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..b23fd952c84f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 		rule4->dst = nla_get_be32(tb[FRA_DST]);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	if (tb[FRA_FLOW])
+	if (tb[FRA_FLOW]) {
 		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+		if (rule4->tclassid)
+			fib_num_tclassid_users++;
+	}
 #endif
 
 	rule4->src_len = frh->src_len;
@@ -184,6 +187,16 @@ errout:
 	return err;
 }
 
+static void fib4_rule_delete(struct fib_rule *rule)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	if (rule4->tclassid)
+		fib_num_tclassid_users--;
+#endif
+}
+
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 			     struct nlattr **tb)
 {
@@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
 	.action		= fib4_rule_action,
 	.match		= fib4_rule_match,
 	.configure	= fib4_rule_configure,
+	.delete		= fib4_rule_delete,
 	.compare	= fib4_rule_compare,
 	.fill		= fib4_rule_fill,
 	.default_pref	= fib_default_rule_pref,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 415f8230fc88..c46c20b6b0b6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi)
 		return;
 	}
 	fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	change_nexthops(fi) {
+		if (nexthop_nh->nh_tclassid)
+			fib_num_tclassid_users--;
+	} endfor_nexthops(fi);
+#endif
 	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
 
@@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+			if (nexthop_nh->nh_tclassid)
+				fib_num_tclassid_users++;
 #endif
 		}
 
@@ -815,6 +823,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		nh->nh_flags = cfg->fc_flags;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		nh->nh_tclassid = cfg->fc_flow;
+		if (nh->nh_tclassid)
+			fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		nh->nh_weight = 1;
-- 
cgit v1.2.3


From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 29 Jun 2012 06:15:21 +0000
Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create

This patch adds the following structure:

struct netlink_kernel_cfg {
        unsigned int    groups;
        void            (*input)(struct sk_buff *skb);
        struct mutex    *cb_mutex;
};

That can be passed to netlink_kernel_create to set optional configurations
for netlink kernel sockets.

I've populated this structure by looking for NULL and zero parameters at the
existing code. The remaining parameters that always need to be set are still
left in the original interface.

That includes optional parameters for the netlink socket creation. This allows
easy extensibility of this interface in the future.

This patch also adapts all callers to use this new interface.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/crypto_user.c                |  7 +++++--
 drivers/connector/connector.c       | 13 +++++++++----
 drivers/infiniband/core/netlink.c   |  7 +++++--
 drivers/scsi/scsi_netlink.c         |  7 +++++--
 drivers/scsi/scsi_transport_iscsi.c |  9 ++++++---
 drivers/staging/gdm72xx/netlink_k.c |  6 ++++--
 include/linux/netlink.h             | 15 ++++++++++-----
 kernel/audit.c                      |  7 +++++--
 lib/kobject_uevent.c                |  5 ++++-
 net/bridge/netfilter/ebt_ulog.c     |  6 ++++--
 net/core/rtnetlink.c                |  9 +++++++--
 net/core/sock_diag.c                |  8 ++++++--
 net/decnet/netfilter/dn_rtmsg.c     |  8 +++++---
 net/ipv4/fib_frontend.c             |  7 +++++--
 net/ipv4/netfilter/ipt_ULOG.c       |  8 +++++---
 net/netfilter/nfnetlink.c           |  7 +++++--
 net/netlink/af_netlink.c            | 16 ++++++++++------
 net/netlink/genetlink.c             | 10 +++++++---
 net/xfrm/xfrm_user.c                |  7 +++++--
 security/selinux/netlink.c          |  6 +++++-
 20 files changed, 117 insertions(+), 51 deletions(-)

(limited to 'net/ipv4')

diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index 5a37eadb4e56..ba2c611154af 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb)
 
 static int __init crypto_user_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input	= crypto_netlink_rcv,
+	};
+
 	crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO,
-					    0, crypto_netlink_rcv,
-					    NULL, THIS_MODULE);
+					    THIS_MODULE, &cfg);
 	if (!crypto_nlsk)
 		return -ENOMEM;
 
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 34e0e9e4d913..116cf8d02834 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = {
 	.release = single_release
 };
 
+static struct cn_dev cdev = {
+	.input   = cn_rx_skb,
+};
+
 static int __devinit cn_init(void)
 {
 	struct cn_dev *dev = &cdev;
-
-	dev->input = cn_rx_skb;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= CN_NETLINK_USERS + 0xf,
+		.input	= dev->input,
+	};
 
 	dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR,
-					 CN_NETLINK_USERS + 0xf,
-					 dev->input, NULL, THIS_MODULE);
+					 THIS_MODULE, &cfg);
 	if (!dev->nls)
 		return -EIO;
 
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 1e691dca1820..3ae2bfd31015 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb)
 
 int __init ibnl_init(void)
 {
-	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv,
-				    NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= ibnl_rcv,
+	};
+
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg);
 	if (!nls) {
 		pr_warn("Failed to create netlink socket\n");
 		return -ENOMEM;
diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c
index c77628afbf9f..8818dd681c19 100644
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -486,6 +486,10 @@ void
 scsi_netlink_init(void)
 {
 	int error;
+	struct netlink_kernel_cfg cfg = {
+		.input	= scsi_nl_rcv_msg,
+		.groups	= SCSI_NL_GRP_CNT,
+	};
 
 	INIT_LIST_HEAD(&scsi_nl_drivers);
 
@@ -497,8 +501,7 @@ scsi_netlink_init(void)
 	}
 
 	scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT,
-				SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL,
-				THIS_MODULE);
+					     THIS_MODULE, &cfg);
 	if (!scsi_nl_sock) {
 		printk(KERN_ERR "%s: register of receive handler failed\n",
 				__func__);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 1cf640e575da..6042954d8f3b 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport);
 static __init int iscsi_transport_init(void)
 {
 	int err;
-
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+		.input	= iscsi_if_rx,
+	};
 	printk(KERN_INFO "Loading iSCSI transport class v%s.\n",
 		ISCSI_TRANSPORT_VERSION);
 
@@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void)
 	if (err)
 		goto unregister_conn_class;
 
-	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx,
-				    NULL, THIS_MODULE);
+	nls = netlink_kernel_create(&init_net, NETLINK_ISCSI,
+				    THIS_MODULE, &cfg);
 	if (!nls) {
 		err = -ENOBUFS;
 		goto unregister_session_class;
diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c
index 2489bb5597ca..87c3a07ed80e 100644
--- a/drivers/staging/gdm72xx/netlink_k.c
+++ b/drivers/staging/gdm72xx/netlink_k.c
@@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type,
 						void *msg, int len))
 {
 	struct sock *sock;
+	struct netlink_kernel_cfg cfg = {
+		.input  = netlink_rcv,
+	};
 
 #if !defined(DEFINE_MUTEX)
 	init_MUTEX(&netlink_mutex);
 #endif
 
-	sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL,
-					THIS_MODULE);
+	sock = netlink_kernel_create(&init_net, unit, THIS_MODULE, &cfg);
 
 	if (sock)
 		rcv_cb = cb;
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index ed33f0901bc2..6085e4919cb3 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -174,11 +174,16 @@ struct netlink_skb_parms {
 extern void netlink_table_grab(void);
 extern void netlink_table_ungrab(void);
 
-extern struct sock *netlink_kernel_create(struct net *net,
-					  int unit,unsigned int groups,
-					  void (*input)(struct sk_buff *skb),
-					  struct mutex *cb_mutex,
-					  struct module *module);
+/* optional Netlink kernel configuration parameters */
+struct netlink_kernel_cfg {
+	unsigned int	groups;
+	void		(*input)(struct sk_buff *skb);
+	struct mutex	*cb_mutex;
+};
+
+extern struct sock *netlink_kernel_create(struct net *net, int unit,
+					  struct module *module,
+					  struct netlink_kernel_cfg *cfg);
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
diff --git a/kernel/audit.c b/kernel/audit.c
index 30b252a1fb61..4a3f28d2ca65 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff  *skb)
 static int __init audit_init(void)
 {
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.input	= audit_receive,
+	};
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+					   THIS_MODULE, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 1a91efa6d121..0401d2916d9f 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var);
 static int uevent_net_init(struct net *net)
 {
 	struct uevent_sock *ue_sk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= 1,
+	};
 
 	ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL);
 	if (!ue_sk)
 		return -ENOMEM;
 
 	ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT,
-					  1, NULL, NULL, THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ue_sk->sk) {
 		printk(KERN_ERR
 		       "kobject_uevent: unable to create netlink socket!\n");
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 1bd173218f7b..374bdcd77039 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void)
 {
 	int ret;
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= EBT_ULOG_MAXNLGROUPS,
+	};
 
 	if (nlbufsiz >= 128*1024) {
 		pr_warning("Netlink buffer has to be <= 128kB,"
@@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void)
 	}
 
 	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
-					  EBT_ULOG_MAXNLGROUPS, NULL, NULL,
-					  THIS_MODULE);
+					  THIS_MODULE, &cfg);
 	if (!ebtulognl)
 		ret = -ENOMEM;
 	else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bc8a1cdaac98..2b325c340b44 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2353,8 +2353,13 @@ static struct notifier_block rtnetlink_dev_notifier = {
 static int __net_init rtnetlink_net_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
-				   rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.groups		= RTNLGRP_MAX,
+		.input		= rtnetlink_rcv,
+		.cb_mutex	= &rtnl_mutex,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg);
 	if (!sk)
 		return -ENOMEM;
 	net->rtnl = sk;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index ff2967acbfae..07a29eb34a41 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -171,8 +171,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk);
 
 static int __init sock_diag_init(void)
 {
-	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
-					sock_diag_rcv, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= sock_diag_rcv,
+	};
+
+	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG,
+					       THIS_MODULE, &cfg);
 	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
 }
 
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index b8f7f5b8c350..11db0ecf342f 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = {
 static int __init dn_rtmsg_init(void)
 {
 	int rv = 0;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= DNRNG_NLGRP_MAX,
+		.input	= dnrmg_receive_user_skb,
+	};
 
 	dnrmg = netlink_kernel_create(&init_net,
-				      NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
-				      dnrmg_receive_user_skb,
-				      NULL, THIS_MODULE);
+				      NETLINK_DNRTMSG, THIS_MODULE, &cfg);
 	if (dnrmg == NULL) {
 		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
 		return -ENOMEM;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ae528d1b293a..3e11ea225dad 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -976,8 +976,11 @@ static void nl_fib_input(struct sk_buff *skb)
 static int __net_init nl_fib_lookup_init(struct net *net)
 {
 	struct sock *sk;
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
-				   nl_fib_input, NULL, THIS_MODULE);
+	struct netlink_kernel_cfg cfg = {
+		.input	= nl_fib_input,
+	};
+
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 99b3f53f16a7..1109f7f6c254 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
 static int __init ulog_tg_init(void)
 {
 	int ret, i;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= ULOG_MAXNLGROUPS,
+	};
 
 	pr_debug("init module\n");
 
@@ -393,9 +396,8 @@ static int __init ulog_tg_init(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
 		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net,
-					NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
-					NULL, THIS_MODULE);
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+					THIS_MODULE, &cfg);
 	if (!nflognl)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 3e797d1fcb94..700e4616a098 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int __net_init nfnetlink_net_init(struct net *net)
 {
 	struct sock *nfnl;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= NFNLGRP_MAX,
+		.input	= nfnetlink_rcv,
+	};
 
-	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX,
-				     nfnetlink_rcv, NULL, THIS_MODULE);
+	nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg);
 	if (!nfnl)
 		return -ENOMEM;
 	net->nfnl_stash = nfnl;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b3025a603d56..43a124feaad8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len)
  */
 
 struct sock *
-netlink_kernel_create(struct net *net, int unit, unsigned int groups,
-		      void (*input)(struct sk_buff *skb),
-		      struct mutex *cb_mutex, struct module *module)
+netlink_kernel_create(struct net *net, int unit,
+		      struct module *module,
+		      struct netlink_kernel_cfg *cfg)
 {
 	struct socket *sock;
 	struct sock *sk;
 	struct netlink_sock *nlk;
 	struct listeners *listeners = NULL;
+	struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
+	unsigned int groups;
 
 	BUG_ON(!nl_table);
 
@@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	sk = sock->sk;
 	sk_change_net(sk, net);
 
-	if (groups < 32)
+	if (!cfg || cfg->groups < 32)
 		groups = 32;
+	else
+		groups = cfg->groups;
 
 	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
 	if (!listeners)
 		goto out_sock_release;
 
 	sk->sk_data_ready = netlink_data_ready;
-	if (input)
-		nlk_sk(sk)->netlink_rcv = input;
+	if (cfg && cfg->input)
+		nlk_sk(sk)->netlink_rcv = cfg->input;
 
 	if (netlink_insert(sk, net, 0))
 		goto out_sock_release;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 2cc7c1ee7690..32761b53015e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = {
 
 static int __net_init genl_pernet_init(struct net *net)
 {
+	struct netlink_kernel_cfg cfg = {
+		.input		= genl_rcv,
+		.cb_mutex	= &genl_mutex,
+	};
+
 	/* we'll bump the group number right afterwards */
-	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0,
-					       genl_rcv, &genl_mutex,
-					       THIS_MODULE);
+	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC,
+					       THIS_MODULE, &cfg);
 
 	if (!net->genl_sock && net_eq(net, &init_net))
 		panic("GENL: Cannot initialize generic netlink\n");
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 540762726aaf..e75d8e47f35c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2959,9 +2959,12 @@ static struct xfrm_mgr netlink_mgr = {
 static int __net_init xfrm_user_net_init(struct net *net)
 {
 	struct sock *nlsk;
+	struct netlink_kernel_cfg cfg = {
+		.groups	= XFRMNLGRP_MAX,
+		.input	= xfrm_netlink_rcv,
+	};
 
-	nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX,
-				     xfrm_netlink_rcv, NULL, THIS_MODULE);
+	nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg);
 	if (nlsk == NULL)
 		return -ENOMEM;
 	net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c
index 8a23a35b9c5b..8a77725423e0 100644
--- a/security/selinux/netlink.c
+++ b/security/selinux/netlink.c
@@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno)
 
 static int __init selnl_init(void)
 {
+	struct netlink_kernel_cfg cfg = {
+		.groups	= SELNLGRP_MAX,
+	};
+
 	selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX,
-				      SELNLGRP_MAX, NULL, NULL, THIS_MODULE);
+				      THIS_MODULE, &cfg);
 	if (selnl == NULL)
 		panic("SELinux:  Cannot create netlink socket.");
 	netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV);
-- 
cgit v1.2.3


From 08911475d1d0921401e37d83292b217e1411d10b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 29 Jun 2012 05:23:24 +0000
Subject: netfilter: nf_conntrack: generalize nf_ct_l4proto_net

This patch generalizes nf_ct_l4proto_net by splitting it into chunks and
moving the corresponding protocol part to where it really belongs to.

To clarify, note that we follow two different approaches to support per-net
depending if it's built-in or run-time loadable protocol tracker.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 include/net/netfilter/nf_conntrack_l4proto.h   |  3 +++
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  6 ++++++
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  6 ++++++
 net/netfilter/nf_conntrack_proto.c             | 22 ++++++----------------
 net/netfilter/nf_conntrack_proto_generic.c     |  6 ++++++
 net/netfilter/nf_conntrack_proto_tcp.c         |  7 +++++++
 net/netfilter/nf_conntrack_proto_udp.c         |  7 +++++++
 7 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 08bb571b7abd..c3be4aef6bf7 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -99,6 +99,9 @@ struct nf_conntrack_l4proto {
 	/* Init l4proto pernet data */
 	int (*init_net)(struct net *net, u_int16_t proto);
 
+	/* Return the per-net protocol part. */
+	struct nf_proto_net *(*get_net_proto)(struct net *net);
+
 	/* Protocol name */
 	const char *name;
 
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 9c2095c5571f..5241d997ab75 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -388,6 +388,11 @@ static int icmp_init_net(struct net *net, u_int16_t proto)
 	return ret;
 }
 
+static struct nf_proto_net *icmp_get_net_proto(struct net *net)
+{
+	return &net->ct.nf_ct_proto.icmp.pn;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -418,4 +423,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= icmp_init_net,
+	.get_net_proto		= icmp_get_net_proto,
 };
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 9fc5cf5f3e8b..2d54b2061d68 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -358,6 +358,11 @@ static int icmpv6_init_net(struct net *net, u_int16_t proto)
 	return icmpv6_kmemdup_sysctl_table(pn, in);
 }
 
+static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
+{
+	return &net->ct.nf_ct_proto.icmpv6.pn;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
 {
 	.l3proto		= PF_INET6,
@@ -386,4 +391,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= icmpv6_init_net,
+	.get_net_proto		= icmpv6_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 21b850c4b3ab..0dc63854390f 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -303,22 +303,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
 static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
 					      struct nf_conntrack_l4proto *l4proto)
 {
-	switch (l4proto->l4proto) {
-	case IPPROTO_TCP:
-		return (struct nf_proto_net *)&net->ct.nf_ct_proto.tcp;
-	case IPPROTO_UDP:
-		return (struct nf_proto_net *)&net->ct.nf_ct_proto.udp;
-	case IPPROTO_ICMP:
-		return (struct nf_proto_net *)&net->ct.nf_ct_proto.icmp;
-	case IPPROTO_ICMPV6:
-		return (struct nf_proto_net *)&net->ct.nf_ct_proto.icmpv6;
-	case 255: /* l4proto_generic */
-		return (struct nf_proto_net *)&net->ct.nf_ct_proto.generic;
-	default:
-		if (l4proto->net_id)
-			return net_generic(net, *l4proto->net_id);
-		else
-			return NULL;
+	if (l4proto->get_net_proto) {
+		/* statically built-in protocols use static per-net */
+		return l4proto->get_net_proto(net);
+	} else if (l4proto->net_id) {
+		/* ... and loadable protocols use dynamic per-net */
+		return net_generic(net, *l4proto->net_id);
 	}
 	return NULL;
 }
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 7c11c5444194..d25f29377648 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -186,6 +186,11 @@ static int generic_init_net(struct net *net, u_int16_t proto)
 	return ret;
 }
 
+static struct nf_proto_net *generic_get_net_proto(struct net *net)
+{
+	return &net->ct.nf_ct_proto.generic.pn;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
 {
 	.l3proto		= PF_UNSPEC,
@@ -207,4 +212,5 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= generic_init_net,
+	.get_net_proto		= generic_get_net_proto,
 };
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 44f0da830156..07e56ea2e9bf 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1623,6 +1623,11 @@ static int tcp_init_net(struct net *net, u_int16_t proto)
 	return ret;
 }
 
+static struct nf_proto_net *tcp_get_net_proto(struct net *net)
+{
+	return &net->ct.nf_ct_proto.tcp.pn;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -1656,6 +1661,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= tcp_init_net,
+	.get_net_proto		= tcp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
 
@@ -1692,5 +1698,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= tcp_init_net,
+	.get_net_proto		= tcp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index e7e0434c3056..59623cc56e8d 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -297,6 +297,11 @@ static int udp_init_net(struct net *net, u_int16_t proto)
 	return ret;
 }
 
+static struct nf_proto_net *udp_get_net_proto(struct net *net)
+{
+	return &net->ct.nf_ct_proto.udp.pn;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -325,6 +330,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
@@ -356,5 +362,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.init_net		= udp_init_net,
+	.get_net_proto		= udp_get_net_proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
-- 
cgit v1.2.3


From 11604721a3c4bea60e2ddd9e4e30d741ecdba7b0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 4 Jul 2012 16:13:17 -0700
Subject: ipv4: Fix crashes in ip_options_compile().

The spec_dst uses should be guarded by skb_rtable() being non-NULL
not just the SKB being non-null.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_options.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 766dfe56885a..1f022510abe3 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -253,12 +253,15 @@ int ip_options_compile(struct net *net,
 {
 	__be32 spec_dst = (__force __be32) 0;
 	unsigned char *pp_ptr = NULL;
+	struct rtable *rt = NULL;
 	unsigned char *optptr;
 	unsigned char *iph;
 	int optlen, l;
 
 	if (skb != NULL) {
-		spec_dst = fib_compute_spec_dst(skb);
+		rt = skb_rtable(skb);
+		if (rt)
+			spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -330,7 +333,7 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 2;
 					goto error;
 				}
-				if (skb) {
+				if (rt) {
 					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
@@ -372,7 +375,7 @@ int ip_options_compile(struct net *net,
 						goto error;
 					}
 					opt->ts = optptr - iph;
-					if (skb)  {
+					if (rt)  {
 						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}
-- 
cgit v1.2.3


From a263b3093641fb1ec377582c90986a7fd0625184 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 2 Jul 2012 02:02:15 -0700
Subject: ipv4: Make neigh lookups directly in output packet path.

Do not use the dst cached neigh, we'll be getting rid of that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h       | 28 +++++++++++++++++++---------
 include/net/neighbour.h | 11 +++++++++--
 net/core/neighbour.c    | 12 +++++++-----
 net/ipv4/ip_output.c    | 12 ++++++++----
 net/ipv4/route.c        |  6 +-----
 5 files changed, 44 insertions(+), 25 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/arp.h b/include/net/arp.h
index 4a1f3fb562eb..4617d9841132 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -15,24 +15,34 @@ static inline u32 arp_hashfn(u32 key, const struct net_device *dev, u32 hash_rnd
 	return val * hash_rnd;
 }
 
-static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
+static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
 {
-	struct neigh_hash_table *nht;
+	struct neigh_hash_table *nht = rcu_dereference_bh(arp_tbl.nht);
 	struct neighbour *n;
 	u32 hash_val;
 
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(arp_tbl.nht);
+	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+		key = 0;
+
 	hash_val = arp_hashfn(key, dev, nht->hash_rnd[0]) >> (32 - nht->hash_shift);
 	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
 	     n != NULL;
 	     n = rcu_dereference_bh(n->next)) {
-		if (n->dev == dev && *(u32 *)n->primary_key == key) {
-			if (!atomic_inc_not_zero(&n->refcnt))
-				n = NULL;
-			break;
-		}
+		if (n->dev == dev && *(u32 *)n->primary_key == key)
+			return n;
 	}
+
+	return NULL;
+}
+
+static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv4_neigh_lookup_noref(dev, key);
+	if (n && !atomic_inc_not_zero(&n->refcnt))
+		n = NULL;
 	rcu_read_unlock_bh();
 
 	return n;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 6cdfeedb650b..e1d18bdeebb8 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -202,9 +202,16 @@ extern struct neighbour *	neigh_lookup(struct neigh_table *tbl,
 extern struct neighbour *	neigh_lookup_nodev(struct neigh_table *tbl,
 						   struct net *net,
 						   const void *pkey);
-extern struct neighbour *	neigh_create(struct neigh_table *tbl,
+extern struct neighbour *	__neigh_create(struct neigh_table *tbl,
+					       const void *pkey,
+					       struct net_device *dev,
+					       bool want_ref);
+static inline struct neighbour *neigh_create(struct neigh_table *tbl,
 					     const void *pkey,
-					     struct net_device *dev);
+					     struct net_device *dev)
+{
+	return __neigh_create(tbl, pkey, dev, true);
+}
 extern void			neigh_destroy(struct neighbour *neigh);
 extern int			__neigh_event_send(struct neighbour *neigh, struct sk_buff *skb);
 extern int			neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d81d026138f0..a793af9af150 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -474,8 +474,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 }
 EXPORT_SYMBOL(neigh_lookup_nodev);
 
-struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
-			       struct net_device *dev)
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+				 struct net_device *dev, bool want_ref)
 {
 	u32 hash_val;
 	int key_len = tbl->key_len;
@@ -535,14 +535,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	     n1 = rcu_dereference_protected(n1->next,
 			lockdep_is_held(&tbl->lock))) {
 		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
-			neigh_hold(n1);
+			if (want_ref)
+				neigh_hold(n1);
 			rc = n1;
 			goto out_tbl_unlock;
 		}
 	}
 
 	n->dead = 0;
-	neigh_hold(n);
+	if (want_ref)
+		neigh_hold(n);
 	rcu_assign_pointer(n->next,
 			   rcu_dereference_protected(nht->hash_buckets[hash_val],
 						     lockdep_is_held(&tbl->lock)));
@@ -558,7 +560,7 @@ out_neigh_release:
 	neigh_release(n);
 	goto out;
 }
-EXPORT_SYMBOL(neigh_create);
+EXPORT_SYMBOL(__neigh_create);
 
 static u32 pneigh_hash(const void *pkey, int key_len)
 {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2630900e480a..6e9a266a0535 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -170,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
+	u32 nexthop;
 
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -191,15 +192,18 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		skb = skb2;
 	}
 
-	rcu_read_lock();
-	neigh = dst_get_neighbour_noref(dst);
+	rcu_read_lock_bh();
+	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (neigh) {
 		int res = neigh_output(neigh, skb);
 
-		rcu_read_unlock();
+		rcu_read_unlock_bh();
 		return res;
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_bh();
 
 	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
 			    __func__);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a5afc715558..2f40363e2851 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1098,17 +1098,13 @@ static int slow_chain_length(const struct rtable *head)
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 {
-	static const __be32 inaddr_any = 0;
 	struct net_device *dev = dst->dev;
 	const __be32 *pkey = daddr;
 	const struct rtable *rt;
 	struct neighbour *n;
 
 	rt = (const struct rtable *) dst;
-
-	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
-		pkey = &inaddr_any;
-	else if (rt->rt_gateway)
+	if (rt->rt_gateway)
 		pkey = (const __be32 *) &rt->rt_gateway;
 
 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
-- 
cgit v1.2.3


From 3c521f2ba9646c5543963cbc2b9c9d3f02a82594 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 2 Jul 2012 02:04:13 -0700
Subject: ipv4: Don't report neigh uptodate state in rtcache procfs.

Soon routes will not have a cached neigh attached, nor will we
be able to necessarily go directly to a neigh from an arbitrary
route.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2f40363e2851..bae36386e722 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -418,13 +418,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "HHUptod\tSpecDst");
 	else {
 		struct rtable *r = v;
-		struct neighbour *n;
-		int len, HHUptod;
-
-		rcu_read_lock();
-		n = dst_get_neighbour_noref(&r->dst);
-		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
-		rcu_read_unlock();
+		int len;
 
 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
@@ -438,9 +432,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 			      dst_metric(&r->dst, RTAX_RTTVAR)),
 			r->rt_key_tos,
-			-1,
-			HHUptod,
-			0, &len);
+			-1, 0, 0, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
 	}
-- 
cgit v1.2.3


From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 2 Jul 2012 02:21:03 -0700
Subject: net: Do delayed neigh confirmation.

When a dst_confirm() happens, mark the confirmation as pending in the
dst.  Then on the next packet out, when we have the neigh in-hand, do
the update.

This removes the dependency in dst_confirm() of dst's having an
attached neigh.

While we're here, remove the explicit 'dst' NULL check, all except 2
or 3 call sites ensure it's not NULL.  So just fix those cases up.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h       | 29 +++++++++++++++++++++--------
 include/net/neighbour.h | 15 ---------------
 net/core/dst.c          |  3 ++-
 net/ipv4/ip_output.c    |  2 +-
 net/ipv4/tcp_input.c    | 19 +++++++++++++------
 net/ipv6/ip6_output.c   |  2 +-
 6 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst.h b/include/net/dst.h
index f0bf3b8d5911..84e7a3ff968d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -51,7 +51,7 @@ struct dst_entry {
 	int			(*input)(struct sk_buff *);
 	int			(*output)(struct sk_buff *);
 
-	int			flags;
+	unsigned short		flags;
 #define DST_HOST		0x0001
 #define DST_NOXFRM		0x0002
 #define DST_NOPOLICY		0x0004
@@ -62,6 +62,8 @@ struct dst_entry {
 #define DST_FAKE_RTABLE		0x0080
 #define DST_XFRM_TUNNEL		0x0100
 
+	unsigned short		pending_confirm;
+
 	short			error;
 	short			obsolete;
 	unsigned short		header_len;	/* more space at head required */
@@ -371,7 +373,8 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb)
 
 extern int dst_discard(struct sk_buff *skb);
 extern void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
-		       int initial_ref, int initial_obsolete, int flags);
+		       int initial_ref, int initial_obsolete,
+		       unsigned short flags);
 extern void __dst_free(struct dst_entry *dst);
 extern struct dst_entry *dst_destroy(struct dst_entry *dst);
 
@@ -395,14 +398,24 @@ static inline void dst_rcu_free(struct rcu_head *head)
 
 static inline void dst_confirm(struct dst_entry *dst)
 {
-	if (dst) {
-		struct neighbour *n;
+	dst->pending_confirm = 1;
+}
 
-		rcu_read_lock();
-		n = dst_get_neighbour_noref(dst);
-		neigh_confirm(n);
-		rcu_read_unlock();
+static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
+				   struct sk_buff *skb)
+{
+	struct hh_cache *hh;
+
+	if (unlikely(dst->pending_confirm)) {
+		n->confirmed = jiffies;
+		dst->pending_confirm = 0;
 	}
+
+	hh = &n->hh;
+	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
+		return neigh_hh_output(hh, skb);
+	else
+		return n->output(n, skb);
 }
 
 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index e1d18bdeebb8..344d8988842a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -309,12 +309,6 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh)
 
 #define neigh_hold(n)	atomic_inc(&(n)->refcnt)
 
-static inline void neigh_confirm(struct neighbour *neigh)
-{
-	if (neigh)
-		neigh->confirmed = jiffies;
-}
-
 static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 {
 	unsigned long now = jiffies;
@@ -358,15 +352,6 @@ static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
 	return dev_queue_xmit(skb);
 }
 
-static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
-{
-	struct hh_cache *hh = &n->hh;
-	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
-		return neigh_hh_output(hh, skb);
-	else
-		return n->output(n, skb);
-}
-
 static inline struct neighbour *
 __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
 {
diff --git a/net/core/dst.c b/net/core/dst.c
index 43d94cedbf7c..a6e19a23a745 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(dst_discard);
 const u32 dst_default_metrics[RTAX_MAX];
 
 void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
-		int initial_ref, int initial_obsolete, int flags)
+		int initial_ref, int initial_obsolete, unsigned short flags)
 {
 	struct dst_entry *dst;
 
@@ -188,6 +188,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
+	dst->pending_confirm = 0;
 	dst->next = NULL;
 	if (!(flags & DST_NOCOUNT))
 		dst_entries_add(ops, 1);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6e9a266a0535..cc52679790b2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,7 +198,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (neigh) {
-		int res = neigh_output(neigh, skb);
+		int res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8416f8a68e65..ca0d0e7c9778 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,13 +740,13 @@ void tcp_update_metrics(struct sock *sk)
 	if (sysctl_tcp_nometrics_save)
 		return;
 
-	dst_confirm(dst);
-
 	if (dst && (dst->flags & DST_HOST)) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
 		int m;
 		unsigned long rtt;
 
+		dst_confirm(dst);
+
 		if (icsk->icsk_backoff || !tp->srtt) {
 			/* This session failed to estimate rtt. Why?
 			 * Probably, no packets returned in time.
@@ -3869,9 +3869,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 	}
 
-	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
-		dst_confirm(__sk_dst_get(sk));
-
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
+		struct dst_entry *dst = __sk_dst_get(sk);
+		if (dst)
+			dst_confirm(dst);
+	}
 	return 1;
 
 no_queue:
@@ -6140,9 +6142,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 		case TCP_FIN_WAIT1:
 			if (tp->snd_una == tp->write_seq) {
+				struct dst_entry *dst;
+
 				tcp_set_state(sk, TCP_FIN_WAIT2);
 				sk->sk_shutdown |= SEND_SHUTDOWN;
-				dst_confirm(__sk_dst_get(sk));
+
+				dst = __sk_dst_get(sk);
+				if (dst)
+					dst_confirm(dst);
 
 				if (!sock_flag(sk, SOCK_DEAD))
 					/* Wake up lingering close() */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a233a7ccbc3a..c94e4aabe11b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -125,7 +125,7 @@ static int ip6_finish_output2(struct sk_buff *skb)
 	rcu_read_lock();
 	neigh = dst_get_neighbour_noref(dst);
 	if (neigh) {
-		int res = neigh_output(neigh, skb);
+		int res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock();
 		return res;
-- 
cgit v1.2.3


From f894cbf847c9bea1955095bf37aca6c050553167 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 2 Jul 2012 21:52:24 -0700
Subject: net: Add optional SKB arg to dst_ops->neigh_lookup().

Causes the handler to use the daddr in the ipv4/ipv6 header when
the route gateway is unspecified (local subnet).

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h         |  8 +++++++-
 include/net/dst_ops.h     |  4 +++-
 net/bridge/br_netfilter.c |  4 +++-
 net/decnet/dn_route.c     |  8 ++++++--
 net/ipv4/route.c          | 14 ++++++++++----
 net/ipv6/route.c          | 14 ++++++++++----
 net/xfrm/xfrm_policy.c    |  6 ++++--
 7 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst.h b/include/net/dst.h
index 84e7a3ff968d..295a70547e7d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -420,7 +420,13 @@ static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
 
 static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 {
-	return dst->ops->neigh_lookup(dst, daddr);
+	return dst->ops->neigh_lookup(dst, NULL, daddr);
+}
+
+static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
+						     struct sk_buff *skb)
+{
+	return dst->ops->neigh_lookup(dst, skb, NULL);
 }
 
 static inline void dst_link_failure(struct sk_buff *skb)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 3682a0a076c1..4badc86e45d1 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -26,7 +26,9 @@ struct dst_ops {
 	void			(*link_failure)(struct sk_buff *);
 	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
 	int			(*local_out)(struct sk_buff *skb);
-	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst, const void *daddr);
+	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
+						struct sk_buff *skb,
+						const void *daddr);
 
 	struct kmem_cache	*kmem_cachep;
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 20fa719889ee..4378775432b6 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -120,7 +120,9 @@ static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
 	return NULL;
 }
 
-static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
 {
 	return NULL;
 }
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 2493ed5bfecd..60e4c6e1bac0 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -117,7 +117,9 @@ static void dn_dst_destroy(struct dst_entry *);
 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
 static void dn_dst_link_failure(struct sk_buff *);
 static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
-static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, const void *daddr);
+static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
+					     struct sk_buff *skb,
+					     const void *daddr);
 static int dn_route_input(struct sk_buff *);
 static void dn_run_flush(unsigned long dummy);
 
@@ -828,7 +830,9 @@ static unsigned int dn_dst_mtu(const struct dst_entry *dst)
 	return mtu ? : dst->dev->mtu;
 }
 
-static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
+					     struct sk_buff *skb,
+					     const void *daddr)
 {
 	return __neigh_lookup_errno(&dn_neigh_table, daddr, dst->dev);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bae36386e722..7453dfcdb439 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -188,7 +188,9 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 	return p;
 }
 
-static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr);
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
@@ -1088,7 +1090,9 @@ static int slow_chain_length(const struct rtable *head)
 	return length >> FRACT_BITS;
 }
 
-static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
 {
 	struct net_device *dev = dst->dev;
 	const __be32 *pkey = daddr;
@@ -1098,6 +1102,8 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
 	rt = (const struct rtable *) dst;
 	if (rt->rt_gateway)
 		pkey = (const __be32 *) &rt->rt_gateway;
+	else if (skb)
+		pkey = &ip_hdr(skb)->daddr;
 
 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 	if (n)
@@ -1107,7 +1113,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
 
 static int rt_bind_neighbour(struct rtable *rt)
 {
-	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
+	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
 	if (IS_ERR(n))
 		return PTR_ERR(n);
 	dst_set_neighbour(&rt->dst, n);
@@ -1388,7 +1394,7 @@ static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 
 	rt->rt_gateway = peer->redirect_learned.a4;
 
-	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
+	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
 	if (IS_ERR(n)) {
 		rt->rt_gateway = orig_gw;
 		return;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c518e4ec0cea..4b581c675bb2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -120,21 +120,27 @@ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 	return p;
 }
 
-static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
+static inline const void *choose_neigh_daddr(struct rt6_info *rt,
+					     struct sk_buff *skb,
+					     const void *daddr)
 {
 	struct in6_addr *p = &rt->rt6i_gateway;
 
 	if (!ipv6_addr_any(p))
 		return (const void *) p;
+	else if (skb)
+		return &ipv6_hdr(skb)->daddr;
 	return daddr;
 }
 
-static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
+					  struct sk_buff *skb,
+					  const void *daddr)
 {
 	struct rt6_info *rt = (struct rt6_info *) dst;
 	struct neighbour *n;
 
-	daddr = choose_neigh_daddr(rt, daddr);
+	daddr = choose_neigh_daddr(rt, skb, daddr);
 	n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
 	if (n)
 		return n;
@@ -1162,7 +1168,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	if (neigh)
 		neigh_hold(neigh);
 	else {
-		neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
+		neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
 		if (IS_ERR(neigh)) {
 			in6_dev_put(idev);
 			dst_free(&rt->dst);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ccfbd328a69d..a28a3f972d5b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2404,9 +2404,11 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst)
 	return mtu ? : dst_mtu(dst->path);
 }
 
-static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
 {
-	return dst_neigh_lookup(dst->path, daddr);
+	return dst->path->ops->neigh_lookup(dst, skb, daddr);
 }
 
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
-- 
cgit v1.2.3


From f187bc6efb7250afee0e2009b6106370319b0c8b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 3 Jul 2012 01:07:44 -0700
Subject: ipv4: No need to set generic neighbour pointer.

Nobody reads it any longer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 62 +++-----------------------------------------------------
 1 file changed, 3 insertions(+), 59 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7453dfcdb439..72e88c208025 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1111,16 +1111,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static int rt_bind_neighbour(struct rtable *rt)
-{
-	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
-	if (IS_ERR(n))
-		return PTR_ERR(n);
-	dst_set_neighbour(&rt->dst, n);
-
-	return 0;
-}
-
 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
 				     struct sk_buff *skb, int ifindex)
 {
@@ -1129,7 +1119,6 @@ static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
 	unsigned long	now;
 	u32 		min_score;
 	int		chain_length;
-	int attempts = !in_softirq();
 
 restart:
 	chain_length = 0;
@@ -1156,15 +1145,6 @@ restart:
 		 */
 
 		rt->dst.flags |= DST_NOCACHE;
-		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
-			int err = rt_bind_neighbour(rt);
-			if (err) {
-				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
-				ip_rt_put(rt);
-				return ERR_PTR(err);
-			}
-		}
-
 		goto skip_hashing;
 	}
 
@@ -1247,40 +1227,6 @@ restart:
 		}
 	}
 
-	/* Try to bind route to arp only if it is output
-	   route or unicast forwarding path.
-	 */
-	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
-		int err = rt_bind_neighbour(rt);
-		if (err) {
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			if (err != -ENOBUFS) {
-				rt_drop(rt);
-				return ERR_PTR(err);
-			}
-
-			/* Neighbour tables are full and nothing
-			   can be released. Try to shrink route cache,
-			   it is most likely it holds some neighbour records.
-			 */
-			if (attempts-- > 0) {
-				int saved_elasticity = ip_rt_gc_elasticity;
-				int saved_int = ip_rt_gc_min_interval;
-				ip_rt_gc_elasticity	= 1;
-				ip_rt_gc_min_interval	= 0;
-				rt_garbage_collect(&ipv4_dst_ops);
-				ip_rt_gc_min_interval	= saved_int;
-				ip_rt_gc_elasticity	= saved_elasticity;
-				goto restart;
-			}
-
-			net_warn_ratelimited("Neighbour table overflow\n");
-			rt_drop(rt);
-			return ERR_PTR(-ENOBUFS);
-		}
-	}
-
 	rt->dst.rt_next = rt_hash_table[hash].chain;
 
 	/*
@@ -1388,26 +1334,24 @@ static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 {
 	struct rtable *rt = (struct rtable *) dst;
 	__be32 orig_gw = rt->rt_gateway;
-	struct neighbour *n, *old_n;
+	struct neighbour *n;
 
 	dst_confirm(&rt->dst);
 
 	rt->rt_gateway = peer->redirect_learned.a4;
 
 	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
-	if (IS_ERR(n)) {
+	if (!n) {
 		rt->rt_gateway = orig_gw;
 		return;
 	}
-	old_n = xchg(&rt->dst._neighbour, n);
-	if (old_n)
-		neigh_release(old_n);
 	if (!(n->nud_state & NUD_VALID)) {
 		neigh_event_send(n, NULL);
 	} else {
 		rt->rt_flags |= RTCF_REDIRECTED;
 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 	}
+	neigh_release(n);
 }
 
 /* called in rcu_read_lock() section */
-- 
cgit v1.2.3


From bf5e53e3717ed28be69d0663c65962d1731e7ee4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Jul 2012 22:30:09 +0000
Subject: ipv4: defer fib_compute_spec_dst() call

ip_options_compile() can avoid calling fib_compute_spec_dst()
by default, and perform the call only if needed.

David suggested to add a helper to make the call only once.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_options.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1f022510abe3..a19d6471a318 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -242,6 +242,15 @@ void ip_options_fragment(struct sk_buff *skb)
 	opt->ts_needtime = 0;
 }
 
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+	if (*spec_dst == htonl(INADDR_ANY))
+		*spec_dst = fib_compute_spec_dst(skb);
+}
+
 /*
  * Verify options and fill pointers in struct options.
  * Caller should clear *opt, and set opt->data.
@@ -251,7 +260,7 @@ void ip_options_fragment(struct sk_buff *skb)
 int ip_options_compile(struct net *net,
 		       struct ip_options *opt, struct sk_buff *skb)
 {
-	__be32 spec_dst = (__force __be32) 0;
+	__be32 spec_dst = htonl(INADDR_ANY);
 	unsigned char *pp_ptr = NULL;
 	struct rtable *rt = NULL;
 	unsigned char *optptr;
@@ -260,8 +269,6 @@ int ip_options_compile(struct net *net,
 
 	if (skb != NULL) {
 		rt = skb_rtable(skb);
-		if (rt)
-			spec_dst = fib_compute_spec_dst(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -334,6 +341,7 @@ int ip_options_compile(struct net *net,
 					goto error;
 				}
 				if (rt) {
+					spec_dst_fill(&spec_dst, skb);
 					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 					opt->is_changed = 1;
 				}
@@ -376,6 +384,7 @@ int ip_options_compile(struct net *net,
 					}
 					opt->ts = optptr - iph;
 					if (rt)  {
+						spec_dst_fill(&spec_dst, skb);
 						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
 					}
-- 
cgit v1.2.3


From f4530fa574df4d833506c53697ed1daa0d390bf4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 5 Jul 2012 22:13:13 -0700
Subject: ipv4: Avoid overhead when no custom FIB rules are installed.

If the user hasn't actually installed any custom rules, or fiddled
with the default ones, don't go through the whole FIB rules layer.

It's just pure overhead.

Instead do what we do with CONFIG_IP_MULTIPLE_TABLES disabled, check
the individual tables by hand, one by one.

Also, move fib_num_tclassid_users into the ipv4 network namespace.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     | 36 ++++++++++++++++++++++++++++++++----
 include/net/netns/ipv4.h |  8 ++++++++
 net/ipv4/fib_frontend.c  | 27 ++++++++++++++++++++++-----
 net/ipv4/fib_rules.c     | 12 ++++++++----
 net/ipv4/fib_semantics.c |  6 +++---
 5 files changed, 73 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 3dc7c96bbeab..539c6721f810 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -220,11 +220,33 @@ extern void __net_exit fib4_rules_exit(struct net *net);
 extern u32 fib_rules_tclass(const struct fib_result *res);
 #endif
 
-extern int fib_lookup(struct net *n, struct flowi4 *flp, struct fib_result *res);
-
 extern struct fib_table *fib_new_table(struct net *net, u32 id);
 extern struct fib_table *fib_get_table(struct net *net, u32 id);
 
+extern int __fib_lookup(struct net *net, struct flowi4 *flp,
+			struct fib_result *res);
+
+static inline int fib_lookup(struct net *net, struct flowi4 *flp,
+			     struct fib_result *res)
+{
+	if (!net->ipv4.fib_has_custom_rules) {
+		if (net->ipv4.fib_local &&
+		    !fib_table_lookup(net->ipv4.fib_local, flp, res,
+				      FIB_LOOKUP_NOREF))
+			return 0;
+		if (net->ipv4.fib_main &&
+		    !fib_table_lookup(net->ipv4.fib_main, flp, res,
+				      FIB_LOOKUP_NOREF))
+			return 0;
+		if (net->ipv4.fib_default &&
+		    !fib_table_lookup(net->ipv4.fib_default, flp, res,
+				      FIB_LOOKUP_NOREF))
+			return 0;
+		return -ENETUNREACH;
+	}
+	return __fib_lookup(net, flp, res);
+}
+
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 /* Exported by fib_frontend.c */
@@ -236,9 +258,15 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			       struct in_device *idev, u32 *itag);
 extern void fib_select_default(struct fib_result *res);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-extern int fib_num_tclassid_users;
+static inline int fib_num_tclassid_users(struct net *net)
+{
+	return net->ipv4.fib_num_tclassid_users;
+}
 #else
-#define fib_num_tclassid_users 0
+static inline int fib_num_tclassid_users(struct net *net)
+{
+	return 0;
+}
 #endif
 
 /* Exported by fib_semantics.c */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 227f0cd9d3f6..599e48fa97cb 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -11,6 +11,7 @@ struct ctl_table_header;
 struct ipv4_devconf;
 struct fib_rules_ops;
 struct hlist_head;
+struct fib_table;
 struct sock;
 
 struct netns_ipv4 {
@@ -24,6 +25,13 @@ struct netns_ipv4 {
 	struct ipv4_devconf	*devconf_dflt;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
+	bool			fib_has_custom_rules;
+	struct fib_table	*fib_local;
+	struct fib_table	*fib_main;
+	struct fib_table	*fib_default;
+#endif
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	int			fib_num_tclassid_users;
 #endif
 	struct hlist_head	*fib_table_hash;
 	struct sock		*fibnl;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3e11ea225dad..81f85716a894 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -86,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
+
+	switch (id) {
+	case RT_TABLE_LOCAL:
+		net->ipv4.fib_local = tb;
+		break;
+
+	case RT_TABLE_MAIN:
+		net->ipv4.fib_main = tb;
+		break;
+
+	case RT_TABLE_DEFAULT:
+		net->ipv4.fib_default = tb;
+		break;
+
+	default:
+		break;
+	}
+
 	h = id & (FIB_TABLE_HASHSZ - 1);
 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 	return tb;
@@ -218,10 +236,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
-#ifdef CONFIG_IP_ROUTE_CLASSID
-int fib_num_tclassid_users __read_mostly;
-#endif
-
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -312,7 +326,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 {
 	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 
-	if (!r && !fib_num_tclassid_users) {
+	if (!r && !fib_num_tclassid_users(dev_net(dev))) {
 		*itag = 0;
 		return 0;
 	}
@@ -1134,6 +1148,9 @@ static int __net_init fib_net_init(struct net *net)
 {
 	int error;
 
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	net->ipv4.fib_num_tclassid_users = 0;
+#endif
 	error = ip_fib_net_init(net);
 	if (error < 0)
 		goto out;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b23fd952c84f..c06da93b0b70 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -54,7 +54,7 @@ u32 fib_rules_tclass(const struct fib_result *res)
 }
 #endif
 
-int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
 		.result = res,
@@ -67,7 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(fib_lookup);
+EXPORT_SYMBOL_GPL(__fib_lookup);
 
 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
@@ -172,7 +172,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	if (tb[FRA_FLOW]) {
 		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
 		if (rule4->tclassid)
-			fib_num_tclassid_users++;
+			net->ipv4.fib_num_tclassid_users++;
 	}
 #endif
 
@@ -182,6 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	rule4->dstmask = inet_make_mask(rule4->dst_len);
 	rule4->tos = frh->tos;
 
+	net->ipv4.fib_has_custom_rules = true;
 	err = 0;
 errout:
 	return err;
@@ -189,12 +190,14 @@ errout:
 
 static void fib4_rule_delete(struct fib_rule *rule)
 {
+	struct net *net = rule->fr_net;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
 	if (rule4->tclassid)
-		fib_num_tclassid_users--;
+		net->ipv4.fib_num_tclassid_users--;
 #endif
+	net->ipv4.fib_has_custom_rules = true;
 }
 
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
@@ -309,6 +312,7 @@ int __net_init fib4_rules_init(struct net *net)
 	if (err < 0)
 		goto fail;
 	net->ipv4.rules_ops = ops;
+	net->ipv4.fib_has_custom_rules = false;
 	return 0;
 
 fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c46c20b6b0b6..ae301c897a19 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -166,7 +166,7 @@ void free_fib_info(struct fib_info *fi)
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_tclassid)
-			fib_num_tclassid_users--;
+			fi->fib_net->ipv4.fib_num_tclassid_users--;
 	} endfor_nexthops(fi);
 #endif
 	call_rcu(&fi->rcu, free_fib_info_rcu);
@@ -428,7 +428,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
 			if (nexthop_nh->nh_tclassid)
-				fib_num_tclassid_users++;
+				fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 		}
 
@@ -824,7 +824,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		nh->nh_tclassid = cfg->fc_flow;
 		if (nh->nh_tclassid)
-			fib_num_tclassid_users++;
+			fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		nh->nh_weight = 1;
-- 
cgit v1.2.3


From 4aabd8ef8c43677cfee3e1e36c5a79edddb41942 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 9 Jul 2012 16:07:30 -0700
Subject: tcp: Move dynamnic metrics handling into seperate file.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |   4 ++
 net/ipv4/Makefile      |   2 +-
 net/ipv4/tcp_input.c   | 188 +-----------------------------------------------
 net/ipv4/tcp_metrics.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 199 insertions(+), 187 deletions(-)
 create mode 100644 net/ipv4/tcp_metrics.c

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d814170..98ca797001a2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,8 @@ extern void tcp_enter_frto(struct sock *sk);
 extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
 extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +558,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return (tp->srtt >> 3) + tp->rttvar;
 }
 
+extern void tcp_set_rto(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..5a23e8b37106 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7c9778..055ac49b8b40 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
 
 int sysctl_tcp_thin_dupack __read_mostly;
 
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
 	tcp_bound_rto(sk);
 }
 
-/* Save metrics learned by this TCP session.
-   This function is called only, when TCP finishes successfully
-   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = __sk_dst_get(sk);
-
-	if (sysctl_tcp_nometrics_save)
-		return;
-
-	if (dst && (dst->flags & DST_HOST)) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		int m;
-		unsigned long rtt;
-
-		dst_confirm(dst);
-
-		if (icsk->icsk_backoff || !tp->srtt) {
-			/* This session failed to estimate rtt. Why?
-			 * Probably, no packets returned in time.
-			 * Reset our results.
-			 */
-			if (!(dst_metric_locked(dst, RTAX_RTT)))
-				dst_metric_set(dst, RTAX_RTT, 0);
-			return;
-		}
-
-		rtt = dst_metric_rtt(dst, RTAX_RTT);
-		m = rtt - tp->srtt;
-
-		/* If newly calculated rtt larger than stored one,
-		 * store new one. Otherwise, use EWMA. Remember,
-		 * rtt overestimation is always better than underestimation.
-		 */
-		if (!(dst_metric_locked(dst, RTAX_RTT))) {
-			if (m <= 0)
-				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
-			else
-				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
-		}
-
-		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
-			unsigned long var;
-			if (m < 0)
-				m = -m;
-
-			/* Scale deviation to rttvar fixed point */
-			m >>= 1;
-			if (m < tp->mdev)
-				m = tp->mdev;
-
-			var = dst_metric_rtt(dst, RTAX_RTTVAR);
-			if (m >= var)
-				var = m;
-			else
-				var -= (var - m) >> 2;
-
-			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
-		}
-
-		if (tcp_in_initial_slowstart(tp)) {
-			/* Slow start still did not finish. */
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
-			if (!dst_metric_locked(dst, RTAX_CWND) &&
-			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
-		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   icsk->icsk_ca_state == TCP_CA_Open) {
-			/* Cong. avoidance phase, cwnd is reliable. */
-			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH,
-					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_cwnd) >> 1);
-		} else {
-			/* Else slow start did not finish, cwnd is non-sense,
-			   ssthresh may be also invalid.
-			 */
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_ssthresh) >> 1);
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
-		}
-
-		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
-			    tp->reordering != sysctl_tcp_reordering)
-				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
-		}
-	}
-}
-
 __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
  */
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
 {
 	/* RFC3517 uses different metric in lost marker => reset on change */
 	if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 }
 
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = __sk_dst_get(sk);
-
-	if (dst == NULL)
-		goto reset;
-
-	dst_confirm(dst);
-
-	if (dst_metric_locked(dst, RTAX_CWND))
-		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-	if (dst_metric(dst, RTAX_SSTHRESH)) {
-		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
-		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
-			tp->snd_ssthresh = tp->snd_cwnd_clamp;
-	} else {
-		/* ssthresh may have been reduced unnecessarily during.
-		 * 3WHS. Restore it back to its initial default.
-		 */
-		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	}
-	if (dst_metric(dst, RTAX_REORDERING) &&
-	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
-		tcp_disable_fack(tp);
-		tcp_disable_early_retrans(tp);
-		tp->reordering = dst_metric(dst, RTAX_REORDERING);
-	}
-
-	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
-		goto reset;
-
-	/* Initial rtt is determined from SYN,SYN-ACK.
-	 * The segment is small and rtt may appear much
-	 * less than real one. Use per-dst memory
-	 * to make it more realistic.
-	 *
-	 * A bit of theory. RTT is time passed after "normal" sized packet
-	 * is sent until it is ACKed. In normal circumstances sending small
-	 * packets force peer to delay ACKs and calculation is correct too.
-	 * The algorithm is adaptive and, provided we follow specs, it
-	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
-	 * tricks sort of "quick acks" for time long enough to decrease RTT
-	 * to low value, and then abruptly stops to do it and starts to delay
-	 * ACKs, wait for troubles.
-	 */
-	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
-		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
-		tp->rtt_seq = tp->snd_nxt;
-	}
-	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
-		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
-		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
-	}
-	tcp_set_rto(sk);
-reset:
-	if (tp->srtt == 0) {
-		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
-		 * 3WHS. This is most likely due to retransmission,
-		 * including spurious one. Reset the RTO back to 3secs
-		 * from the more aggressive 1sec to avoid more spurious
-		 * retransmission.
-		 */
-		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
-		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
-	}
-	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
-	 * retransmitted. In light of RFC6298 more aggressive 1sec
-	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
-	 * retransmission has occurred.
-	 */
-	if (tp->total_retrans > 1)
-		tp->snd_cwnd = 1;
-	else
-		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
 static void tcp_update_reordering(struct sock *sk, const int metric,
 				  const int ts)
 {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..2793ecf928d3
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,192 @@
+#include <linux/cache.h>
+#include <linux/tcp.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+/* Save metrics learned by this TCP session.  This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (sysctl_tcp_nometrics_save)
+		return;
+
+	if (dst && (dst->flags & DST_HOST)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		int m;
+		unsigned long rtt;
+
+		dst_confirm(dst);
+
+		if (icsk->icsk_backoff || !tp->srtt) {
+			/* This session failed to estimate rtt. Why?
+			 * Probably, no packets returned in time.
+			 * Reset our results.
+			 */
+			if (!(dst_metric_locked(dst, RTAX_RTT)))
+				dst_metric_set(dst, RTAX_RTT, 0);
+			return;
+		}
+
+		rtt = dst_metric_rtt(dst, RTAX_RTT);
+		m = rtt - tp->srtt;
+
+		/* If newly calculated rtt larger than stored one,
+		 * store new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+		 */
+		if (!(dst_metric_locked(dst, RTAX_RTT))) {
+			if (m <= 0)
+				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
+			else
+				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
+		}
+
+		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+			unsigned long var;
+			if (m < 0)
+				m = -m;
+
+			/* Scale deviation to rttvar fixed point */
+			m >>= 1;
+			if (m < tp->mdev)
+				m = tp->mdev;
+
+			var = dst_metric_rtt(dst, RTAX_RTTVAR);
+			if (m >= var)
+				var = m;
+			else
+				var -= (var - m) >> 2;
+
+			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
+		}
+
+		if (tcp_in_initial_slowstart(tp)) {
+			/* Slow start still did not finish. */
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
+			if (!dst_metric_locked(dst, RTAX_CWND) &&
+			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
+		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+			   icsk->icsk_ca_state == TCP_CA_Open) {
+			/* Cong. avoidance phase, cwnd is reliable. */
+			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH,
+					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_cwnd) >> 1);
+		} else {
+			/* Else slow start did not finish, cwnd is non-sense,
+			   ssthresh may be also invalid.
+			 */
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_ssthresh) >> 1);
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
+		}
+
+		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+			    tp->reordering != sysctl_tcp_reordering)
+				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+		}
+	}
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (dst == NULL)
+		goto reset;
+
+	dst_confirm(dst);
+
+	if (dst_metric_locked(dst, RTAX_CWND))
+		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+	if (dst_metric(dst, RTAX_SSTHRESH)) {
+		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during.
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	}
+	if (dst_metric(dst, RTAX_REORDERING) &&
+	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
+		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+	}
+
+	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+		goto reset;
+
+	/* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and rtt may appear much
+	 * less than real one. Use per-dst memory
+	 * to make it more realistic.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
+		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
+		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+	}
+	tcp_set_rto(sk);
+reset:
+	if (tp->srtt == 0) {
+		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
+		 */
+		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+	}
+	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC6298 more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
-- 
cgit v1.2.3


From ab92bb2f679d66c7e12a6b1c0cdd76fe308f6546 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 9 Jul 2012 16:19:30 -0700
Subject: tcp: Abstract back handling peer aliveness test into helper function.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  1 +
 net/ipv4/tcp_ipv4.c    |  2 +-
 net/ipv4/tcp_metrics.c | 10 ++++++++++
 net/ipv6/tcp_ipv6.c    |  2 +-
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 98ca797001a2..5478356ea8c5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa21d05..e9312a8f33a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1405,7 +1405,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
-			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			 !tcp_peer_is_proven(req, dst)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2793ecf928d3..9afe703c85cc 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,7 +1,9 @@
+#include <linux/module.h>
 #include <linux/cache.h>
 #include <linux/tcp.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/request_sock.h>
 #include <net/sock.h>
 #include <net/dst.h>
 #include <net/tcp.h>
@@ -190,3 +192,11 @@ reset:
 		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+{
+	if (!dst)
+		return false;
+	return dst_metric(dst, RTAX_RTT) ? true : false;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed6c2e6..75d179555c28 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1177,7 +1177,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
-			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			 !tcp_peer_is_proven(req, dst)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
-- 
cgit v1.2.3


From 51c5d0c4b169bf762f09e0d5b283a7f0b2a45739 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 00:49:14 -0700
Subject: tcp: Maintain dynamic metrics in local cache.

Maintain a local hash table of TCP dynamic metrics blobs.

Computed TCP metrics are no longer maintained in the route metrics.

The table uses RCU and an extremely simple hash so that it has low
latency and low overhead.  A simple hash is legitimate because we only
make metrics blobs for fully established connections.

Some tweaking of the default hash table sizes, metric timeouts, and
the hash chain length limit certainly could use some tweaking.  But
the basic design seems sound.

With help from Eric Dumazet and Joe Perches.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h |   3 +
 include/net/tcp.h        |   1 +
 net/ipv4/tcp.c           |   2 +
 net/ipv4/tcp_metrics.c   | 555 +++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 468 insertions(+), 93 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 599e48fa97cb..2e089a99d603 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
 
 #include <net/inet_frag.h>
 
+struct tcpm_hash_bucket;
 struct ctl_table_header;
 struct ipv4_devconf;
 struct fib_rules_ops;
@@ -39,6 +40,8 @@ struct netns_ipv4 {
 	struct sock		**icmp_sk;
 	struct sock		*tcp_sock;
 	struct inet_peer_base	*peers;
+	struct tcpm_hash_bucket	*tcp_metrics_hash;
+	unsigned int		tcp_metrics_hash_mask;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5478356ea8c5..0900d63d1627 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_metrics_init(void);
 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..29aa0c800cd0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3563,6 +3563,8 @@ void __init tcp_init(void)
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
+	tcp_metrics_init();
+
 	tcp_register_congestion_control(&tcp_reno);
 
 	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9afe703c85cc..56223bab251b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,134 +1,431 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
 #include <linux/tcp.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
 #include <net/request_sock.h>
+#include <net/inetpeer.h>
 #include <net/sock.h>
+#include <net/ipv6.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 
 int sysctl_tcp_nometrics_save __read_mostly;
 
+enum tcp_metric_index {
+	TCP_METRIC_RTT,
+	TCP_METRIC_RTTVAR,
+	TCP_METRIC_SSTHRESH,
+	TCP_METRIC_CWND,
+	TCP_METRIC_REORDERING,
+
+	/* Always last.  */
+	TCP_METRIC_MAX,
+};
+
+struct tcp_metrics_block {
+	struct tcp_metrics_block __rcu	*tcpm_next;
+	struct inetpeer_addr		tcpm_addr;
+	unsigned long			tcpm_stamp;
+	u32				tcpm_lock;
+	u32				tcpm_vals[TCP_METRIC_MAX];
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+			      enum tcp_metric_index idx)
+{
+	return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+			  enum tcp_metric_index idx)
+{
+	return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+				  enum tcp_metric_index idx)
+{
+	return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+			   enum tcp_metric_index idx,
+			   u32 val)
+{
+	tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+				 enum tcp_metric_index idx,
+				 u32 val)
+{
+	tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+		      const struct inetpeer_addr *b)
+{
+	const struct in6_addr *a6, *b6;
+
+	if (a->family != b->family)
+		return false;
+	if (a->family == AF_INET)
+		return a->addr.a4 == b->addr.a4;
+
+	a6 = (const struct in6_addr *) &a->addr.a6[0];
+	b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+	return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+	struct tcp_metrics_block __rcu	*chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	u32 val;
+
+	val = 0;
+	if (dst_metric_locked(dst, RTAX_RTT))
+		val |= 1 << TCP_METRIC_RTT;
+	if (dst_metric_locked(dst, RTAX_RTTVAR))
+		val |= 1 << TCP_METRIC_RTTVAR;
+	if (dst_metric_locked(dst, RTAX_SSTHRESH))
+		val |= 1 << TCP_METRIC_SSTHRESH;
+	if (dst_metric_locked(dst, RTAX_CWND))
+		val |= 1 << TCP_METRIC_CWND;
+	if (dst_metric_locked(dst, RTAX_REORDERING))
+		val |= 1 << TCP_METRIC_REORDERING;
+	tm->tcpm_lock = val;
+
+	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+}
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+					  struct inetpeer_addr *addr,
+					  unsigned int hash,
+					  bool reclaim)
+{
+	struct tcp_metrics_block *tm;
+	struct net *net;
+
+	spin_lock_bh(&tcp_metrics_lock);
+	net = dev_net(dst->dev);
+	if (unlikely(reclaim)) {
+		struct tcp_metrics_block *oldest;
+
+		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+		for (tm = rcu_dereference(oldest->tcpm_next); tm;
+		     tm = rcu_dereference(tm->tcpm_next)) {
+			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+				oldest = tm;
+		}
+		tm = oldest;
+	} else {
+		tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+		if (!tm)
+			goto out_unlock;
+	}
+	tm->tcpm_addr = *addr;
+	tm->tcpm_stamp = jiffies;
+
+	tcpm_suck_dst(tm, dst);
+
+	if (likely(!reclaim)) {
+		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+	}
+
+out_unlock:
+	spin_unlock_bh(&tcp_metrics_lock);
+	return tm;
+}
+
+#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+		tcpm_suck_dst(tm, dst);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH	5
+#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+	if (tm)
+		return tm;
+	if (depth > TCP_METRICS_RECLAIM_DEPTH)
+		return TCP_METRICS_RECLAIM_PTR;
+	return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+						   struct net *net, unsigned int hash)
+{
+	struct tcp_metrics_block *tm;
+	int depth = 0;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, addr))
+			break;
+		depth++;
+	}
+	return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+						       struct dst_entry *dst)
+{
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+
+	addr.family = req->rsk_ops->family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = inet_rsk(req)->rmt_addr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = dev_net(dst->dev);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, &addr))
+			break;
+	}
+	tcpm_check_stamp(tm, dst);
+	return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+						 struct dst_entry *dst,
+						 bool create)
+{
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+	bool reclaim;
+
+	addr.family = sk->sk_family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = inet_sk(sk)->inet_daddr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = dev_net(dst->dev);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	tm = __tcp_get_metrics(&addr, net, hash);
+	reclaim = false;
+	if (tm == TCP_METRICS_RECLAIM_PTR) {
+		reclaim = true;
+		tm = NULL;
+	}
+	if (!tm && create)
+		tm = tcpm_new(dst, &addr, hash, reclaim);
+	else
+		tcpm_check_stamp(tm, dst);
+
+	return tm;
+}
+
 /* Save metrics learned by this TCP session.  This function is called
  * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
  * or goes from LAST-ACK to CLOSE.
  */
 void tcp_update_metrics(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	unsigned long rtt;
+	u32 val;
+	int m;
 
-	if (sysctl_tcp_nometrics_save)
+	if (sysctl_tcp_nometrics_save || !dst)
 		return;
 
-	if (dst && (dst->flags & DST_HOST)) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		int m;
-		unsigned long rtt;
-
+	if (dst->flags & DST_HOST)
 		dst_confirm(dst);
 
-		if (icsk->icsk_backoff || !tp->srtt) {
-			/* This session failed to estimate rtt. Why?
-			 * Probably, no packets returned in time.
-			 * Reset our results.
-			 */
-			if (!(dst_metric_locked(dst, RTAX_RTT)))
-				dst_metric_set(dst, RTAX_RTT, 0);
-			return;
-		}
+	rcu_read_lock();
+	if (icsk->icsk_backoff || !tp->srtt) {
+		/* This session failed to estimate rtt. Why?
+		 * Probably, no packets returned in time.  Reset our
+		 * results.
+		 */
+		tm = tcp_get_metrics(sk, dst, false);
+		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+		goto out_unlock;
+	} else
+		tm = tcp_get_metrics(sk, dst, true);
 
-		rtt = dst_metric_rtt(dst, RTAX_RTT);
-		m = rtt - tp->srtt;
+	if (!tm)
+		goto out_unlock;
 
-		/* If newly calculated rtt larger than stored one,
-		 * store new one. Otherwise, use EWMA. Remember,
-		 * rtt overestimation is always better than underestimation.
-		 */
-		if (!(dst_metric_locked(dst, RTAX_RTT))) {
-			if (m <= 0)
-				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
-			else
-				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
-		}
+	rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+	m = rtt - tp->srtt;
 
-		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
-			unsigned long var;
-			if (m < 0)
-				m = -m;
+	/* If newly calculated rtt larger than stored one, store new
+	 * one. Otherwise, use EWMA. Remember, rtt overestimation is
+	 * always better than underestimation.
+	 */
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+		if (m <= 0)
+			rtt = tp->srtt;
+		else
+			rtt -= (m >> 3);
+		tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+	}
 
-			/* Scale deviation to rttvar fixed point */
-			m >>= 1;
-			if (m < tp->mdev)
-				m = tp->mdev;
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+		unsigned long var;
 
-			var = dst_metric_rtt(dst, RTAX_RTTVAR);
-			if (m >= var)
-				var = m;
-			else
-				var -= (var - m) >> 2;
+		if (m < 0)
+			m = -m;
 
-			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
-		}
+		/* Scale deviation to rttvar fixed point */
+		m >>= 1;
+		if (m < tp->mdev)
+			m = tp->mdev;
 
-		if (tcp_in_initial_slowstart(tp)) {
-			/* Slow start still did not finish. */
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
-			if (!dst_metric_locked(dst, RTAX_CWND) &&
-			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
-		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   icsk->icsk_ca_state == TCP_CA_Open) {
-			/* Cong. avoidance phase, cwnd is reliable. */
-			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH,
-					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_cwnd) >> 1);
-		} else {
-			/* Else slow start did not finish, cwnd is non-sense,
-			   ssthresh may be also invalid.
-			 */
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_ssthresh) >> 1);
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
-		}
+		var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+		if (m >= var)
+			var = m;
+		else
+			var -= (var - m) >> 2;
 
-		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+		tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+	}
+
+	if (tcp_in_initial_slowstart(tp)) {
+		/* Slow start still did not finish. */
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && (tp->snd_cwnd >> 1) > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_cwnd >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			if (tp->snd_cwnd > val)
+				tcp_metric_set(tm, TCP_METRIC_CWND,
+					       tp->snd_cwnd);
+		}
+	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+		   icsk->icsk_ca_state == TCP_CA_Open) {
+		/* Cong. avoidance phase, cwnd is reliable. */
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1);
+		}
+	} else {
+		/* Else slow start did not finish, cwnd is non-sense,
+		 * ssthresh may be also invalid.
+		 */
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND,
+				       (val + tp->snd_ssthresh) >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && tp->snd_ssthresh > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_ssthresh);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+			if (val < tp->reordering &&
 			    tp->reordering != sysctl_tcp_reordering)
-				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+				tcp_metric_set(tm, TCP_METRIC_REORDERING,
+					       tp->reordering);
 		}
 	}
+	tm->tcpm_stamp = jiffies;
+out_unlock:
+	rcu_read_unlock();
 }
 
 /* Initialize metrics on socket. */
 
 void tcp_init_metrics(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	u32 val;
 
 	if (dst == NULL)
 		goto reset;
 
 	dst_confirm(dst);
 
-	if (dst_metric_locked(dst, RTAX_CWND))
-		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-	if (dst_metric(dst, RTAX_SSTHRESH)) {
-		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (!tm) {
+		rcu_read_unlock();
+		goto reset;
+	}
+
+	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	if (val) {
+		tp->snd_ssthresh = val;
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 			tp->snd_ssthresh = tp->snd_cwnd_clamp;
 	} else {
@@ -137,16 +434,18 @@ void tcp_init_metrics(struct sock *sk)
 		 */
 		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	}
-	if (dst_metric(dst, RTAX_REORDERING) &&
-	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+	if (val && tp->reordering != val) {
 		tcp_disable_fack(tp);
 		tcp_disable_early_retrans(tp);
-		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+		tp->reordering = val;
 	}
 
-	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+	val = tcp_metric_get(tm, TCP_METRIC_RTT);
+	if (val == 0 || tp->srtt == 0) {
+		rcu_read_unlock();
 		goto reset;
-
+	}
 	/* Initial rtt is determined from SYN,SYN-ACK.
 	 * The segment is small and rtt may appear much
 	 * less than real one. Use per-dst memory
@@ -161,14 +460,18 @@ void tcp_init_metrics(struct sock *sk)
 	 * to low value, and then abruptly stops to do it and starts to delay
 	 * ACKs, wait for troubles.
 	 */
-	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
-		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+	val = msecs_to_jiffies(val);
+	if (val > tp->srtt) {
+		tp->srtt = val;
 		tp->rtt_seq = tp->snd_nxt;
 	}
-	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
-		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+	if (val > tp->mdev) {
+		tp->mdev = val;
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
+	rcu_read_unlock();
+
 	tcp_set_rto(sk);
 reset:
 	if (tp->srtt == 0) {
@@ -195,8 +498,74 @@ reset:
 
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 {
+	struct tcp_metrics_block *tm;
+	bool ret;
+
 	if (!dst)
 		return false;
-	return dst_metric(dst, RTAX_RTT) ? true : false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_req(req, dst);
+	if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
+		ret = true;
+	else
+		ret = false;
+	rcu_read_unlock();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+static unsigned long tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+	ssize_t ret;
+
+	if (!str)
+		return 0;
+
+	ret = kstrtoul(str, 0, &tcpmhash_entries);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+	int slots, size;
+
+	slots = tcpmhash_entries;
+	if (!slots) {
+		if (totalram_pages >= 128 * 1024)
+			slots = 16 * 1024;
+		else
+			slots = 8 * 1024;
+	}
+
+	size = slots * sizeof(struct tcpm_hash_bucket);
+
+	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+	if (!net->ipv4.tcp_metrics_hash)
+		return -ENOMEM;
+
+	net->ipv4.tcp_metrics_hash_mask = (slots - 1);
+
+	return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+	kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+	.init	=	tcp_net_metrics_init,
+	.exit	=	tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+	register_pernet_subsys(&tcp_net_metrics_ops);
+}
-- 
cgit v1.2.3


From 794785bf12d5d6ad7f557d78d203bb0bbfcd8da2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 00:52:56 -0700
Subject: net: Don't report route RTT metric value in cache dumps.

We don't maintain it dynamically any longer, so reporting it would
be extremely misleading.  Report zero instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_route.c | 11 +++++------
 net/ipv4/route.c      | 22 ++++++++++------------
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6e74b3f110bc..707027fae8ab 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1812,12 +1812,11 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
 	char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
 
 	seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
-			rt->dst.dev ? rt->dst.dev->name : "*",
-			dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
-			dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
-			atomic_read(&rt->dst.__refcnt),
-			rt->dst.__use,
-			(int) dst_metric(&rt->dst, RTAX_RTT));
+		   rt->dst.dev ? rt->dst.dev->name : "*",
+		   dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
+		   dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
+		   atomic_read(&rt->dst.__refcnt),
+		   rt->dst.__use, 0);
 	return 0;
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 72e88c208025..d02c91177d32 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -423,18 +423,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 		int len;
 
 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			r->dst.dev ? r->dst.dev->name : "*",
-			(__force u32)r->rt_dst,
-			(__force u32)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->dst.__refcnt),
-			r->dst.__use, 0, (__force u32)r->rt_src,
-			dst_metric_advmss(&r->dst) + 40,
-			dst_metric(&r->dst, RTAX_WINDOW),
-			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
-			      dst_metric(&r->dst, RTAX_RTTVAR)),
-			r->rt_key_tos,
-			-1, 0, 0, &len);
+			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+			   r->dst.dev ? r->dst.dev->name : "*",
+			   (__force u32)r->rt_dst,
+			   (__force u32)r->rt_gateway,
+			   r->rt_flags, atomic_read(&r->dst.__refcnt),
+			   r->dst.__use, 0, (__force u32)r->rt_src,
+			   dst_metric_advmss(&r->dst) + 40,
+			   dst_metric(&r->dst, RTAX_WINDOW), 0,
+			   r->rt_key_tos,
+			   -1, 0, 0, &len);
 
 		seq_printf(seq, "%*s\n", 127 - len, "");
 	}
-- 
cgit v1.2.3


From 81166dd6fa8eb780b2132d32fbc77eb6ac04e44e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 03:14:24 -0700
Subject: tcp: Move timestamps from inetpeer to metrics cache.

With help from Lin Ming.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h   |   4 +-
 include/net/tcp.h        |   5 +-
 net/ipv4/inetpeer.c      |   1 -
 net/ipv4/route.c         |   8 +--
 net/ipv4/tcp_ipv4.c      |  30 ++---------
 net/ipv4/tcp_metrics.c   | 136 +++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_minisocks.c |  46 ----------------
 net/ipv6/route.c         |  13 +----
 net/ipv6/tcp_ipv6.c      |  33 ++----------
 9 files changed, 149 insertions(+), 127 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index c27c8f10ebdc..1119f6f6cdb4 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -46,15 +46,13 @@ struct inet_peer {
 	};
 	/*
 	 * Once inet_peer is queued for deletion (refcnt == -1), following fields
-	 * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp
+	 * are not available: rid, ip_id_count
 	 * We can share memory with rcu_head to help keep inet_peer small.
 	 */
 	union {
 		struct {
 			atomic_t			rid;		/* Frag reception counter */
 			atomic_t			ip_id_count;	/* IP ID for the next packet */
-			__u32				tcp_ts;
-			__u32				tcp_ts_stamp;
 		};
 		struct rcu_head         rcu;
 		struct inet_peer	*gc_next;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0900d63d1627..3618fefae049 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -390,7 +390,10 @@ extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
 extern void tcp_metrics_init(void);
-extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
+extern bool tcp_remember_stamp(struct sock *sk);
+extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da90a8cab614..f457bcb41350 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -508,7 +508,6 @@ relookup:
 				(daddr->family == AF_INET) ?
 					secure_ip_id(daddr->addr.a4) :
 					secure_ipv6_id(daddr->addr.a6));
-		p->tcp_ts_stamp = 0;
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
 		p->rate_last = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d02c91177d32..78d81543766d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2846,7 +2846,7 @@ static int rt_fill_info(struct net *net,
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
-	u32 id = 0, ts = 0, tsage = 0, error;
+	u32 id = 0, error;
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
@@ -2903,10 +2903,6 @@ static int rt_fill_info(struct net *net,
 		const struct inet_peer *peer = rt_peer_ptr(rt);
 		inet_peer_refcheck(peer);
 		id = atomic_read(&peer->ip_id_count) & 0xffff;
-		if (peer->tcp_ts_stamp) {
-			ts = peer->tcp_ts;
-			tsage = get_seconds() - peer->tcp_ts_stamp;
-		}
 		expires = ACCESS_ONCE(peer->pmtu_expires);
 		if (expires) {
 			if (time_before(jiffies, expires))
@@ -2942,7 +2938,7 @@ static int rt_fill_info(struct net *net,
 				goto nla_put_failure;
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, 0, 0,
 			       expires, error) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9312a8f33a1..d406bf7f37d9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 
 	if (tcp_death_row.sysctl_tw_recycle &&
-	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
-		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
-		/*
-		 * VJ's idea. We save last timestamp seen from
-		 * the destination in peer table, when entering state
-		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-		 * when trying new connection.
-		 */
-		if (peer) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-				tp->rx_opt.ts_recent = peer->tcp_ts;
-			}
-		}
-	}
+	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+		tcp_fetch_timewait_stamp(sk, &rt->dst);
 
 	inet->inet_dport = usin->sin_port;
 	inet->inet_daddr = daddr;
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
-		struct inet_peer *peer = NULL;
 		struct flowi4 fl4;
 
 		/* VJ's idea. We save last timestamp seen
@@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
 		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
-		    fl4.daddr == saddr &&
-		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-			    (s32)(peer->tcp_ts - req->ts_recent) >
-							TCP_PAWS_WINDOW) {
+		    fl4.daddr == saddr) {
+			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 (!peer || !peer->tcp_ts_stamp) &&
-			 !tcp_peer_is_proven(req, dst)) {
+			 !tcp_peer_is_proven(req, dst, false)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 56223bab251b..1fd83d3118fe 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -34,6 +34,8 @@ struct tcp_metrics_block {
 	struct tcp_metrics_block __rcu	*tcpm_next;
 	struct inetpeer_addr		tcpm_addr;
 	unsigned long			tcpm_stamp;
+	u32				tcpm_ts;
+	u32				tcpm_ts_stamp;
 	u32				tcpm_lock;
 	u32				tcpm_vals[TCP_METRIC_MAX];
 };
@@ -114,6 +116,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
 	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
 	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+	tm->tcpm_ts = 0;
+	tm->tcpm_ts_stamp = 0;
 }
 
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -230,6 +234,45 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 	return tm;
 }
 
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+	struct inet6_timewait_sock *tw6;
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+
+	addr.family = tw->tw_family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = tw->tw_daddr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		tw6 = inet6_twsk((struct sock *)tw);
+		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = twsk_net(tw);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, &addr))
+			break;
+	}
+	return tm;
+}
+
 static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 						 struct dst_entry *dst,
 						 bool create)
@@ -496,7 +539,7 @@ reset:
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
 {
 	struct tcp_metrics_block *tm;
 	bool ret;
@@ -506,16 +549,99 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 
 	rcu_read_lock();
 	tm = __tcp_get_metrics_req(req, dst);
-	if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
-		ret = true;
-	else
-		ret = false;
+	if (paws_check) {
+		if (tm &&
+		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+			ret = false;
+		else
+			ret = true;
+	} else {
+		if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+			ret = true;
+		else
+			ret = false;
+	}
 	rcu_read_unlock();
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
 
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (tm) {
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+			tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+			tp->rx_opt.ts_recent = tm->tcpm_ts;
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	bool ret = false;
+
+	if (dst) {
+		struct tcp_metrics_block *tm;
+
+		rcu_read_lock();
+		tm = tcp_get_metrics(sk, dst, true);
+		if (tm) {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+				tm->tcpm_ts = tp->rx_opt.ts_recent;
+			}
+			ret = true;
+		}
+		rcu_read_unlock();
+	}
+	return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+	struct tcp_metrics_block *tm;
+	bool ret = false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_tw(tw);
+	if (tw) {
+		const struct tcp_timewait_sock *tcptw;
+		struct sock *sk = (struct sock *) tw;
+
+		tcptw = tcp_twsk(sk);
+		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+			tm->tcpm_ts	   = tcptw->tw_ts_recent;
+		}
+		ret = true;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static unsigned long tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63b1a39..a51aa534dab1 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static bool tcp_remember_stamp(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct inet_peer *peer;
-
-	peer = icsk->icsk_af_ops->get_peer(sk);
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
-			peer->tcp_ts = tp->rx_opt.ts_recent;
-		}
-		return true;
-	}
-
-	return false;
-}
-
-static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
-	const struct tcp_timewait_sock *tcptw;
-	struct sock *sk = (struct sock *) tw;
-	struct inet_peer *peer;
-
-	tcptw = tcp_twsk(sk);
-	peer = tcptw->tw_peer;
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
-			peer->tcp_ts	   = tcptw->tw_ts_recent;
-		}
-		return true;
-	}
-	return false;
-}
-
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6cc6c881f54f..0c0684753781 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net,
 			 int iif, int type, u32 pid, u32 seq,
 			 int prefix, int nowait, unsigned int flags)
 {
-	const struct inet_peer *peer;
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
 	long expires;
 	u32 table;
 	struct neighbour *n;
-	u32 ts, tsage;
 
 	if (prefix) {	/* user wants prefix routes only */
 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2473,16 +2471,7 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	peer = NULL;
-	if (rt6_has_peer(rt))
-		peer = rt6_peer_ptr(rt);
-	ts = tsage = 0;
-	if (peer && peer->tcp_ts_stamp) {
-		ts = peer->tcp_ts;
-		tsage = get_seconds() - peer->tcp_ts_stamp;
-	}
-
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
 			       expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 75d179555c28..9e96b5f21d2a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	rt = (struct rt6_info *) dst;
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
-	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
-		struct inet_peer *peer = rt6_get_peer(rt);
-		/*
-		 * VJ's idea. We save last timestamp seen from
-		 * the destination in peer table, when entering state
-		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-		 * when trying new connection.
-		 */
-		if (peer) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-				tp->rx_opt.ts_recent = peer->tcp_ts;
-			}
-		}
-	}
+	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
+		tcp_fetch_timewait_stamp(sk, dst);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
@@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		treq->iif = inet6_iif(skb);
 
 	if (!isn) {
-		struct inet_peer *peer = NULL;
-
 		if (ipv6_opt_accepted(sk, skb) ||
 		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL &&
-		    (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
-		    ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
-				    &treq->rmt_addr)) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-			    (s32)(peer->tcp_ts - req->ts_recent) >
-							TCP_PAWS_WINDOW) {
+		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
+			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 (!peer || !peer->tcp_ts_stamp) &&
-			 !tcp_peer_is_proven(req, dst)) {
+			 !tcp_peer_is_proven(req, dst, false)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
-- 
cgit v1.2.3


From b6242b9b45e84ef71c59002cd128c3197938cb2f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 03:27:56 -0700
Subject: tcp: Remove tw->tw_peer

No longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  1 -
 net/ipv4/tcp_minisocks.c | 16 ++--------------
 2 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7d3bcedc062a..2de9cf46f9fc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,7 +506,6 @@ struct tcp_timewait_sock {
 	u32			  tw_rcv_wnd;
 	u32			  tw_ts_recent;
 	long			  tw_ts_recent_stamp;
-	struct inet_peer	  *tw_peer;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a51aa534dab1..65608863fdee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -267,12 +267,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
-	bool recycle_on = false;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
-		recycle_on = true;
-	}
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
@@ -281,7 +278,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 		struct inet_sock *inet = inet_sk(sk);
-		struct inet_peer *peer = NULL;
 
 		tw->tw_transparent	= inet->transparent;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
@@ -305,12 +301,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		}
 #endif
 
-		if (recycle_on)
-			peer = icsk->icsk_af_ops->get_peer(sk);
-		tcptw->tw_peer = peer;
-		if (peer)
-			atomic_inc(&peer->refcnt);
-
 #ifdef CONFIG_TCP_MD5SIG
 		/*
 		 * The timewait bucket does not have the key DB from the
@@ -362,11 +352,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 
 void tcp_twsk_destructor(struct sock *sk)
 {
+#ifdef CONFIG_TCP_MD5SIG
 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
 
-	if (twsk->tw_peer)
-		inet_putpeer(twsk->tw_peer);
-#ifdef CONFIG_TCP_MD5SIG
 	if (twsk->tw_md5_key) {
 		tcp_free_md5sig_pool();
 		kfree_rcu(twsk->tw_md5_key, rcu);
-- 
cgit v1.2.3


From 16d1839907e695387654901995f9286b65fbbc6a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 03:32:59 -0700
Subject: inet: Remove ->get_peer() method.

No longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  1 -
 net/ipv4/tcp_ipv4.c                | 16 ----------------
 net/ipv6/tcp_ipv6.c                | 16 ----------------
 3 files changed, 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index af3c743a40e4..291e7cee14e7 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops {
 	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req,
 				      struct dst_entry *dst);
-	struct inet_peer *(*get_peer)(struct sock *sk);
 	u16	    net_header_len;
 	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d406bf7f37d9..ddefd39ac0cf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1847,21 +1847,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-struct inet_peer *tcp_v4_get_peer(struct sock *sk)
-{
-	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
-	struct inet_sock *inet = inet_sk(sk);
-
-	/* If we don't have a valid cached route, or we're doing IP
-	 * options which make the IPv4 header destination address
-	 * different from our peer's, do not bother with this.
-	 */
-	if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
-		return NULL;
-	return rt_get_peer_create(rt, inet->inet_daddr);
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
@@ -1874,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.rebuild_header	   = inet_sk_rebuild_header,
 	.conn_request	   = tcp_v4_conn_request,
 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
-	.get_peer	   = tcp_v4_get_peer,
 	.net_header_len	   = sizeof(struct iphdr),
 	.setsockopt	   = ip_setsockopt,
 	.getsockopt	   = ip_getsockopt,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9e96b5f21d2a..61175cb2478f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1689,20 +1689,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
-{
-	struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-
-	/* If we don't have a valid cached route, or we're doing IP
-	 * options which make the IPv6 header destination address
-	 * different from our peer's, do not bother with this.
-	 */
-	if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr))
-		return NULL;
-	return rt6_get_peer_create(rt);
-}
-
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
@@ -1715,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.rebuild_header	   = inet6_sk_rebuild_header,
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
-	.get_peer	   = tcp_v6_get_peer,
 	.net_header_len	   = sizeof(struct ipv6hdr),
 	.net_frag_header_len = sizeof(struct frag_hdr),
 	.setsockopt	   = ipv6_setsockopt,
@@ -1747,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.rebuild_header	   = inet_sk_rebuild_header,
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
-	.get_peer	   = tcp_v4_get_peer,
 	.net_header_len	   = sizeof(struct iphdr),
 	.setsockopt	   = ipv6_setsockopt,
 	.getsockopt	   = ipv6_getsockopt,
-- 
cgit v1.2.3


From 1d861aa4b3fb08822055345f480850205ffe6170 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 03:58:16 -0700
Subject: inet: Minimize use of cached route inetpeer.

Only use it in the absolutely required cases:

1) COW'ing metrics

2) ipv4 PMTU

3) ipv4 redirects

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c       |  3 ++-
 net/ipv4/route.c      | 32 ++++++++++++++++----------------
 net/ipv6/icmp.c       |  4 +++-
 net/ipv6/ip6_output.c | 10 ++++++++--
 net/ipv6/ndisc.c      |  8 ++++++--
 5 files changed, 35 insertions(+), 22 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4bce5a2830aa..4a049449305f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -254,9 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 
 	/* Limit if icmp type is enabled in ratemask. */
 	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
-		struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr);
+		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
 		rc = inet_peer_xrlim_allow(peer,
 					   net->ipv4.sysctl_icmp_ratelimit);
+		inet_putpeer(peer);
 	}
 out:
 	return rc;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 78d81543766d..e376354dcb65 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1289,20 +1289,15 @@ static void ip_select_fb_ident(struct iphdr *iph)
 
 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
-	struct rtable *rt = (struct rtable *) dst;
-
-	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
+	struct net *net = dev_net(dst->dev);
+	struct inet_peer *peer;
 
-		/* If peer is attached to destination, it is never detached,
-		   so that we need not to grab a lock to dereference it.
-		 */
-		if (peer) {
-			iph->id = htons(inet_getid(peer, more));
-			return;
-		}
-	} else if (!rt)
-		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
+	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
+	if (peer) {
+		iph->id = htons(inet_getid(peer, more));
+		inet_putpeer(peer);
+		return;
+	}
 
 	ip_select_fb_ident(iph);
 }
@@ -1492,6 +1487,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	struct rtable *rt = skb_rtable(skb);
 	struct in_device *in_dev;
 	struct inet_peer *peer;
+	struct net *net;
 	int log_martians;
 
 	rcu_read_lock();
@@ -1503,7 +1499,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 	rcu_read_unlock();
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
+	net = dev_net(rt->dst.dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 	if (!peer) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 		return;
@@ -1520,7 +1517,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	 */
 	if (peer->rate_tokens >= ip_rt_redirect_number) {
 		peer->rate_last = jiffies;
-		return;
+		goto out_put_peer;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
@@ -1541,6 +1538,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 					     &rt->rt_dst, &rt->rt_gateway);
 #endif
 	}
+out_put_peer:
+	inet_putpeer(peer);
 }
 
 static int ip_error(struct sk_buff *skb)
@@ -1583,7 +1582,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	}
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 
 	send = true;
 	if (peer) {
@@ -1596,6 +1595,7 @@ static int ip_error(struct sk_buff *skb)
 			peer->rate_tokens -= ip_rt_error_cost;
 		else
 			send = false;
+		inet_putpeer(peer);
 	}
 	if (send)
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c7da1422cbde..a113f7d7e938 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -194,8 +194,10 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		peer = rt6_get_peer_create(rt);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c6af5963a202..5b2d63ed793e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -466,13 +466,15 @@ int ip6_forward(struct sk_buff *skb)
 		else
 			target = &hdr->daddr;
 
-		peer = rt6_get_peer_create(rt);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 
 		/* Limit redirects both by destination (here)
 		   and by source (inside ndisc_send_redirect)
 		 */
 		if (inet_peer_xrlim_allow(peer, 1*HZ))
 			ndisc_send_redirect(skb, target);
+		if (peer)
+			inet_putpeer(peer);
 	} else {
 		int addrtype = ipv6_addr_type(&hdr->saddr);
 
@@ -592,10 +594,14 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	int old, new;
 
 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
-		struct inet_peer *peer = rt6_get_peer_create(rt);
+		struct inet_peer *peer;
+		struct net *net;
 
+		net = dev_net(rt->dst.dev);
+		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
 		if (peer) {
 			fhdr->identification = htonl(inet_getid(peer, 0));
+			inet_putpeer(peer);
 			return;
 		}
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69a6330dea91..0fddd571400d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1486,6 +1486,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	int rd_len;
 	int err;
 	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+	bool ret;
 
 	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
 		ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
@@ -1519,8 +1520,11 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 			  "Redirect: destination is not a neighbour\n");
 		goto release;
 	}
-	peer = rt6_get_peer_create(rt);
-	if (!inet_peer_xrlim_allow(peer, 1*HZ))
+	peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+	ret = inet_peer_xrlim_allow(peer, 1*HZ);
+	if (peer)
+		inet_putpeer(peer);
+	if (!ret)
 		goto release;
 
 	if (dev->addr_len) {
-- 
cgit v1.2.3


From 3e12939a2a67fbb4cbd962c3b9bc398c73319766 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 04:01:57 -0700
Subject: inet: Kill FLOWI_FLAG_PRECOW_METRICS.

No longer needed.  TCP writes metrics, but now in it's own special
cache that does not dirty the route metrics.  Therefore there is no
longer any reason to pre-cow metrics in this way.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h              |  5 ++---
 include/net/inet_sock.h         |  2 --
 include/net/route.h             |  2 --
 net/ipv4/inet_connection_sock.c |  2 +-
 net/ipv4/route.c                | 11 ++---------
 net/ipv6/route.c                |  2 +-
 6 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/flow.h b/include/net/flow.h
index bd524f598561..ce9cb7656b47 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -20,9 +20,8 @@ struct flowi_common {
 	__u8	flowic_proto;
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
-#define FLOWI_FLAG_PRECOW_METRICS	0x02
-#define FLOWI_FLAG_CAN_SLEEP		0x04
-#define FLOWI_FLAG_RT_NOCACHE		0x08
+#define FLOWI_FLAG_CAN_SLEEP		0x02
+#define FLOWI_FLAG_RT_NOCACHE		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ae17e1352d7e..924d7b98ab60 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 
 	if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
 		flags |= FLOWI_FLAG_ANYSRC;
-	if (sk->sk_protocol == IPPROTO_TCP)
-		flags |= FLOWI_FLAG_PRECOW_METRICS;
 	return flags;
 }
 
diff --git a/include/net/route.h b/include/net/route.h
index 211e2665139b..635d7a99d199 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -278,8 +278,6 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
-	if (protocol == IPPROTO_TCP)
-		flow_flags |= FLOWI_FLAG_PRECOW_METRICS;
 	if (can_sleep)
 		flow_flags |= FLOWI_FLAG_CAN_SLEEP;
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 034ddbe42adf..76825be3b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -375,7 +375,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct ip_options_rcu *opt = inet_rsk(req)->opt;
 	struct net *net = sock_net(sk);
-	int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
+	int flags = inet_sk_flowi_flags(sk);
 
 	if (nocache)
 		flags |= FLOWI_FLAG_RT_NOCACHE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e376354dcb65..d4834e2914a0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1658,7 +1658,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 	struct rtable *rt;
 
 	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
+			   protocol, flow_flags,
 			   iph->daddr, iph->saddr, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
@@ -1836,18 +1836,11 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 {
 	struct inet_peer_base *base;
 	struct inet_peer *peer;
-	int create = 0;
-
-	/* If a peer entry exists for this destination, we must hook
-	 * it up in order to get at cached metrics.
-	 */
-	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
-		create = 1;
 
 	base = inetpeer_base_ptr(rt->_peer);
 	BUG_ON(!base);
 
-	peer = inet_getpeer_v4(base, rt->rt_dst, create);
+	peer = inet_getpeer_v4(base, rt->rt_dst, 0);
 	if (peer) {
 		__rt_set_peer(rt, peer);
 		rt->rt_peer_genid = rt_peer_genid();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0c0684753781..b7eb51e1a0e1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1093,7 +1093,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_oif = oif;
 	fl6.flowi6_mark = mark;
-	fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
+	fl6.flowi6_flags = 0;
 	fl6.daddr = iph->daddr;
 	fl6.saddr = iph->saddr;
 	fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
-- 
cgit v1.2.3


From 87a50699cb6d169591cc776fb82683a2c77cecac Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 05:06:14 -0700
Subject: rtnetlink: Remove ts/tsage args to rtnl_put_cacheinfo().

Nobody provides non-zero values any longer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 3 +--
 net/core/rtnetlink.c      | 4 +---
 net/decnet/dn_route.c     | 2 +-
 net/ipv4/route.c          | 3 +--
 net/ipv6/route.c          | 3 +--
 5 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ea60b0854109..db71c4ad8624 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
 extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
-			      u32 id, u32 ts, u32 tsage, long expires,
-			      u32 error);
+			      u32 id, long expires, u32 error);
 
 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2b325c340b44..64127eee786d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -615,7 +615,7 @@ nla_put_failure:
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 
 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
-		       u32 ts, u32 tsage, long expires, u32 error)
+		       long expires, u32 error)
 {
 	struct rta_cacheinfo ci = {
 		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
@@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		.rta_clntref = atomic_read(&(dst->__refcnt)),
 		.rta_error = error,
 		.rta_id =  id,
-		.rta_ts = ts,
-		.rta_tsage = tsage,
 	};
 
 	if (expires)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 707027fae8ab..b5594cc73ee1 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 		goto errout;
 
 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
 			       rt->dst.error) < 0)
 		goto errout;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d4834e2914a0..67b08745daf9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2931,8 +2931,7 @@ static int rt_fill_info(struct net *net,
 				goto nla_put_failure;
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, id, 0, 0,
-			       expires, error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, expires, error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b7eb51e1a0e1..563f12c1c99c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2471,8 +2471,7 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
-			       expires, rt->dst.error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
-- 
cgit v1.2.3


From 5943634fc5592037db0693b261f7f4bea6bb9457 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 06:58:42 -0700
Subject: ipv4: Maintain redirect and PMTU info in struct rtable again.

Maintaining this in the inetpeer entries was not the right way to do
this at all.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h  |   4 --
 include/net/route.h     |   2 +-
 net/ipv4/inetpeer.c     |   3 -
 net/ipv4/route.c        | 185 ++++++++++--------------------------------------
 net/ipv4/xfrm4_policy.c |   1 +
 5 files changed, 41 insertions(+), 154 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 1119f6f6cdb4..53f464d7cddc 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -36,10 +36,6 @@ struct inet_peer {
 	u32			metrics[RTAX_MAX];
 	u32			rate_tokens;	/* rate limiting for ICMP */
 	unsigned long		rate_last;
-	unsigned long		pmtu_expires;
-	u32			pmtu_orig;
-	u32			pmtu_learned;
-	struct inetpeer_addr_base redirect_learned;
 	union {
 		struct list_head	gc_list;
 		struct rcu_head     gc_rcu;
diff --git a/include/net/route.h b/include/net/route.h
index 635d7a99d199..c27449466d18 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -65,7 +65,7 @@ struct rtable {
 	__be32			rt_gateway;
 
 	/* Miscellaneous cached information */
-	u32			rt_peer_genid;
+	u32			rt_pmtu;
 	unsigned long		_peer; /* long-living peer info */
 	struct fib_info		*fi; /* for client ref to shared metrics */
 };
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index f457bcb41350..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -511,9 +511,6 @@ relookup:
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
 		p->rate_last = 0;
-		p->pmtu_expires = 0;
-		p->pmtu_orig = 0;
-		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 67b08745daf9..677d65253e4c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -669,7 +669,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
+		rth->dst.expires;
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -1242,13 +1242,6 @@ skip_hashing:
 	return rt;
 }
 
-static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt_peer_genid(void)
-{
-	return atomic_read(&__rt_peer_genid);
-}
-
 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 {
 	struct inet_peer_base *base;
@@ -1262,8 +1255,6 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 	if (peer) {
 		if (!rt_set_peer(rt, peer))
 			inet_putpeer(peer);
-		else
-			rt->rt_peer_genid = rt_peer_genid();
 	}
 }
 
@@ -1323,30 +1314,6 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
-	struct rtable *rt = (struct rtable *) dst;
-	__be32 orig_gw = rt->rt_gateway;
-	struct neighbour *n;
-
-	dst_confirm(&rt->dst);
-
-	rt->rt_gateway = peer->redirect_learned.a4;
-
-	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
-	if (!n) {
-		rt->rt_gateway = orig_gw;
-		return;
-	}
-	if (!(n->nud_state & NUD_VALID)) {
-		neigh_event_send(n, NULL);
-	} else {
-		rt->rt_flags |= RTCF_REDIRECTED;
-		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
-	}
-	neigh_release(n);
-}
-
 /* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
@@ -1355,7 +1322,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	__be32 skeys[2] = { saddr, 0 };
 	int    ikeys[2] = { dev->ifindex, 0 };
-	struct inet_peer *peer;
 	struct net *net;
 
 	if (!in_dev)
@@ -1388,6 +1354,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			rthp = &rt_hash_table[hash].chain;
 
 			while ((rt = rcu_dereference(*rthp)) != NULL) {
+				struct neighbour *n;
+
 				rthp = &rt->dst.rt_next;
 
 				if (rt->rt_key_dst != daddr ||
@@ -1401,13 +1369,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rt->rt_gateway != old_gw)
 					continue;
 
-				peer = rt_get_peer_create(rt, rt->rt_dst);
-				if (peer) {
-					if (peer->redirect_learned.a4 != new_gw) {
-						peer->redirect_learned.a4 = new_gw;
-						atomic_inc(&__rt_peer_genid);
+				n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+				if (n) {
+					if (!(n->nud_state & NUD_VALID)) {
+						neigh_event_send(n, NULL);
+					} else {
+						rt->rt_gateway = new_gw;
+						rt->rt_flags |= RTCF_REDIRECTED;
+						call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 					}
-					check_peer_redir(&rt->dst, peer);
+					neigh_release(n);
 				}
 			}
 		}
@@ -1425,23 +1396,6 @@ reject_redirect:
 	;
 }
 
-static bool peer_pmtu_expired(struct inet_peer *peer)
-{
-	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
-	return orig &&
-	       time_after_eq(jiffies, orig) &&
-	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
-static bool peer_pmtu_cleaned(struct inet_peer *peer)
-{
-	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
-	return orig &&
-	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
@@ -1451,16 +1405,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if (rt->rt_flags & RTCF_REDIRECTED) {
+		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+			   rt->dst.expires) {
 			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
 						rt->rt_oif,
 						rt_genid(dev_net(dst->dev)));
 			rt_del(hash, rt);
 			ret = NULL;
-		} else if (rt_has_peer(rt)) {
-			struct inet_peer *peer = rt_peer_ptr(rt);
-			if (peer_pmtu_expired(peer))
-				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1604,50 +1555,17 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
-{
-	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
-
-	if (!expires)
-		return;
-	if (time_before(jiffies, expires)) {
-		u32 orig_dst_mtu = dst_mtu(dst);
-		if (peer->pmtu_learned < orig_dst_mtu) {
-			if (!peer->pmtu_orig)
-				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
-			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
-		}
-	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
-		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
-}
-
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 	struct rtable *rt = (struct rtable *) dst;
-	struct inet_peer *peer;
 
 	dst_confirm(dst);
 
-	peer = rt_get_peer_create(rt, rt->rt_dst);
-	if (peer) {
-		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
-
-		if (mtu < ip_rt_min_pmtu)
-			mtu = ip_rt_min_pmtu;
-		if (!pmtu_expires || mtu < peer->pmtu_learned) {
-
-			pmtu_expires = jiffies + ip_rt_mtu_expires;
-			if (!pmtu_expires)
-				pmtu_expires = 1UL;
-
-			peer->pmtu_learned = mtu;
-			peer->pmtu_expires = pmtu_expires;
+	if (mtu < ip_rt_min_pmtu)
+		mtu = ip_rt_min_pmtu;
 
-			atomic_inc(&__rt_peer_genid);
-			rt->rt_peer_genid = rt_peer_genid();
-		}
-		check_peer_pmtu(dst, peer);
-	}
+	rt->rt_pmtu = mtu;
+	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1679,30 +1597,12 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
-static void ipv4_validate_peer(struct rtable *rt)
-{
-	if (rt->rt_peer_genid != rt_peer_genid()) {
-		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
-
-		if (peer) {
-			check_peer_pmtu(&rt->dst, peer);
-
-			if (peer->redirect_learned.a4 &&
-			    peer->redirect_learned.a4 != rt->rt_gateway)
-				check_peer_redir(&rt->dst, peer);
-		}
-
-		rt->rt_peer_genid = rt_peer_genid();
-	}
-}
-
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
 	if (rt_is_expired(rt))
 		return NULL;
-	ipv4_validate_peer(rt);
 	return dst;
 }
 
@@ -1728,11 +1628,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt && rt_has_peer(rt)) {
-		struct inet_peer *peer = rt_peer_ptr(rt);
-		if (peer_pmtu_cleaned(peer))
-			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
-	}
+	if (rt)
+		dst_set_expires(&rt->dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1812,7 +1709,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 static unsigned int ipv4_mtu(const struct dst_entry *dst)
 {
 	const struct rtable *rt = (const struct rtable *) dst;
-	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+	unsigned int mtu = rt->rt_pmtu;
+
+	if (mtu && time_after_eq(jiffies, rt->dst.expires))
+		mtu = 0;
+
+	if (!mtu)
+		mtu = dst_metric_raw(dst, RTAX_MTU);
 
 	if (mtu && rt_is_output_route(rt))
 		return mtu;
@@ -1843,19 +1746,10 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	peer = inet_getpeer_v4(base, rt->rt_dst, 0);
 	if (peer) {
 		__rt_set_peer(rt, peer);
-		rt->rt_peer_genid = rt_peer_genid();
 		if (inet_metrics_new(peer))
 			memcpy(peer->metrics, fi->fib_metrics,
 			       sizeof(u32) * RTAX_MAX);
 		dst_init_metrics(&rt->dst, peer->metrics, false);
-
-		check_peer_pmtu(&rt->dst, peer);
-
-		if (peer->redirect_learned.a4 &&
-		    peer->redirect_learned.a4 != rt->rt_gateway) {
-			rt->rt_gateway = peer->redirect_learned.a4;
-			rt->rt_flags |= RTCF_REDIRECTED;
-		}
 	} else {
 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 			rt->fi = fi;
@@ -1955,8 +1849,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_mark    = skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
 	rth->fi = NULL;
 	if (our) {
@@ -2081,8 +1975,8 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
 	rth->rt_mark    = skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, &res->table->tb_peers);
 	rth->fi = NULL;
 
@@ -2260,8 +2154,8 @@ local_input:
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_mark    = skb->mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, net->ipv4.peers);
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
@@ -2337,7 +2231,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->rt_mark == skb->mark &&
 		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
 			if (noref) {
 				dst_use_noref(&rth->dst, jiffies);
 				skb_dst_set_noref(skb, &rth->dst);
@@ -2459,8 +2352,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
 	rth->rt_mark    = fl4->flowi4_mark;
+	rth->rt_pmtu	= 0;
 	rth->rt_gateway = fl4->daddr;
-	rth->rt_peer_genid = 0;
 	rt_init_peer(rth, (res->table ?
 			   &res->table->tb_peers :
 			   dev_net(dev_out)->ipv4.peers));
@@ -2717,7 +2610,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
 		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			ipv4_validate_peer(rth);
 			dst_use(&rth->dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
@@ -2794,6 +2686,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
 		rt->rt_mark = ort->rt_mark;
+		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
@@ -2896,13 +2789,13 @@ static int rt_fill_info(struct net *net,
 		const struct inet_peer *peer = rt_peer_ptr(rt);
 		inet_peer_refcheck(peer);
 		id = atomic_read(&peer->ip_id_count) & 0xffff;
-		expires = ACCESS_ONCE(peer->pmtu_expires);
-		if (expires) {
-			if (time_before(jiffies, expires))
-				expires -= jiffies;
-			else
-				expires = 0;
-		}
+	}
+	expires = rt->dst.expires;
+	if (expires) {
+		if (time_before(jiffies, expires))
+			expires -= jiffies;
+		else
+			expires = 0;
 	}
 
 	if (rt_is_input_route(rt)) {
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0bca7f..951bcf35b21c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,6 +100,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
+	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 710ab6c03122cf464510f8c86eb0a179e80b2d61 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 07:02:09 -0700
Subject: ipv4: Enforce max MTU metric at route insertion time.

Rather than at every struct rtable creation.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 ++
 net/ipv4/route.c         | 7 +------
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ae301c897a19..d71bfbdc0bf4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 				val = nla_get_u32(nla);
 				if (type == RTAX_ADVMSS && val > 65535 - 40)
 					val = 65535 - 40;
+				if (type == RTAX_MTU && val > 65535 - 15)
+					val = 65535 - 15;
 				fi->fib_metrics[type - 1] = val;
 			}
 		}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 677d65253e4c..1678b575165b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1763,21 +1763,16 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 			   const struct fib_result *res,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
-	struct dst_entry *dst = &rt->dst;
-
 	if (fi) {
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
 		rt_init_metrics(rt, fl4, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
 	}
 
-	if (dst_mtu(dst) > IP_MAX_MTU)
-		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
-
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	set_class_tag(rt, fib_rules_tclass(res));
-- 
cgit v1.2.3


From 2db2d67e4cf6100249bad575d9c13c16fd7b06dc Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 07:03:43 -0700
Subject: ipv4: Kill dst_copy_metrics() call from ipv4_blackhole_route().

Blackhole routes have a COW metrics operation that returns NULL
always, therefore this dst_copy_metrics() call did absolutely
nothing.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1678b575165b..a967df54a423 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2668,7 +2668,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		dst_copy_metrics(new, &ort->dst);
 
 		new->dev = ort->dst.dev;
 		if (new->dev)
-- 
cgit v1.2.3


From 312487313d4f7177cb751830e3d9c218e42ed59e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 07:08:18 -0700
Subject: ipv4: Calling ->cow_metrics() now is a bug.

Nothing every writes to ipv4 metrics any longer.

PMTU is stored in rt->rt_pmtu.

Dynamic TCP metrics are stored in a special TCP metrics cache,
completely outside of the routes.

Therefore ->cow_metrics() can simply nothing more than a WARN_ON
trigger so we can catch anyone who tries to add new writes to
ipv4 route metrics.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 30 ++----------------------------
 1 file changed, 2 insertions(+), 28 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a967df54a423..9cc00f8a6ee5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -158,34 +158,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
-	struct rtable *rt = (struct rtable *) dst;
-	struct inet_peer *peer;
-	u32 *p = NULL;
-
-	peer = rt_get_peer_create(rt, rt->rt_dst);
-	if (peer) {
-		u32 *old_p = __DST_METRICS_PTR(old);
-		unsigned long prev, new;
-
-		p = peer->metrics;
-		if (inet_metrics_new(peer))
-			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
-		new = (unsigned long) p;
-		prev = cmpxchg(&dst->_metrics, old, new);
-
-		if (prev != old) {
-			p = __DST_METRICS_PTR(prev);
-			if (prev & DST_METRICS_READ_ONLY)
-				p = NULL;
-		} else {
-			if (rt->fi) {
-				fib_info_put(rt->fi);
-				rt->fi = NULL;
-			}
-		}
-	}
-	return p;
+	WARN_ON(1);
+	return NULL;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
-- 
cgit v1.2.3


From f185071ddf799e194ba015d040d3d49cdbfa7e48 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 07:26:01 -0700
Subject: ipv4: Remove inetpeer from routes.

No longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     | 57 ----------------------------------------------
 net/ipv4/route.c        | 60 +++++--------------------------------------------
 net/ipv4/xfrm4_policy.c |  7 ------
 3 files changed, 6 insertions(+), 118 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index c27449466d18..52362368af09 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -40,7 +40,6 @@
 #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 struct fib_nh;
-struct inet_peer;
 struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
@@ -66,44 +65,9 @@ struct rtable {
 
 	/* Miscellaneous cached information */
 	u32			rt_pmtu;
-	unsigned long		_peer; /* long-living peer info */
 	struct fib_info		*fi; /* for client ref to shared metrics */
 };
 
-static inline struct inet_peer *rt_peer_ptr(struct rtable *rt)
-{
-	return inetpeer_ptr(rt->_peer);
-}
-
-static inline bool rt_has_peer(struct rtable *rt)
-{
-	return inetpeer_ptr_is_peer(rt->_peer);
-}
-
-static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
-	__inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
-	return inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base)
-{
-	inetpeer_init_ptr(&rt->_peer, base);
-}
-
-static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort)
-{
-	rt->_peer = ort->_peer;
-	if (rt_has_peer(ort)) {
-		struct inet_peer *peer = rt_peer_ptr(ort);
-		atomic_inc(&peer->refcnt);
-	}
-}
-
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
 	return rt->rt_route_iif != 0;
@@ -326,27 +290,6 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 	return rt;
 }
 
-extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
-
-static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
-{
-	if (rt_has_peer(rt))
-		return rt_peer_ptr(rt);
-
-	rt_bind_peer(rt, daddr, create);
-	return (rt_has_peer(rt) ? rt_peer_ptr(rt) : NULL);
-}
-
-static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
-{
-	return __rt_get_peer(rt, daddr, 0);
-}
-
-static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr)
-{
-	return __rt_get_peer(rt, daddr, 1);
-}
-
 static inline int inet_iif(const struct sk_buff *skb)
 {
 	return skb_rtable(skb)->rt_iif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9cc00f8a6ee5..95bfa1ba5b28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -889,7 +889,6 @@ static void rt_cache_invalidate(struct net *net)
 
 	get_random_bytes(&shuffle, sizeof(shuffle));
 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-	inetpeer_invalidate_family(AF_INET);
 }
 
 /*
@@ -1216,22 +1215,6 @@ skip_hashing:
 	return rt;
 }
 
-void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
-{
-	struct inet_peer_base *base;
-	struct inet_peer *peer;
-
-	base = inetpeer_base_ptr(rt->_peer);
-	if (!base)
-		return;
-
-	peer = inet_getpeer_v4(base, daddr, create);
-	if (peer) {
-		if (!rt_set_peer(rt, peer))
-			inet_putpeer(peer);
-	}
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions.  However
  * we still can generate some output.
@@ -1588,10 +1571,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		fib_info_put(rt->fi);
 		rt->fi = NULL;
 	}
-	if (rt_has_peer(rt)) {
-		struct inet_peer *peer = rt_peer_ptr(rt);
-		inet_putpeer(peer);
-	}
 }
 
 
@@ -1711,26 +1690,11 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 			    struct fib_info *fi)
 {
-	struct inet_peer_base *base;
-	struct inet_peer *peer;
-
-	base = inetpeer_base_ptr(rt->_peer);
-	BUG_ON(!base);
-
-	peer = inet_getpeer_v4(base, rt->rt_dst, 0);
-	if (peer) {
-		__rt_set_peer(rt, peer);
-		if (inet_metrics_new(peer))
-			memcpy(peer->metrics, fi->fib_metrics,
-			       sizeof(u32) * RTAX_MAX);
-		dst_init_metrics(&rt->dst, peer->metrics, false);
-	} else {
-		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
-			rt->fi = fi;
-			atomic_inc(&fi->fib_clntref);
-		}
-		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+		rt->fi = fi;
+		atomic_inc(&fi->fib_clntref);
 	}
+	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
@@ -1820,7 +1784,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
 	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -1946,7 +1909,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rt_init_peer(rth, &res->table->tb_peers);
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -2125,7 +2087,6 @@ local_input:
 	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
-	rt_init_peer(rth, net->ipv4.peers);
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -2323,9 +2284,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_mark    = fl4->flowi4_mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = fl4->daddr;
-	rt_init_peer(rth, (res->table ?
-			   &res->table->tb_peers :
-			   dev_net(dev_out)->ipv4.peers));
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2662,7 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_dst = ort->rt_dst;
 		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
-		rt_transfer_peer(rt, ort);
 		rt->fi = ort->fi;
 		if (rt->fi)
 			atomic_inc(&rt->fi->fib_clntref);
@@ -2700,7 +2657,7 @@ static int rt_fill_info(struct net *net,
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
-	u32 id = 0, error;
+	u32 error;
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
@@ -2753,11 +2710,6 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 
 	error = rt->dst.error;
-	if (rt_has_peer(rt)) {
-		const struct inet_peer *peer = rt_peer_ptr(rt);
-		inet_peer_refcheck(peer);
-		id = atomic_read(&peer->ip_id_count) & 0xffff;
-	}
 	expires = rt->dst.expires;
 	if (expires) {
 		if (time_before(jiffies, expires))
@@ -2792,7 +2744,7 @@ static int rt_fill_info(struct net *net,
 				goto nla_put_failure;
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, id, expires, error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 951bcf35b21c..87d3fcc302d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,8 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
 
-	rt_transfer_peer(&xdst->u.rt, rt);
-
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -210,11 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 
 	dst_destroy_metrics_generic(dst);
 
-	if (rt_has_peer(&xdst->u.rt)) {
-		struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt);
-		inet_putpeer(peer);
-	}
-
 	xfrm_dst_destroy(xdst);
 }
 
-- 
cgit v1.2.3


From 2c53040f018b6c36a46eec75b9b937aaa5f78e6d Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 10 Jul 2012 10:55:09 +0000
Subject: net: Fix (nearly-)kernel-doc comments for various functions

Fix incorrect start markers, wrapped summary lines, missing section
breaks, incorrect separators, and some name mismatches.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/9p/trans_virtio.c                  |  2 +-
 net/appletalk/ddp.c                    |  8 +++---
 net/batman-adv/bridge_loop_avoidance.c | 51 ++++++++++++++++++++++------------
 net/batman-adv/hash.h                  |  3 +-
 net/batman-adv/main.h                  |  3 +-
 net/batman-adv/types.h                 |  3 +-
 net/core/dev.c                         |  8 ++++--
 net/core/rtnetlink.c                   |  2 +-
 net/core/skbuff.c                      |  5 ++--
 net/dccp/ackvec.h                      |  7 +++--
 net/dccp/ccid.c                        |  1 +
 net/dccp/ccids/ccid3.c                 |  8 ++++--
 net/dccp/ccids/lib/loss_interval.c     |  1 +
 net/dccp/ccids/lib/packet_history.c    |  3 +-
 net/dccp/ccids/lib/tfrc_equation.c     |  2 ++
 net/dccp/dccp.h                        |  1 +
 net/dccp/feat.c                        | 10 +++++++
 net/dccp/input.c                       |  1 +
 net/dccp/options.c                     |  1 +
 net/dccp/output.c                      |  1 +
 net/ethernet/eth.c                     |  3 ++
 net/ipv4/ipmr.c                        |  4 +--
 net/ipv6/ip6_tunnel.c                  |  2 +-
 net/llc/af_llc.c                       |  2 +-
 net/llc/llc_station.c                  | 16 +++++------
 net/mac80211/mesh.c                    |  2 +-
 net/mac80211/mesh_hwmp.c               |  7 +++--
 net/mac80211/mesh_pathtbl.c            |  4 +--
 net/mac80211/mesh_plink.c              |  5 ++--
 net/mac80211/rx.c                      |  2 +-
 net/netfilter/xt_TPROXY.c              |  4 +--
 net/netlink/genetlink.c                |  2 +-
 net/rds/page.c                         |  9 +++---
 net/rxrpc/ar-output.c                  |  2 +-
 net/sunrpc/backchannel_rqst.c          |  9 +++---
 net/sunrpc/clnt.c                      |  2 +-
 net/sunrpc/xdr.c                       | 12 ++++----
 net/sunrpc/xprt.c                      |  2 +-
 net/tipc/bcast.c                       | 10 +++----
 net/tipc/bearer.c                      |  7 ++---
 net/tipc/bearer.h                      |  2 +-
 net/tipc/link.c                        | 22 +++++++--------
 net/tipc/name_table.c                  | 10 +++----
 net/tipc/port.c                        |  2 +-
 net/tipc/port.h                        |  1 +
 net/x25/x25_route.c                    |  2 +-
 46 files changed, 163 insertions(+), 103 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 2a167658bb95..35b8911b1c8e 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -212,7 +212,7 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
  * this takes a list of pages.
  * @sg: scatter/gather list to pack into
  * @start: which segment of the sg_list to start at
- * @**pdata: a list of pages to add into sg.
+ * @pdata: a list of pages to add into sg.
  * @nr_pages: number of pages to pack into the scatter/gather list
  * @data: data to pack into scatter/gather list
  * @count: amount of data to pack into the scatter/gather list
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 86852963b7f7..33475291c9c1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -129,8 +129,8 @@ found:
 
 /**
  * atalk_find_or_insert_socket - Try to find a socket matching ADDR
- * @sk - socket to insert in the list if it is not there already
- * @sat - address to search for
+ * @sk: socket to insert in the list if it is not there already
+ * @sat: address to search for
  *
  * Try to find a socket matching ADDR in the socket list, if found then return
  * it. If not, insert SK into the socket list.
@@ -1066,8 +1066,8 @@ static int atalk_release(struct socket *sock)
 
 /**
  * atalk_pick_and_bind_port - Pick a source port when one is not given
- * @sk - socket to insert into the tables
- * @sat - address to search for
+ * @sk: socket to insert into the tables
+ * @sat: address to search for
  *
  * Pick a source port when one is not given. If we can find a suitable free
  * one, we insert the socket into the tables using it.
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 49e10d91c00b..3483e4035cbe 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -162,12 +162,13 @@ static struct batadv_claim *batadv_claim_hash_find(struct batadv_priv *bat_priv,
 	return claim_tmp;
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_backbone_hash_find - looks for a claim in the hash
+ * @bat_priv: the bat priv with all the soft interface information
  * @addr: the address of the originator
  * @vid: the VLAN ID
  *
- * looks for a claim in the hash, and returns it if found
- * or NULL otherwise.
+ * Returns claim if found or NULL otherwise.
  */
 static struct batadv_backbone_gw *
 batadv_backbone_hash_find(struct batadv_priv *bat_priv,
@@ -242,12 +243,12 @@ batadv_bla_del_backbone_claims(struct batadv_backbone_gw *backbone_gw)
 	backbone_gw->crc = BATADV_BLA_CRC_INIT;
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_send_claim - sends a claim frame according to the provided info
+ * @bat_priv: the bat priv with all the soft interface information
  * @orig: the mac address to be announced within the claim
  * @vid: the VLAN ID
  * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
- *
- * sends a claim frame according to the provided info.
  */
 static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac,
 				  short vid, int claimtype)
@@ -348,7 +349,9 @@ out:
 		batadv_hardif_free_ref(primary_if);
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_get_backbone_gw
+ * @bat_priv: the bat priv with all the soft interface information
  * @orig: the mac address of the originator
  * @vid: the VLAN ID
  *
@@ -520,12 +523,12 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
 
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_add_claim - Adds a claim in the claim hash
+ * @bat_priv: the bat priv with all the soft interface information
  * @mac: the mac address of the claim
  * @vid: the VLAN ID of the frame
  * @backbone_gw: the backbone gateway which claims it
- *
- * Adds a claim in the claim hash.
  */
 static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
 				 const uint8_t *mac, const short vid,
@@ -743,7 +746,9 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
 	return 1;
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_check_claim_group
+ * @bat_priv: the bat priv with all the soft interface information
  * @hw_src: the Hardware source in the ARP Header
  * @hw_dst: the Hardware destination in the ARP Header
  * @ethhdr: pointer to the Ethernet header of the claim frame
@@ -975,7 +980,9 @@ purge_now:
 	}
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_purge_claims
+ * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the selected primary interface, may be NULL if now is set
  * @now: whether the whole hash shall be wiped now
  *
@@ -1023,7 +1030,9 @@ purge_now:
 	}
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_update_orig_address
+ * @bat_priv: the bat priv with all the soft interface information
  * @primary_if: the new selected primary_if
  * @oldif: the old primary interface, may be NULL
  *
@@ -1193,7 +1202,9 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
 	return 0;
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_check_bcast_duplist
+ * @bat_priv: the bat priv with all the soft interface information
  * @bcast_packet: originator mac address
  * @hdr_size: maximum length of the frame
  *
@@ -1297,7 +1308,9 @@ int batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig)
 }
 
 
-/* @skb: the frame to be checked
+/**
+ * batadv_bla_is_backbone_gw
+ * @skb: the frame to be checked
  * @orig_node: the orig_node of the frame
  * @hdr_size: maximum length of the frame
  *
@@ -1363,7 +1376,9 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
 		batadv_hardif_free_ref(primary_if);
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_rx
+ * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
  *
@@ -1450,7 +1465,9 @@ out:
 	return ret;
 }
 
-/* @bat_priv: the bat priv with all the soft interface information
+/**
+ * batadv_bla_tx
+ * @bat_priv: the bat priv with all the soft interface information
  * @skb: the frame to be checked
  * @vid: the VLAN ID of the frame
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 83990e318e43..977de9c75fc2 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -81,7 +81,8 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash,
 	batadv_hash_destroy(hash);
 }
 
-/*	hash_add - adds data to the hashtable
+/**
+ *	batadv_hash_add - adds data to the hashtable
  *	@hash: storage hash table
  *	@compare: callback to determine if 2 hash elements are identical
  *	@choose: callback calculating the hash index
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index b8d4ac17f001..5d8fa0757947 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -216,7 +216,8 @@ static inline int batadv_compare_eth(const void *data1, const void *data2)
 	return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0);
 }
 
-/* has_timed_out - compares current time (jiffies) and timestamp + timeout
+/**
+ * has_timed_out - compares current time (jiffies) and timestamp + timeout
  * @timestamp:		base value to compare with (in jiffies)
  * @timeout:		added to base value before comparing (in milliseconds)
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 2141c1304898..12635fd2c3d3 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -44,7 +44,8 @@ struct batadv_hard_iface {
 	struct rcu_head rcu;
 };
 
-/*	batadv_orig_node - structure for orig_list maintaining nodes of mesh
+/**
+ *	struct batadv_orig_node - structure for orig_list maintaining nodes of mesh
  *	@primary_addr: hosts primary interface address
  *	@last_seen: when last packet from this node was received
  *	@bcast_seqno_reset: time when the broadcast seqno window was reset
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c21548e5b31..5ab6f4b37c0c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1691,7 +1691,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_unlock();
 }
 
-/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
  * @dev: Network device
  * @txq: number of queues available
  *
@@ -1793,7 +1794,8 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 #endif
 
-/* netif_get_num_default_rss_queues - default number of RSS queues
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
  *
  * This routine should set an upper limit on the number of RSS queues
  * used by default by multiqueue devices.
@@ -5670,7 +5672,7 @@ int netdev_refcnt_read(const struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_refcnt_read);
 
-/*
+/**
  * netdev_wait_allrefs - wait until all references are gone.
  *
  * This is called when unregistering network devices.
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 64127eee786d..045db8ad87c8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2174,7 +2174,7 @@ skip:
 }
 
 /**
- * ndo_dflt_fdb_dump: default netdevice operation to dump an FDB table.
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
  * @nlh: netlink message header
  * @dev: netdevice
  *
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a789a807ec3..506f678e9d95 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -713,7 +713,8 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
-/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
+/**
+ *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
  *	@skb: the skb to modify
  *	@gfp_mask: allocation priority
  *
@@ -2614,7 +2615,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 EXPORT_SYMBOL(skb_find_text);
 
 /**
- * skb_append_datato_frags: - append the user data to a skb
+ * skb_append_datato_frags - append the user data to a skb
  * @sk: sock  structure
  * @skb: skb structure to be appened with user data.
  * @getfrag: call back function to be used for getting the user data
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index e2ab0627a5ff..a269aa7f7923 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -50,7 +50,8 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
 	return *cell & ~DCCPAV_MAX_RUNLEN;
 }
 
-/** struct dccp_ackvec - Ack Vector main data structure
+/**
+ * struct dccp_ackvec - Ack Vector main data structure
  *
  * This implements a fixed-size circular buffer within an array and is largely
  * based on Appendix A of RFC 4340.
@@ -76,7 +77,8 @@ struct dccp_ackvec {
 	struct list_head	av_records;
 };
 
-/** struct dccp_ackvec_record - Records information about sent Ack Vectors
+/**
+ * struct dccp_ackvec_record - Records information about sent Ack Vectors
  *
  * These list entries define the additional information which the HC-Receiver
  * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
@@ -121,6 +123,7 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
  * @len:	length of @vec
  * @nonce:	whether @vec had an ECN nonce of 0 or 1
  * @node:	FIFO - arranged in descending order of ack_ackno
+ *
  * This structure is used by CCIDs to access Ack Vectors in a received skb.
  */
 struct dccp_ackvec_parsed {
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 48b585a5cba7..597557254ddb 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -46,6 +46,7 @@ bool ccid_support_check(u8 const *ccid_array, u8 array_len)
  * ccid_get_builtin_ccids  -  Populate a list of built-in CCIDs
  * @ccid_array: pointer to copy into
  * @array_len: value to return length into
+ *
  * This function allocates memory - caller must see that it is freed after use.
  */
 int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 8c67bedf85b0..d65e98798eca 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -113,6 +113,7 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
 /**
  * ccid3_hc_tx_update_x  -  Update allowed sending rate X
  * @stamp: most recent time if available - can be left NULL.
+ *
  * This function tracks draft rfc3448bis, check there for latest details.
  *
  * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
@@ -161,9 +162,11 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
 	}
 }
 
-/*
- *	Track the mean packet size `s' (cf. RFC 4342, 5.3 and  RFC 3448, 4.1)
+/**
+ *	ccid3_hc_tx_update_s - Track the mean packet size `s'
  *	@len: DCCP packet payload size in bytes
+ *
+ *	cf. RFC 4342, 5.3 and  RFC 3448, 4.1
  */
 static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
 {
@@ -270,6 +273,7 @@ out:
 /**
  * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
  * @skb: next packet candidate to send on @sk
+ *
  * This function uses the convention of ccid_packet_dequeue_eval() and
  * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
  */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 497723c4d4bb..57f9fd78c4df 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -133,6 +133,7 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
  * @rh:		   Receive history containing a fresh loss event
  * @calc_first_li: Caller-dependent routine to compute length of first interval
  * @sk:		   Used by @calc_first_li in caller-specific way (subtyping)
+ *
  * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
  */
 int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index de8fe294bf0b..08df7a3acb3d 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -315,6 +315,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h)
  *  @ndp:	    The NDP count belonging to @skb
  *  @calc_first_li: Caller-dependent computation of first loss interval in @lh
  *  @sk:	    Used by @calc_first_li (see tfrc_lh_interval_add)
+ *
  *  Chooses action according to pending loss, updates LI database when a new
  *  loss was detected, and does required post-processing. Returns 1 when caller
  *  should send feedback, 0 otherwise.
@@ -387,7 +388,7 @@ static inline struct tfrc_rx_hist_entry *
 }
 
 /**
- * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
  */
 static inline struct tfrc_rx_hist_entry *
 			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index a052a4377e26..88ef98285bec 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -611,6 +611,7 @@ static inline u32 tfrc_binsearch(u32 fval, u8 small)
  * @s: packet size          in bytes
  * @R: RTT                  scaled by 1000000   (i.e., microseconds)
  * @p: loss ratio estimate  scaled by 1000000
+ *
  * Returns X_calc           in bytes per second (not scaled).
  */
 u32 tfrc_calc_x(u16 s, u32 R, u32 p)
@@ -659,6 +660,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 /**
  *  tfrc_calc_x_reverse_lookup  -  try to find p given f(p)
  *  @fvalue: function value to match, scaled by 1000000
+ *
  *  Returns closest match for p, also scaled by 1000000
  */
 u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 9040be049d8c..708e75bf623d 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -352,6 +352,7 @@ static inline int dccp_bad_service_code(const struct sock *sk,
  * @dccpd_opt_len: total length of all options (5.8) in the packet
  * @dccpd_seq: sequence number
  * @dccpd_ack_seq: acknowledgment number subheader field value
+ *
  * This is used for transmission as well as for reception.
  */
 struct dccp_skb_cb {
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 78a2ad70e1b0..9733ddbc96cb 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -350,6 +350,7 @@ static int __dccp_feat_activate(struct sock *sk, const int idx,
  * @feat_num: feature to activate, one of %dccp_feature_numbers
  * @local: whether local (1) or remote (0) @feat_num is meant
  * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ *
  * For general use this function is preferable over __dccp_feat_activate().
  */
 static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
@@ -446,6 +447,7 @@ static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
  * @head:  list to add to
  * @feat:  feature number
  * @local: whether the local (1) or remote feature with number @feat is meant
+ *
  * This is the only constructor and serves to ensure the above invariants.
  */
 static struct dccp_feat_entry *
@@ -504,6 +506,7 @@ static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
  * @feat: one of %dccp_feature_numbers
  * @local: whether local (1) or remote (0) @feat_num is being confirmed
  * @fval: pointer to NN/SP value to be inserted or NULL
+ *
  * Returns 0 on success, a Reset code for further processing otherwise.
  */
 static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
@@ -691,6 +694,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
  * @feat: an NN feature from %dccp_feature_numbers
  * @mandatory: use Mandatory option if 1
  * @nn_val: value to register (restricted to 4 bytes)
+ *
  * Note that NN features are local by definition (RFC 4340, 6.3.2).
  */
 static int __feat_register_nn(struct list_head *fn, u8 feat,
@@ -760,6 +764,7 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
  * dccp_feat_nn_get  -  Query current/pending value of NN feature
  * @sk: DCCP socket of an established connection
  * @feat: NN feature number from %dccp_feature_numbers
+ *
  * For a known NN feature, returns value currently being negotiated, or
  * current (confirmed) value if no negotiation is going on.
  */
@@ -790,6 +795,7 @@ EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
  * @sk: DCCP socket of an established connection
  * @feat: NN feature number from %dccp_feature_numbers
  * @nn_val: the new value to use
+ *
  * This function is used to communicate NN updates out-of-band.
  */
 int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
@@ -930,6 +936,7 @@ static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
  * @fn: feature-negotiation list to update
  * @id: CCID number to track
  * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ *
  * This function needs to be called after registering all other features.
  */
 static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
@@ -953,6 +960,7 @@ static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
 /**
  * dccp_feat_finalise_settings  -  Finalise settings before starting negotiation
  * @dp: client or listening socket (settings will be inherited)
+ *
  * This is called after all registrations (socket initialisation, sysctls, and
  * sockopt calls), and before sending the first packet containing Change options
  * (ie. client-Request or server-Response), to ensure internal consistency.
@@ -1284,6 +1292,7 @@ confirmation_failed:
  * @feat:	NN number, one of %dccp_feature_numbers
  * @val:	NN value
  * @len:	length of @val in bytes
+ *
  * This function combines the functionality of change_recv/confirm_recv, with
  * the following differences (reset codes are the same):
  *    - cleanup after receiving the Confirm;
@@ -1379,6 +1388,7 @@ fast_path_failed:
  * @feat: one of %dccp_feature_numbers
  * @val: value contents of @opt
  * @len: length of @val in bytes
+ *
  * Returns 0 on success, a Reset code for ending the connection otherwise.
  */
 int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
diff --git a/net/dccp/input.c b/net/dccp/input.c
index bc93a333931e..14cdafad7a90 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -710,6 +710,7 @@ EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
 /**
  *  dccp_sample_rtt  -  Validate and finalise computation of RTT sample
  *  @delta:	number of microseconds between packet and acknowledgment
+ *
  *  The routine is kept generic to work in different contexts. It should be
  *  called immediately when the ACK used for the RTT sample arrives.
  */
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 68fa6b7a3e01..a58e0b634050 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -527,6 +527,7 @@ int dccp_insert_option_mandatory(struct sk_buff *skb)
  * @val: NN value or SP array (preferred element first) to copy
  * @len: true length of @val in bytes (excluding first element repetition)
  * @repeat_first: whether to copy the first element of @val twice
+ *
  * The last argument is used to construct Confirm options, where the preferred
  * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
  * lists are kept such that the preferred entry is always first, so we only need
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 787367308797..d17fc90a74b6 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -214,6 +214,7 @@ void dccp_write_space(struct sock *sk)
  * dccp_wait_for_ccid  -  Await CCID send permission
  * @sk:    socket to wait for
  * @delay: timeout in jiffies
+ *
  * This is used by CCIDs which need to delay the send time in process context.
  */
 static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index db6a6c17d790..4efad533e5f6 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -232,6 +232,7 @@ EXPORT_SYMBOL(eth_header_parse);
  * @neigh: source neighbour
  * @hh: destination cache entry
  * @type: Ethernet type field
+ *
  * Create an Ethernet header template from the neighbour.
  */
 int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
@@ -274,6 +275,7 @@ EXPORT_SYMBOL(eth_header_cache_update);
  * eth_mac_addr - set new Ethernet hardware address
  * @dev: network device
  * @p: socket address
+ *
  * Change hardware address of device.
  *
  * This doesn't change hardware matching, so needs to be overridden
@@ -331,6 +333,7 @@ const struct header_ops eth_header_ops ____cacheline_aligned = {
 /**
  * ether_setup - setup Ethernet network device
  * @dev: network device
+ *
  * Fill in the fields of the device structure with Ethernet-generic values.
  */
 void ether_setup(struct net_device *dev)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b4ac39f11d19..5716c6b808d6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -524,8 +524,8 @@ failure:
 }
 #endif
 
-/*
- *	Delete a VIF entry
+/**
+ *	vif_delete - Delete a VIF entry
  *	@notify: Set to 1, if the caller is a notifier_call
  */
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 04a3cba2c123..6af3fcfdcbbd 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -252,7 +252,7 @@ static void ip6_dev_free(struct net_device *dev)
 }
 
 /**
- * ip6_tnl_create() - create a new tunnel
+ * ip6_tnl_create - create a new tunnel
  *   @p: tunnel parameters
  *   @pt: pointer to new tunnel
  *
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index fe5453c3e719..f6fe4d400502 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1024,7 +1024,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
  *	@sock: Socket to set options on.
  *	@level: Socket level user is requesting operations on.
  *	@optname: Operation name.
- *	@optval User provided operation data.
+ *	@optval: User provided operation data.
  *	@optlen: Length of optval.
  *
  *	Set various connection specific parameters.
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index cf4aea3ba30f..39a8d8924b9c 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -30,12 +30,12 @@
  *
  * SAP and connection resource manager, one per adapter.
  *
- * @state - state of station
- * @xid_r_count - XID response PDU counter
- * @mac_sa - MAC source address
- * @sap_list - list of related SAPs
- * @ev_q - events entering state mach.
- * @mac_pdu_q - PDUs ready to send to MAC
+ * @state: state of station
+ * @xid_r_count: XID response PDU counter
+ * @mac_sa: MAC source address
+ * @sap_list: list of related SAPs
+ * @ev_q: events entering state mach.
+ * @mac_pdu_q: PDUs ready to send to MAC
  */
 struct llc_station {
 	u8			    state;
@@ -646,7 +646,7 @@ static void llc_station_service_events(void)
 }
 
 /**
- *	llc_station_state_process: queue event and try to process queue.
+ *	llc_station_state_process - queue event and try to process queue.
  *	@skb: Address of the event
  *
  *	Queues an event (on the station event queue) for handling by the
@@ -672,7 +672,7 @@ static void llc_station_ack_tmr_cb(unsigned long timeout_data)
 	}
 }
 
-/*
+/**
  *	llc_station_rcv - send received pdu to the station state machine
  *	@skb: received frame.
  *
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 764593d65fc3..6fac18c0423f 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -133,7 +133,7 @@ bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
 }
 
 /**
- * mesh_accept_plinks_update: update accepting_plink in local mesh beacons
+ * mesh_accept_plinks_update - update accepting_plink in local mesh beacons
  *
  * @sdata: mesh interface in which mesh beacons are going to be updated
  */
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index fb7b6a11d0ba..494bc39f61a4 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1054,12 +1054,15 @@ enddiscovery:
 	kfree(preq_node);
 }
 
-/* mesh_nexthop_resolve - lookup next hop for given skb and start path
- * discovery if no forwarding information is found.
+/**
+ * mesh_nexthop_resolve - lookup next hop; conditionally start path discovery
  *
  * @skb: 802.11 frame to be sent
  * @sdata: network subif the frame will be sent through
  *
+ * Lookup next hop for given skb and start path discovery if no
+ * forwarding information is found.
+ *
  * Returns: 0 if the next hop was found and -ENOENT if the frame was queued.
  * skb is freeed here if no mpath could be allocated.
  */
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index c9ae931dd693..075bc535c601 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -778,7 +778,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
 /**
  * mesh_path_flush_by_nexthop - Deletes mesh paths if their next hop matches
  *
- * @sta - mesh peer to match
+ * @sta: mesh peer to match
  *
  * RCU notes: this function is called when a mesh plink transitions from
  * PLINK_ESTAB to any other state, since PLINK_ESTAB state is the only one that
@@ -833,7 +833,7 @@ static void table_flush_by_iface(struct mesh_table *tbl,
  *
  * This function deletes both mesh paths as well as mesh portal paths.
  *
- * @sdata - interface data to match
+ * @sdata: interface data to match
  *
  */
 void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index a1dbd1540276..9ad74dd87a7b 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -99,7 +99,7 @@ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata,
 	return sta;
 }
 
-/*
+/**
  * mesh_set_ht_prot_mode - set correct HT protection mode
  *
  * Section 9.23.3.5 of IEEE 80211-2012 describes the protection rules for HT
@@ -320,7 +320,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
-/* mesh_peer_init - initialize new mesh peer and return resulting sta_info
+/**
+ * mesh_peer_init - initialize new mesh peer and return resulting sta_info
  *
  * @sdata: local meshif
  * @addr: peer's address
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 839cac8fab57..67edd69e8421 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -94,7 +94,7 @@ ieee80211_rx_radiotap_len(struct ieee80211_local *local,
 	return len;
 }
 
-/*
+/**
  * ieee80211_add_rx_radiotap_header - add radiotap header
  *
  * add a radiotap header containing all the fields which the hardware provided.
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 146033a86de8..d7f195388f66 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -69,7 +69,7 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 }
 
 /**
- * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
  * @skb:	The skb being processed.
  * @laddr:	IPv4 address to redirect to or zero.
  * @lport:	TCP port to redirect to or zero.
@@ -220,7 +220,7 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
 }
 
 /**
- * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
  * @skb:	The skb being processed.
  * @tproto:	Transport protocol.
  * @thoff:	Transport protocol header offset.
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 32761b53015e..62ebe3c6291c 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -504,7 +504,7 @@ EXPORT_SYMBOL(genl_unregister_family);
  * @pid: netlink pid the message is addressed to
  * @seq: sequence number (usually the one of the sender)
  * @family: generic netlink family
- * @flags netlink message flags
+ * @flags: netlink message flags
  * @cmd: generic netlink command
  *
  * Returns pointer to user specific header
diff --git a/net/rds/page.c b/net/rds/page.c
index 2499cd108421..9005a2c920ee 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -74,11 +74,12 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
 }
 EXPORT_SYMBOL_GPL(rds_page_copy_user);
 
-/*
- * Message allocation uses this to build up regions of a message.
+/**
+ * rds_page_remainder_alloc - build up regions of a message.
  *
- * @bytes - the number of bytes needed.
- * @gfp - the waiting behaviour of the allocation
+ * @scat: Scatter list for message
+ * @bytes: the number of bytes needed.
+ * @gfp: the waiting behaviour of the allocation
  *
  * @gfp is always ored with __GFP_HIGHMEM.  Callers must be prepared to
  * kmap the pages, etc.
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 16ae88762d00..e1ac183d50bb 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -242,7 +242,7 @@ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
 
 EXPORT_SYMBOL(rxrpc_kernel_send_data);
 
-/*
+/**
  * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
  * @call: The call to be aborted
  * @abort_code: The abort code to stick into the ABORT packet
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 31def68a0f6e..5a3d675d2f2f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -176,13 +176,14 @@ out_free:
 }
 EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
 
-/*
- * Destroys the backchannel preallocated structures.
+/**
+ * xprt_destroy_backchannel - Destroys the backchannel preallocated structures.
+ * @xprt:	the transport holding the preallocated strucures
+ * @max_reqs	the maximum number of preallocated structures to destroy
+ *
  * Since these structures may have been allocated by multiple calls
  * to xprt_setup_backchannel, we only destroy up to the maximum number
  * of reqs specified by the caller.
- * @xprt:	the transport holding the preallocated strucures
- * @max_reqs	the maximum number of preallocated structures to destroy
  */
 void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
 {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f56f045778ae..00eb859b7de5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -385,7 +385,7 @@ out_no_rpciod:
 	return ERR_PTR(err);
 }
 
-/*
+/**
  * rpc_create - create an RPC client and transport with one call
  * @args: rpc_clnt create argument structure
  *
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index fddcccfcdf76..0cf165580d8d 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -180,7 +180,9 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages);
 
 /*
  * Helper routines for doing 'memmove' like operations on a struct xdr_buf
- *
+ */
+
+/**
  * _shift_data_right_pages
  * @pages: vector of pages containing both the source and dest memory area.
  * @pgto_base: page vector address of destination
@@ -242,7 +244,7 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
 	} while ((len -= copy) != 0);
 }
 
-/*
+/**
  * _copy_to_pages
  * @pages: array of pages
  * @pgbase: page vector address of destination
@@ -286,7 +288,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
 	flush_dcache_page(*pgto);
 }
 
-/*
+/**
  * _copy_from_pages
  * @p: pointer to destination
  * @pages: array of pages
@@ -326,7 +328,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 }
 EXPORT_SYMBOL_GPL(_copy_from_pages);
 
-/*
+/**
  * xdr_shrink_bufhead
  * @buf: xdr_buf
  * @len: bytes to remove from buf->head[0]
@@ -399,7 +401,7 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 		buf->len = buf->buflen;
 }
 
-/*
+/**
  * xdr_shrink_pagelen
  * @buf: xdr_buf
  * @len: bytes to remove from buf->pages
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3c83035cdaa9..a5a402a7d21f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -531,7 +531,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task)
 }
 EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 
-/*
+/**
  * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
  * @task: task whose timeout is to be set
  *
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 2625f5ebe3e8..d9df34fbd7ca 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -162,7 +162,7 @@ static void bclink_update_last_sent(struct tipc_node *node, u32 seqno)
 }
 
 
-/*
+/**
  * tipc_bclink_retransmit_to - get most recent node to request retransmission
  *
  * Called with bc_lock locked
@@ -270,7 +270,7 @@ exit:
 	spin_unlock_bh(&bc_lock);
 }
 
-/*
+/**
  * tipc_bclink_update_link_state - update broadcast link state
  *
  * tipc_net_lock and node lock set
@@ -330,7 +330,7 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, u32 last_sent)
 	}
 }
 
-/*
+/**
  * bclink_peek_nack - monitor retransmission requests sent by other nodes
  *
  * Delay any upcoming NACK by this node if another node has already
@@ -381,7 +381,7 @@ exit:
 	return res;
 }
 
-/*
+/**
  * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet
  *
  * Called with both sending node's lock and bc_lock taken.
@@ -406,7 +406,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
 	}
 }
 
-/*
+/**
  * tipc_bclink_recv_pkt - receive a broadcast packet, and deliver upwards
  *
  * tipc_net_lock is read_locked, no other locks set
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a297e3a2e3e7..86b703f55092 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -172,8 +172,8 @@ struct sk_buff *tipc_media_get_names(void)
 
 /**
  * bearer_name_validate - validate & (optionally) deconstruct bearer name
- * @name - ptr to bearer name string
- * @name_parts - ptr to area for bearer name components (or NULL if not needed)
+ * @name: ptr to bearer name string
+ * @name_parts: ptr to area for bearer name components (or NULL if not needed)
  *
  * Returns 1 if bearer name is valid, otherwise 0.
  */
@@ -520,8 +520,7 @@ exit:
 }
 
 /**
- * tipc_block_bearer(): Block the bearer with the given name,
- *                      and reset all its links
+ * tipc_block_bearer - Block the bearer with the given name, and reset all its links
  */
 int tipc_block_bearer(const char *name)
 {
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index e3b2be37fb31..4680de118aff 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -57,7 +57,7 @@
  */
 #define TIPC_MEDIA_TYPE_ETH	1
 
-/*
+/**
  * struct tipc_media_addr - destination address used by TIPC bearers
  * @value: address info (format defined by media)
  * @media_id: TIPC media type identifier
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 7a614f43549d..f6bf4830ddfe 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -153,8 +153,8 @@ int tipc_link_is_active(struct tipc_link *l_ptr)
 
 /**
  * link_name_validate - validate & (optionally) deconstruct tipc_link name
- * @name - ptr to link name string
- * @name_parts - ptr to area for link name components (or NULL if not needed)
+ * @name: ptr to link name string
+ * @name_parts: ptr to area for link name components (or NULL if not needed)
  *
  * Returns 1 if link name is valid, otherwise 0.
  */
@@ -944,7 +944,7 @@ int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector)
 	return res;
 }
 
-/*
+/**
  * tipc_link_send_names - send name table entries to new neighbor
  *
  * Send routine for bulk delivery of name table messages when contact
@@ -1787,7 +1787,7 @@ cont:
 	read_unlock_bh(&tipc_net_lock);
 }
 
-/*
+/**
  * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue
  *
  * Returns increase in queue length (i.e. 0 or 1)
@@ -2635,8 +2635,8 @@ void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window)
 
 /**
  * link_find_link - locate link by name
- * @name - ptr to link name string
- * @node - ptr to area to be filled with ptr to associated node
+ * @name: ptr to link name string
+ * @node: ptr to area to be filled with ptr to associated node
  *
  * Caller must hold 'tipc_net_lock' to ensure node and bearer are not deleted;
  * this also prevents link deletion.
@@ -2671,8 +2671,8 @@ static struct tipc_link *link_find_link(const char *name,
 /**
  * link_value_is_valid -- validate proposed link tolerance/priority/window
  *
- * @cmd - value type (TIPC_CMD_SET_LINK_*)
- * @new_value - the new value
+ * @cmd: value type (TIPC_CMD_SET_LINK_*)
+ * @new_value: the new value
  *
  * Returns 1 if value is within range, 0 if not.
  */
@@ -2693,9 +2693,9 @@ static int link_value_is_valid(u16 cmd, u32 new_value)
 
 /**
  * link_cmd_set_value - change priority/tolerance/window for link/bearer/media
- * @name - ptr to link, bearer, or media name
- * @new_value - new value of link, bearer, or media setting
- * @cmd - which link, bearer, or media attribute to set (TIPC_CMD_SET_LINK_*)
+ * @name: ptr to link, bearer, or media name
+ * @new_value: new value of link, bearer, or media setting
+ * @cmd: which link, bearer, or media attribute to set (TIPC_CMD_SET_LINK_*)
  *
  * Caller must hold 'tipc_net_lock' to ensure link/bearer/media is not deleted.
  *
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 010f24a59da2..13fb9d559ea5 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -191,7 +191,7 @@ static void nameseq_delete_empty(struct name_seq *seq)
 	}
 }
 
-/*
+/**
  * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
  *
  * Very time-critical, so binary searches through sub-sequence array.
@@ -435,7 +435,7 @@ found:
 }
 
 /**
- * tipc_nameseq_subscribe: attach a subscription, and issue
+ * tipc_nameseq_subscribe - attach a subscription, and issue
  * the prescribed number of events if there is any sub-
  * sequence overlapping with the requested sequence
  */
@@ -520,7 +520,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
 	return publ;
 }
 
-/*
+/**
  * tipc_nametbl_translate - perform name translation
  *
  * On entry, 'destnode' is the search domain used during translation.
@@ -751,7 +751,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
 
 
 /**
- * subseq_list: print specified sub-sequence contents into the given buffer
+ * subseq_list - print specified sub-sequence contents into the given buffer
  */
 static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
 			u32 index)
@@ -787,7 +787,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
 }
 
 /**
- * nameseq_list: print specified name sequence contents into the given buffer
+ * nameseq_list - print specified name sequence contents into the given buffer
  */
 static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
 			 u32 type, u32 lowbound, u32 upbound, u32 index)
diff --git a/net/tipc/port.c b/net/tipc/port.c
index a1e828989d7a..70bf78bd5b75 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -69,7 +69,7 @@ static u32 port_peerport(struct tipc_port *p_ptr)
 	return msg_destport(&p_ptr->phdr);
 }
 
-/*
+/**
  * tipc_port_peer_msg - verify message was sent by connected port's peer
  *
  * Handles cases where the node's network address has changed from
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 98cbec9c4532..4660e3065790 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -79,6 +79,7 @@ typedef void (*tipc_continue_event) (void *usr_handle, u32 portref);
  * struct user_port - TIPC user port (used with native API)
  * @usr_handle: user-specified field
  * @ref: object reference to associated TIPC port
+ *
  * <various callback routines>
  */
 struct user_port {
diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c
index cf6366270054..277c8d2448d6 100644
--- a/net/x25/x25_route.c
+++ b/net/x25/x25_route.c
@@ -66,7 +66,7 @@ out:
 
 /**
  * __x25_remove_route - remove route from x25_route_list
- * @rt - route to remove
+ * @rt: route to remove
  *
  * Remove route from x25_route_list. If it was there.
  * Caller must hold x25_route_list_lock.
-- 
cgit v1.2.3


From ae86b9e3846f6fc5509dee721f2bdba1db8ab96a Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 10 Jul 2012 10:55:35 +0000
Subject: net: Fix non-kernel-doc comments with kernel-doc start marker

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ceph/pagelist.c | 14 ++++----------
 net/dcb/dcbnl.c     |  3 +--
 net/ipv4/tcp.c      |  3 +--
 net/tipc/socket.c   |  5 ++---
 4 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 13cb409a7bba..665cd23020ff 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -72,8 +72,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
 }
 EXPORT_SYMBOL(ceph_pagelist_append);
 
-/**
- * Allocate enough pages for a pagelist to append the given amount
+/* Allocate enough pages for a pagelist to append the given amount
  * of data without without allocating.
  * Returns: 0 on success, -ENOMEM on error.
  */
@@ -95,9 +94,7 @@ int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
 }
 EXPORT_SYMBOL(ceph_pagelist_reserve);
 
-/**
- * Free any pages that have been preallocated.
- */
+/* Free any pages that have been preallocated. */
 int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
 {
 	while (!list_empty(&pl->free_list)) {
@@ -112,9 +109,7 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
 }
 EXPORT_SYMBOL(ceph_pagelist_free_reserve);
 
-/**
- * Create a truncation point.
- */
+/* Create a truncation point. */
 void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
 			      struct ceph_pagelist_cursor *c)
 {
@@ -124,8 +119,7 @@ void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
 }
 EXPORT_SYMBOL(ceph_pagelist_set_cursor);
 
-/**
- * Truncate a pagelist to the given point. Move extra pages to reserve.
+/* Truncate a pagelist to the given point. Move extra pages to reserve.
  * This won't sleep.
  * Returns: 0 on success,
  *          -EINVAL if the pagelist doesn't match the trunc point pagelist
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 013da86575e8..81f2bb62dea3 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -28,8 +28,7 @@
 #include <linux/module.h>
 #include <net/sock.h>
 
-/**
- * Data Center Bridging (DCB) is a collection of Ethernet enhancements
+/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
  * intended to allow network traffic with differing requirements
  * (highly reliable, no drops vs. best effort vs. low latency) to operate
  * and co-exist on Ethernet.  Current DCB features are:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29aa0c800cd0..d902da96d154 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3310,8 +3310,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
-/**
- * Each Responder maintains up to two secret values concurrently for
+/* Each Responder maintains up to two secret values concurrently for
  * efficient secret rollover.  Each secret value has 4 states:
  *
  * Generating.  (tcp_secret_generating != tcp_secret_primary)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 11a863d81421..1ebb49f3ddbe 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1699,9 +1699,8 @@ static int getsockopt(struct socket *sock,
 	return put_user(sizeof(value), ol);
 }
 
-/**
- * Protocol switches for the various types of TIPC sockets
- */
+/* Protocol switches for the various types of TIPC sockets */
+
 static const struct proto_ops msg_ops = {
 	.owner		= THIS_MODULE,
 	.family		= AF_TIPC,
-- 
cgit v1.2.3


From 2100844ca9d7055d5cddce2f8ed13af94c01f85b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 11 Jul 2012 17:18:04 -0700
Subject: tcp: Fix out of bounds access to tcpm_vals

The recent patch "tcp: Maintain dynamic metrics in local cache." introduced
an out of bounds access due to what appears to be a typo.   I believe this
change should resolve the issue by replacing the access to RTAX_CWND with
TCP_METRIC_CWND.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 1fd83d3118fe..5a38a2d5a95b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -412,7 +412,7 @@ void tcp_update_metrics(struct sock *sk)
 				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
 		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
 			val = tcp_metric_get(tm, TCP_METRIC_CWND);
-			tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1);
+			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
 		}
 	} else {
 		/* Else slow start did not finish, cwnd is non-sense,
-- 
cgit v1.2.3


From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Jul 2012 05:50:31 +0000
Subject: tcp: TCP Small Queues

This introduce TSQ (TCP Small Queues)

TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.

sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.

TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.

As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.

This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.

Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.

Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)

I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.

As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklest per cpu for performance reasons.

If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.

[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
  but some drivers call it in their start_xmit() handler.
  These drivers should at least use BQL, or else a single TCP
  session can still fill the whole NIC TX ring, since TSQ will
  have no effect.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  14 +++
 include/linux/tcp.h                    |   9 ++
 include/net/sock.h                     |   2 +
 include/net/tcp.h                      |   4 +
 net/core/sock.c                        |   4 +
 net/ipv4/sysctl_net_ipv4.c             |   7 ++
 net/ipv4/tcp.c                         |   6 ++
 net/ipv4/tcp_ipv4.c                    |   1 +
 net/ipv4/tcp_minisocks.c               |   1 +
 net/ipv4/tcp_output.c                  | 154 ++++++++++++++++++++++++++++++++-
 net/ipv6/tcp_ipv6.c                    |   1 +
 11 files changed, 202 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79e9b05..e20c17a7d34e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_limit_output_bytes - INTEGER
+	Controls TCP Small Queue limit per tcp socket.
+	TCP bulk sender tends to increase packets in flight until it
+	gets losses notifications. With SNDBUF autotuning, this can
+	result in a large amount of packets queued in qdisc/device
+	on the local machine, hurting latency of other flows, for
+	typical pfifo_fast qdiscs.
+	tcp_limit_output_bytes limits the number of bytes on qdisc
+	or device to reduce artificial RTT/cwnd and reduce bufferbloat.
+	Note: For GSO/TSO enabled flows, we try to have at least two
+	packets in flight. Reducing tcp_limit_output_bytes might also
+	reduce the size of individual GSO packet (64KB being the max)
+	Default: 131072
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2de9cf46f9fc..1888169e07c7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -339,6 +339,9 @@ struct tcp_sock {
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 
+	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	unsigned long	tsq_flags;
+
 	/* Data for direct copy to user */
 	struct {
 		struct sk_buff_head	prequeue;
@@ -494,6 +497,12 @@ struct tcp_sock {
 	struct tcp_cookie_values  *cookie_values;
 };
 
+enum tsq_flags {
+	TSQ_THROTTLED,
+	TSQ_QUEUED,
+	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index dcb54a0793ec..88de092df50f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -858,6 +858,8 @@ struct proto {
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
 
+	void		(*release_cb)(struct sock *sk);
+
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3618fefae049..439984b9af49 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_limit_output_bytes;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -321,6 +322,8 @@ extern struct proto tcp_prot;
 
 extern void tcp_init_mem(struct net *net);
 
+extern void tcp_tasklet_init(void);
+
 extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
@@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
+extern void tcp_release_cb(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 929bdcc2383b..24039ac12426 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk)
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
+
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
+
 	sk->sk_lock.owned = 0;
 	if (waitqueue_active(&sk->sk_lock.wq))
 		wake_up(&sk->sk_lock.wq);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 12aa0c5867c4..70730f7aeafe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_limit_output_bytes",
+		.data		= &sysctl_tcp_limit_output_bytes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d902da96d154..4252cd8f39fd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk)
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
+	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				  inet_csk(sk)->icsk_ext_hdr_len -
 				  tp->tcp_header_len);
 
+		/* TSQ : try to have two TSO segments in flight */
+		xmit_size_goal = min_t(u32, xmit_size_goal,
+				       sysctl_tcp_limit_output_bytes >> 1);
+
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
@@ -3574,4 +3579,5 @@ void __init tcp_init(void)
 	tcp_secret_primary = &tcp_secret_one;
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
+	tcp_tasklet_init();
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ddefd39ac0cf..01545a3fc0f2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2588,6 +2588,7 @@ struct proto tcp_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 65608863fdee..c66f2ede160e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		tcp_prequeue_init(newtp);
+		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklest per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transfering tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We cant xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But thats OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 61175cb2478f..70458a9cd837 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
-- 
cgit v1.2.3


From 1de9243bbfc451962ab716a3f7a7fda26d91c359 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 18:32:17 -0700
Subject: ipv4: Pull icmp socket delivery out into a helper function.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4a049449305f..18e39d1895d4 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -634,18 +634,31 @@ out:;
 EXPORT_SYMBOL(icmp_send);
 
 
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	const struct net_protocol *ipprot;
+	int protocol = iph->protocol;
+
+	raw_icmp_error(skb, protocol, info);
+
+	rcu_read_lock();
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot && ipprot->err_handler)
+		ipprot->err_handler(skb, info);
+	rcu_read_unlock();
+}
+
 /*
  *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
  */
 
 static void icmp_unreach(struct sk_buff *skb)
 {
-	const struct net_protocol *ipprot;
 	const struct iphdr *iph;
 	struct icmphdr *icmph;
 	struct net *net;
 	u32 info = 0;
-	int protocol;
 
 	net = dev_net(skb_dst(skb)->dev);
 
@@ -726,19 +739,7 @@ static void icmp_unreach(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
 		goto out;
 
-	iph = (const struct iphdr *)skb->data;
-	protocol = iph->protocol;
-
-	/*
-	 *	Deliver ICMP message to raw sockets. Pretty useless feature?
-	 */
-	raw_icmp_error(skb, protocol, info);
-
-	rcu_read_lock();
-	ipprot = rcu_dereference(inet_protos[protocol]);
-	if (ipprot && ipprot->err_handler)
-		ipprot->err_handler(skb, info);
-	rcu_read_unlock();
+	icmp_socket_deliver(skb, info);
 
 out:
 	return;
-- 
cgit v1.2.3


From d3351b75a7169337877fe6f6f2c019154b6ec1ea Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 18:35:12 -0700
Subject: ipv4: Deliver ICMP redirects to sockets too.

And thus, we can remove the ping_err() hack.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 18e39d1895d4..588514627aa7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -782,13 +782,7 @@ static void icmp_redirect(struct sk_buff *skb)
 		break;
 	}
 
-	/* Ping wants to see redirects.
-         * Let's pretend they are errors of sorts... */
-	if (iph->protocol == IPPROTO_ICMP &&
-	    iph->ihl >= 5 &&
-	    pskb_may_pull(skb, (iph->ihl<<2)+8)) {
-		ping_err(skb, icmp_hdr(skb)->un.gateway);
-	}
+	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
 
 out:
 	return;
-- 
cgit v1.2.3


From d0da720f9f16a5023cc084bed8968702400f6e0f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 20:27:54 -0700
Subject: ipv4: Pull redirect instantiation out into a helper function.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 95bfa1ba5b28..a4de87f44c30 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1271,6 +1271,26 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
+static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
+{
+	struct neighbour *n;
+
+	if (rt->rt_gateway != old_gw)
+		return;
+
+	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+	if (n) {
+		if (!(n->nud_state & NUD_VALID)) {
+			neigh_event_send(n, NULL);
+		} else {
+			rt->rt_gateway = new_gw;
+			rt->rt_flags |= RTCF_REDIRECTED;
+			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+		}
+		neigh_release(n);
+	}
+}
+
 /* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
@@ -1311,8 +1331,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			rthp = &rt_hash_table[hash].chain;
 
 			while ((rt = rcu_dereference(*rthp)) != NULL) {
-				struct neighbour *n;
-
 				rthp = &rt->dst.rt_next;
 
 				if (rt->rt_key_dst != daddr ||
@@ -1322,21 +1340,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rt_is_expired(rt) ||
 				    !net_eq(dev_net(rt->dst.dev), net) ||
 				    rt->dst.error ||
-				    rt->dst.dev != dev ||
-				    rt->rt_gateway != old_gw)
+				    rt->dst.dev != dev)
 					continue;
 
-				n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
-				if (n) {
-					if (!(n->nud_state & NUD_VALID)) {
-						neigh_event_send(n, NULL);
-					} else {
-						rt->rt_gateway = new_gw;
-						rt->rt_flags |= RTCF_REDIRECTED;
-						call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
-					}
-					neigh_release(n);
-				}
+				ip_do_redirect(rt, old_gw, new_gw);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 94206125c4aac32e43c25bfe1b827e7ab993b7dc Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 20:38:08 -0700
Subject: ipv4: Rearrange arguments to ip_rt_redirect()

Pass in the SKB rather than just the IP addresses, so that policy
and other aspects can reside in ip_rt_redirect() rather then
icmp_redirect().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  3 +--
 net/ipv4/icmp.c     | 36 ++++++------------------------------
 net/ipv4/route.c    | 23 +++++++++++++++++++----
 3 files changed, 26 insertions(+), 36 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 52362368af09..b1402787aecb 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -108,8 +108,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 
 struct in_device;
 extern int		ip_rt_init(void);
-extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
-				       __be32 src, struct net_device *dev);
+extern void		ip_rt_redirect(struct sk_buff *skb, __be32 new_gw);
 extern void		rt_cache_flush(struct net *net, int how);
 extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 588514627aa7..70a793559bb5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -755,40 +755,16 @@ out_err:
 
 static void icmp_redirect(struct sk_buff *skb)
 {
-	const struct iphdr *iph;
-
-	if (skb->len < sizeof(struct iphdr))
-		goto out_err;
+	if (skb->len < sizeof(struct iphdr)) {
+		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+		return;
+	}
 
-	/*
-	 *	Get the copied header of the packet that caused the redirect
-	 */
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		goto out;
-
-	iph = (const struct iphdr *)skb->data;
-
-	switch (icmp_hdr(skb)->code & 7) {
-	case ICMP_REDIR_NET:
-	case ICMP_REDIR_NETTOS:
-		/*
-		 * As per RFC recommendations now handle it as a host redirect.
-		 */
-	case ICMP_REDIR_HOST:
-	case ICMP_REDIR_HOSTTOS:
-		ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
-			       icmp_hdr(skb)->un.gateway,
-			       iph->saddr, skb->dev);
-		break;
-	}
+		return;
 
+	ip_rt_redirect(skb, icmp_hdr(skb)->un.gateway);
 	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
-
-out:
-	return;
-out_err:
-	ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
-	goto out;
 }
 
 /*
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a4de87f44c30..f8921b448c39 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1292,18 +1292,33 @@ static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
 }
 
 /* called in rcu_read_lock() section */
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
-		    __be32 saddr, struct net_device *dev)
+void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
 {
-	int s, i;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	__be32 old_gw = ip_hdr(skb)->saddr;
+	__be32 daddr = iph->daddr;
+	__be32 saddr = iph->saddr;
+	struct net_device *dev = skb->dev;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	__be32 skeys[2] = { saddr, 0 };
 	int    ikeys[2] = { dev->ifindex, 0 };
+	__be32 skeys[2] = { saddr, 0 };
 	struct net *net;
+	int s, i;
 
 	if (!in_dev)
 		return;
 
+	switch (icmp_hdr(skb)->code & 7) {
+	case ICMP_REDIR_NET:
+	case ICMP_REDIR_NETTOS:
+	case ICMP_REDIR_HOST:
+	case ICMP_REDIR_HOSTTOS:
+		break;
+
+	default:
+		return;
+	}
+
 	net = dev_net(dev);
 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
-- 
cgit v1.2.3


From e47a185b31dd2acd424fac7dc0efb96fc5b31a33 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 20:55:47 -0700
Subject: ipv4: Generalize ip_do_redirect() and hook into new
 dst_ops->redirect.

All of the redirect acceptance policy is now contained within.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_ops.h |  1 +
 net/ipv4/route.c      | 94 +++++++++++++++++++++++++++++----------------------
 2 files changed, 55 insertions(+), 40 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 4badc86e45d1..085931fa7ce0 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -25,6 +25,7 @@ struct dst_ops {
 	struct dst_entry *	(*negative_advice)(struct dst_entry *);
 	void			(*link_failure)(struct sk_buff *);
 	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
+	void			(*redirect)(struct dst_entry *dst, struct sk_buff *skb);
 	int			(*local_out)(struct sk_buff *skb);
 	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
 						struct sk_buff *skb,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f8921b448c39..f3d25656ddb0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -149,6 +149,7 @@ static void		 ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static void		 ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
 static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -179,6 +180,7 @@ static struct dst_ops ipv4_dst_ops = {
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
+	.redirect =		ip_do_redirect,
 	.local_out =		__ip_local_out,
 	.neigh_lookup =		ipv4_neigh_lookup,
 };
@@ -1271,42 +1273,18 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
-{
-	struct neighbour *n;
-
-	if (rt->rt_gateway != old_gw)
-		return;
-
-	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
-	if (n) {
-		if (!(n->nud_state & NUD_VALID)) {
-			neigh_event_send(n, NULL);
-		} else {
-			rt->rt_gateway = new_gw;
-			rt->rt_flags |= RTCF_REDIRECTED;
-			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
-		}
-		neigh_release(n);
-	}
-}
-
-/* called in rcu_read_lock() section */
-void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
+static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
 {
 	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
+	struct net_device *dev = skb->dev;
 	__be32 daddr = iph->daddr;
 	__be32 saddr = iph->saddr;
-	struct net_device *dev = skb->dev;
-	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	int    ikeys[2] = { dev->ifindex, 0 };
-	__be32 skeys[2] = { saddr, 0 };
+	struct in_device *in_dev;
+	struct neighbour *n;
+	struct rtable *rt;
 	struct net *net;
-	int s, i;
-
-	if (!in_dev)
-		return;
 
 	switch (icmp_hdr(skb)->code & 7) {
 	case ICMP_REDIR_NET:
@@ -1319,6 +1297,14 @@ void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
 		return;
 	}
 
+	rt = (struct rtable *) dst;
+	if (rt->rt_gateway != old_gw)
+		return;
+
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		return;
+
 	net = dev_net(dev);
 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
@@ -1335,6 +1321,43 @@ void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
 			goto reject_redirect;
 	}
 
+	n = ipv4_neigh_lookup(dst, NULL, &new_gw);
+	if (n) {
+		if (!(n->nud_state & NUD_VALID)) {
+			neigh_event_send(n, NULL);
+		} else {
+			rt->rt_gateway = new_gw;
+			rt->rt_flags |= RTCF_REDIRECTED;
+			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+		}
+		neigh_release(n);
+	}
+	return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev))
+		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+				     "  Advised path = %pI4 -> %pI4\n",
+				     &old_gw, dev->name, &new_gw,
+				     &saddr, &daddr);
+#endif
+	;
+}
+
+/* called in rcu_read_lock() section */
+void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
+{
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	__be32 daddr = iph->daddr;
+	__be32 saddr = iph->saddr;
+	struct net_device *dev = skb->dev;
+	int    ikeys[2] = { dev->ifindex, 0 };
+	__be32 skeys[2] = { saddr, 0 };
+	struct net *net;
+	int s, i;
+
+	net = dev_net(dev);
 	for (s = 0; s < 2; s++) {
 		for (i = 0; i < 2; i++) {
 			unsigned int hash;
@@ -1358,21 +1381,12 @@ void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
 				    rt->dst.dev != dev)
 					continue;
 
-				ip_do_redirect(rt, old_gw, new_gw);
+				ip_do_redirect(&rt->dst, skb);
 			}
 		}
 	}
 	return;
 
-reject_redirect:
-#ifdef CONFIG_IP_ROUTE_VERBOSE
-	if (IN_DEV_LOG_MARTIANS(in_dev))
-		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
-				     "  Advised path = %pI4 -> %pI4\n",
-				     &old_gw, dev->name, &new_gw,
-				     &saddr, &daddr);
-#endif
-	;
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
-- 
cgit v1.2.3


From b42597e2f36e2043756aa7462faddf630962f061 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 21:25:45 -0700
Subject: ipv4: Add ipv4_redirect() and ipv4_sk_redirect() helper functions.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  3 +++
 net/ipv4/route.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index b1402787aecb..6ab93eeb8660 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -180,6 +180,9 @@ static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 s
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			     int oif, u32 mark, u8 protocol, int flow_flags);
 extern void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
+extern void ipv4_redirect(struct sk_buff *skb, struct net *net,
+			  int oif, u32 mark, u8 protocol, int flow_flags);
+extern void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 extern void ip_rt_send_redirect(struct sk_buff *skb);
 
 extern unsigned int		inet_addr_type(struct net *net, __be32 addr);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3d25656ddb0..aabece6b729a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1590,6 +1590,34 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+		   int oif, u32 mark, u8 protocol, int flow_flags)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
+			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt)) {
+		ip_do_redirect(&rt->dst, skb);
+		ip_rt_put(rt);
+	}
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
+			     sk->sk_mark,
+			     inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+			     inet_sk_flowi_flags(sk));
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rtable *rt = (struct rtable *) dst;
-- 
cgit v1.2.3


From 55be7a9c6074f749d617a7fc1914c9a23505438c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 21:27:49 -0700
Subject: ipv4: Add redirect support to all protocol icmp error handlers.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c         | 11 +++++++++++
 net/ipv4/ah4.c          | 18 +++++++++++++-----
 net/ipv4/esp4.c         | 18 +++++++++++++-----
 net/ipv4/ip_gre.c       |  9 ++++++++-
 net/ipv4/ipcomp.c       | 18 +++++++++++++-----
 net/ipv4/ipip.c         |  9 +++++++++
 net/ipv4/ping.c         |  1 +
 net/ipv4/raw.c          |  2 ++
 net/ipv4/tcp_ipv4.c     | 11 +++++++++++
 net/ipv4/udp.c          |  3 +++
 net/ipv4/xfrm4_policy.c | 10 ++++++++++
 net/sctp/input.c        | 16 ++++++++++++++++
 12 files changed, 110 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3eb76b5f221a..8f41a3190858 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -195,6 +195,14 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	} /* else let the usual retransmit timer handle it */
 }
 
+static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+	if (dst && dst->ops->redirect)
+		dst->ops->redirect(dst, skb);
+}
+
 /*
  * This routine is called by the ICMP module when it gets some sort of error
  * condition. If err < 0 then the socket should be closed and the error
@@ -259,6 +267,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
 	}
 
 	switch (type) {
+	case ICMP_REDIRECT:
+		dccp_do_redirect(skb, sk);
+		goto out;
 	case ICMP_SOURCE_QUENCH:
 		/* Just silently ignore these. */
 		goto out;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 916d5ecaf6c6..a0d8392491c3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -398,17 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
-	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return;
+	case ICMP_REDIRECT:
+		break;
+	default:
 		return;
+	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
-	pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
-		 ntohl(ah->spi), ntohl(iph->daddr));
-	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7b95b49a36ce..b61e9deb7c7e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -484,17 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
-	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return;
+	case ICMP_REDIRECT:
+		break;
+	default:
 		return;
+	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
-		 ntohl(esph->spi), ntohl(iph->daddr));
-	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 594cec35ac4d..0c3123566d76 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -528,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		if (code != ICMP_EXC_TTL)
 			return;
 		break;
+
+	case ICMP_REDIRECT:
+		break;
 	}
 
 	rcu_read_lock();
@@ -543,7 +546,11 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 				 t->parms.link, 0, IPPROTO_GRE, 0);
 		goto out;
 	}
-
+	if (type == ICMP_REDIRECT) {
+		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+			      IPPROTO_GRE, 0);
+		goto out;
+	}
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
 		goto out;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index b91375482d84..d3ab47e19a89 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,18 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
-	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
-	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return;
+	case ICMP_REDIRECT:
+		break;
+	default:
 		return;
+	}
 
 	spi = htonl(ntohs(ipch->cpi));
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
-		 spi, &iph->daddr);
-	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 715338a1b205..c2d0e6d8baaf 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -360,6 +360,8 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		if (code != ICMP_EXC_TTL)
 			return 0;
 		break;
+	case ICMP_REDIRECT:
+		break;
 	}
 
 	err = -ENOENT;
@@ -376,6 +378,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		goto out;
 	}
 
+	if (type == ICMP_REDIRECT) {
+		ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
+			      IPPROTO_IPIP, 0);
+		err = 0;
+		goto out;
+	}
+
 	if (t->parms.iph.daddr == 0)
 		goto out;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 340fcf29a966..6232d476f37e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -387,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info)
 		break;
 	case ICMP_REDIRECT:
 		/* See ICMP_SOURCE_QUENCH */
+		ipv4_sk_redirect(skb, sk);
 		err = EREMOTEIO;
 		break;
 	}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 659ddfb10947..ff0f071969ea 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -218,6 +218,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 		ipv4_sk_update_pmtu(skb, sk, info);
+	else if (type == ICMP_REDIRECT)
+		ipv4_sk_redirect(skb, sk);
 
 	/* Report error on raw socket, if:
 	   1. User requested ip_recverr.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 01545a3fc0f2..087a8488843f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -321,6 +321,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 	} /* else let the usual retransmit timer handle it */
 }
 
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+	if (dst && dst->ops->redirect)
+		dst->ops->redirect(dst, skb);
+}
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -394,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	}
 
 	switch (type) {
+	case ICMP_REDIRECT:
+		do_redirect(icmp_skb, sk);
+		goto out;
 	case ICMP_SOURCE_QUENCH:
 		/* Just silently ignore these. */
 		goto out;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee37d47d472e..b4c3582a991f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -630,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			err = icmp_err_convert[code].errno;
 		}
 		break;
+	case ICMP_REDIRECT:
+		ipv4_sk_redirect(skb, sk);
+		break;
 	}
 
 	/*
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 87d3fcc302d4..258ebd7b268b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -202,6 +202,15 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
 	path->ops->update_pmtu(path, mtu);
 }
 
+static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
+
+	if (path->ops->redirect)
+		path->ops->redirect(path, skb);
+}
+
 static void xfrm4_dst_destroy(struct dst_entry *dst)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
@@ -225,6 +234,7 @@ static struct dst_ops xfrm4_dst_ops = {
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.gc =			xfrm4_garbage_collect,
 	.update_pmtu =		xfrm4_update_pmtu,
+	.redirect =		xfrm4_redirect,
 	.cow_metrics =		dst_cow_metrics_generic,
 	.destroy =		xfrm4_dst_destroy,
 	.ifdown =		xfrm4_dst_ifdown,
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 80564fe03024..9fb4247f9a99 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -423,6 +423,18 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 	sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
 }
 
+static void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
+			       struct sk_buff *skb)
+{
+	struct dst_entry *dst;
+
+	if (!t)
+		return;
+	dst = sctp_transport_dst_check(t);
+	if (dst && dst->ops->redirect)
+		dst->ops->redirect(dst, skb);
+}
+
 /*
  * SCTP Implementer's Guide, 2.37 ICMP handling procedures
  *
@@ -628,6 +640,10 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 
 		err = EHOSTUNREACH;
 		break;
+	case ICMP_REDIRECT:
+		sctp_icmp_redirect(sk, transport, skb);
+		err = 0;
+		break;
 	default:
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From 1f42539d257af671d56d4bdbcf13aef31abff6ef Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 11 Jul 2012 21:30:08 -0700
Subject: ipv4: Kill ip_rt_redirect().

No longer needed, as the protocol handlers now all properly
propagate the redirect back into the routing code.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  1 -
 net/ipv4/icmp.c     |  1 -
 net/ipv4/route.c    | 44 --------------------------------------------
 3 files changed, 46 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 6ab93eeb8660..ace3cb442519 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -108,7 +108,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 
 struct in_device;
 extern int		ip_rt_init(void);
-extern void		ip_rt_redirect(struct sk_buff *skb, __be32 new_gw);
 extern void		rt_cache_flush(struct net *net, int how);
 extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 70a793559bb5..d01aeb4d492e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -763,7 +763,6 @@ static void icmp_redirect(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		return;
 
-	ip_rt_redirect(skb, icmp_hdr(skb)->un.gateway);
 	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index aabece6b729a..e98207dcd088 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1345,50 +1345,6 @@ reject_redirect:
 	;
 }
 
-/* called in rcu_read_lock() section */
-void ip_rt_redirect(struct sk_buff *skb, __be32 new_gw)
-{
-	const struct iphdr *iph = (const struct iphdr *) skb->data;
-	__be32 daddr = iph->daddr;
-	__be32 saddr = iph->saddr;
-	struct net_device *dev = skb->dev;
-	int    ikeys[2] = { dev->ifindex, 0 };
-	__be32 skeys[2] = { saddr, 0 };
-	struct net *net;
-	int s, i;
-
-	net = dev_net(dev);
-	for (s = 0; s < 2; s++) {
-		for (i = 0; i < 2; i++) {
-			unsigned int hash;
-			struct rtable __rcu **rthp;
-			struct rtable *rt;
-
-			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
-
-			rthp = &rt_hash_table[hash].chain;
-
-			while ((rt = rcu_dereference(*rthp)) != NULL) {
-				rthp = &rt->dst.rt_next;
-
-				if (rt->rt_key_dst != daddr ||
-				    rt->rt_key_src != skeys[s] ||
-				    rt->rt_oif != ikeys[i] ||
-				    rt_is_input_route(rt) ||
-				    rt_is_expired(rt) ||
-				    !net_eq(dev_net(rt->dst.dev), net) ||
-				    rt->dst.error ||
-				    rt->dst.dev != dev)
-					continue;
-
-				ip_do_redirect(&rt->dst, skb);
-			}
-		}
-	}
-	return;
-
-}
-
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
-- 
cgit v1.2.3


From b587ee3ba21f58b7770a132e6bca5c6658ac5095 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 00:39:24 -0700
Subject: net: Add dummy dst_ops->redirect method where needed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 5 +++++
 net/decnet/dn_route.c     | 6 ++++++
 net/ipv4/route.c          | 5 +++++
 net/ipv6/route.c          | 5 +++++
 4 files changed, 21 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index b98d3d78ca7f..81f76c402cf2 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -115,6 +115,10 @@ static void fake_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
 
+static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb)
+{
+}
+
 static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	return NULL;
@@ -136,6 +140,7 @@ static struct dst_ops fake_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.update_pmtu =		fake_update_pmtu,
+	.redirect =		fake_redirect,
 	.cow_metrics =		fake_cow_metrics,
 	.neigh_lookup =		fake_neigh_lookup,
 	.mtu =			fake_mtu,
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index b5594cc73ee1..e9c4e2e864c6 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -118,6 +118,7 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
 static void dn_dst_link_failure(struct sk_buff *);
 static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
+static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb);
 static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
 					     struct sk_buff *skb,
 					     const void *daddr);
@@ -145,6 +146,7 @@ static struct dst_ops dn_dst_ops = {
 	.negative_advice =	dn_dst_negative_advice,
 	.link_failure =		dn_dst_link_failure,
 	.update_pmtu =		dn_dst_update_pmtu,
+	.redirect =		dn_dst_redirect,
 	.neigh_lookup =		dn_dst_neigh_lookup,
 };
 
@@ -292,6 +294,10 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
 	}
 }
 
+static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb)
+{
+}
+
 /*
  * When a route has been marked obsolete. (e.g. routing cache flush)
  */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e98207dcd088..23bbe29b3bba 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2591,6 +2591,10 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
 
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+{
+}
+
 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
 					  unsigned long old)
 {
@@ -2605,6 +2609,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.mtu			=	ipv4_blackhole_mtu,
 	.default_advmss		=	ipv4_default_advmss,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
+	.redirect		=	ipv4_rt_blackhole_redirect,
 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
 	.neigh_lookup		=	ipv4_neigh_lookup,
 };
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7296af144d6c..3151aabff5fd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -191,6 +191,10 @@ static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
 
+static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+{
+}
+
 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 					 unsigned long old)
 {
@@ -205,6 +209,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 	.mtu			=	ip6_blackhole_mtu,
 	.default_advmss		=	ip6_default_advmss,
 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
+	.redirect		=	ip6_rt_blackhole_redirect,
 	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
 	.neigh_lookup		=	ip6_neigh_lookup,
 };
-- 
cgit v1.2.3


From 1ed5c48f231cd00eac0b3d2350ac61e3c825063e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 00:41:25 -0700
Subject: net: Remove checks for dst_ops->redirect being NULL.

No longer necessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c         | 2 +-
 net/dccp/ipv6.c         | 2 +-
 net/ipv4/tcp_ipv4.c     | 2 +-
 net/ipv4/xfrm4_policy.c | 3 +--
 net/ipv6/ip6_tunnel.c   | 6 ++----
 net/ipv6/tcp_ipv6.c     | 2 +-
 net/sctp/input.c        | 2 +-
 7 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8f41a3190858..129ed8f74138 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -199,7 +199,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
-	if (dst && dst->ops->redirect)
+	if (dst)
 		dst->ops->redirect(dst, skb);
 }
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index b4d7d28ce6d2..090c0800ce03 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -133,7 +133,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (type == NDISC_REDIRECT) {
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
-		if (dst && dst->ops->redirect)
+		if (dst)
 			dst->ops->redirect(dst, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 087a8488843f..7a0062cb4ed0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -325,7 +325,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
-	if (dst && dst->ops->redirect)
+	if (dst)
 		dst->ops->redirect(dst, skb);
 }
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 258ebd7b268b..737131cef375 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -207,8 +207,7 @@ static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb)
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	if (path->ops->redirect)
-		path->ops->redirect(path, skb);
+	path->ops->redirect(path, skb);
 }
 
 static void xfrm4_dst_destroy(struct dst_entry *dst)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 0b5b60ec6f4a..61d106597296 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -611,10 +611,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info);
 	}
-	if (rel_type == ICMP_REDIRECT) {
-		if (skb_dst(skb2)->ops->redirect)
-			skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2);
-	}
+	if (rel_type == ICMP_REDIRECT)
+		skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2);
 
 	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7249e4bb9b8a..3071f377145c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -366,7 +366,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (type == NDISC_REDIRECT) {
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
-		if (dst && dst->ops->redirect)
+		if (dst)
 			dst->ops->redirect(dst,skb);
 	}
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5943b7d77ddb..f050d45faa98 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -431,7 +431,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
 	if (!t)
 		return;
 	dst = sctp_transport_dst_check(t);
-	if (dst && dst->ops->redirect)
+	if (dst)
 		dst->ops->redirect(dst, skb);
 }
 
-- 
cgit v1.2.3


From 99ee038d41ebbb442921b6d388d08e907b037dac Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 07:40:05 -0700
Subject: ipv4: Fix warnings in ip_do_redirect() for some configurations.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 23bbe29b3bba..9319bf1f8354 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,12 +1275,9 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 
 static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
 {
-	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
 	struct net_device *dev = skb->dev;
-	__be32 daddr = iph->daddr;
-	__be32 saddr = iph->saddr;
 	struct in_device *in_dev;
 	struct neighbour *n;
 	struct rtable *rt;
@@ -1336,11 +1333,16 @@ static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
 
 reject_redirect:
 #ifdef CONFIG_IP_ROUTE_VERBOSE
-	if (IN_DEV_LOG_MARTIANS(in_dev))
+	if (IN_DEV_LOG_MARTIANS(in_dev)) {
+		const struct iphdr *iph = (const struct iphdr *) skb->data;
+		__be32 daddr = iph->daddr;
+		__be32 saddr = iph->saddr;
+
 		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 				     "  Advised path = %pI4 -> %pI4\n",
 				     &old_gw, dev->name, &new_gw,
 				     &saddr, &daddr);
+	}
 #endif
 	;
 }
-- 
cgit v1.2.3


From f0a70e902f483295a8b6d74ef4393bc577b703d7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 08:06:04 -0700
Subject: ipv4: Put proper checks into icmp_socket_deliver().

All handler->err() routines expect that we've done a pskb_may_pull()
test to make sure that IP header length + 8 bytes can be safely
pulled.

Reported-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d01aeb4d492e..ea3a996de95b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -640,6 +640,12 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
 	const struct net_protocol *ipprot;
 	int protocol = iph->protocol;
 
+	/* Checkin full IP header plus 8 bytes of protocol to
+	 * avoid additional coding at protocol handlers.
+	 */
+	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+		return;
+
 	raw_icmp_error(skb, protocol, info);
 
 	rcu_read_lock();
@@ -733,12 +739,6 @@ static void icmp_unreach(struct sk_buff *skb)
 		goto out;
 	}
 
-	/* Checkin full IP header plus 8 bytes of protocol to
-	 * avoid additional coding at protocol handlers.
-	 */
-	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
-		goto out;
-
 	icmp_socket_deliver(skb, info);
 
 out:
-- 
cgit v1.2.3


From 391e5c22f5f4e55817f8ba18a08ea717ed2d4a1f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 09:39:28 -0700
Subject: ipv4: Remove tb_peers from fib_table.

No longer used.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 -
 net/ipv4/fib_trie.c  | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 000c4674e18e..e91fedd22db2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -162,7 +162,6 @@ struct fib_table {
 	u32			tb_id;
 	int			tb_default;
 	int			tb_num_default;
-	struct inet_peer_base	tb_peers;
 	unsigned long		tb_data[0];
 };
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9b0f25930fbc..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1843,8 +1843,6 @@ int fib_table_flush(struct fib_table *tb)
 	if (ll && hlist_empty(&ll->list))
 		trie_leaf_remove(t, ll);
 
-	inetpeer_invalidate_tree(&tb->tb_peers);
-
 	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }
@@ -1993,7 +1991,6 @@ struct fib_table *fib_trie_table(u32 id)
 	tb->tb_id = id;
 	tb->tb_default = -1;
 	tb->tb_num_default = 0;
-	inet_peer_base_init(&tb->tb_peers);
 
 	t = (struct trie *) tb->tb_data;
 	memset(t, 0, sizeof(*t));
-- 
cgit v1.2.3


From d01cb20711e3c2df41677ee270d6bdeff24e9902 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 12 Jul 2012 22:46:09 +0000
Subject: tcp: add LAST_ACK as a valid state for TSQ

Socket state LAST_ACK should allow TSQ to send additional frames,
or else we rely on incoming ACKS or timers to send them.

Reported-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 03854abfd9d8..15a7c7bc3e58 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -838,7 +838,7 @@ static void tcp_tasklet_func(unsigned long data)
 		if (!sock_owned_by_user(sk)) {
 			if ((1 << sk->sk_state) &
 			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
 				tcp_write_xmit(sk,
 					       tcp_current_mss(sk),
 					       0, 0,
@@ -868,7 +868,7 @@ void tcp_release_cb(struct sock *sk)
 	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
 		if ((1 << sk->sk_state) &
 		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
 			tcp_write_xmit(sk,
 				       tcp_current_mss(sk),
 				       0, 0,
-- 
cgit v1.2.3


From 85b91b0339e764f7e56ff5968fa10d85451378b4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 13 Jul 2012 08:21:29 -0700
Subject: ipv4: Don't store a rule pointer in fib_result.

We only use it to fetch the rule's tclassid, so just store the
tclassid there instead.

This also decreases the size of fib_result by a full 8 bytes on
64-bit.  On 32-bits it's a wash.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    | 12 +++---------
 net/ipv4/fib_frontend.c |  8 --------
 net/ipv4/fib_rules.c    | 15 ++++++---------
 net/ipv4/route.c        |  6 ++----
 4 files changed, 11 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e91fedd22db2..5697acefeba3 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -106,12 +106,10 @@ struct fib_result {
 	unsigned char	nh_sel;
 	unsigned char	type;
 	unsigned char	scope;
+	u32		tclassid;
 	struct fib_info *fi;
 	struct fib_table *table;
 	struct list_head *fa_head;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	struct fib_rule	*r;
-#endif
 };
 
 struct fib_result_nl {
@@ -215,10 +213,6 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 extern int __net_init fib4_rules_init(struct net *net);
 extern void __net_exit fib4_rules_exit(struct net *net);
 
-#ifdef CONFIG_IP_ROUTE_CLASSID
-extern u32 fib_rules_tclass(const struct fib_result *res);
-#endif
-
 extern struct fib_table *fib_new_table(struct net *net, u32 id);
 extern struct fib_table *fib_get_table(struct net *net, u32 id);
 
@@ -229,7 +223,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res)
 {
 	if (!net->ipv4.fib_has_custom_rules) {
-		res->r = NULL;
+		res->tclassid = 0;
 		if (net->ipv4.fib_local &&
 		    !fib_table_lookup(net->ipv4.fib_local, flp, res,
 				      FIB_LOOKUP_NOREF))
@@ -289,7 +283,7 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 #endif
 	*itag = FIB_RES_NH(*res).nh_tclassid<<16;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-	rtag = fib_rules_tclass(res);
+	rtag = res->tclassid;
 	if (*itag == 0)
 		*itag = (rtag<<16);
 	*itag |= (rtag>>16);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 81f85716a894..7a31194ec633 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -169,10 +169,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (ipv4_is_multicast(addr))
 		return RTN_MULTICAST;
 
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
 	if (local_table) {
 		ret = RTN_UNICAST;
@@ -934,10 +930,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		.flowi4_scope = frn->fl_scope,
 	};
 
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
 	frn->err = -ENOENT;
 	if (tb) {
 		local_bh_disable();
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index c06da93b0b70..a83d74e498d2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,13 +47,6 @@ struct fib4_rule {
 #endif
 };
 
-#ifdef CONFIG_IP_ROUTE_CLASSID
-u32 fib_rules_tclass(const struct fib_result *res)
-{
-	return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
-}
-#endif
-
 int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
@@ -63,8 +56,12 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 	int err;
 
 	err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
-	res->r = arg.rule;
-
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (arg.rule)
+		res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+	else
+		res->tclassid = 0;
+#endif
 	return err;
 }
 EXPORT_SYMBOL_GPL(__fib_lookup);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9319bf1f8354..aad21819316d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1735,7 +1735,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-	set_class_tag(rt, fib_rules_tclass(res));
+	set_class_tag(rt, res->tclassid);
 #endif
 	set_class_tag(rt, itag);
 #endif
@@ -2353,11 +2353,9 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 	__be32 orig_saddr;
 	int orig_oif;
 
+	res.tclassid	= 0;
 	res.fi		= NULL;
 	res.table	= NULL;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r		= NULL;
-#endif
 
 	orig_daddr = fl4->daddr;
 	orig_saddr = fl4->saddr;
-- 
cgit v1.2.3


From 80d0a69fc57715dc9080c0567df1ed911b78abea Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 16 Jul 2012 03:28:06 -0700
Subject: ipv4: Add helper inet_csk_update_pmtu().

This abstracts away the call to dst_ops->update_pmtu() so that we can
transparently handle the fact that, in the future, the dst itself can
be invalidated by the PMTU update (when we have non-host routes cached
in sockets).

So we try to rebuild the socket cached route after the method
invocation if necessary.

This isn't used by SCTP because it needs to cache dsts per-transport,
and thus will need it's own local version of this helper.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  2 ++
 net/dccp/ipv4.c                    | 11 ++-------
 net/ipv4/inet_connection_sock.c    | 46 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c                | 11 ++-------
 4 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 291e7cee14e7..2cf44b4ed2e6 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -337,4 +337,6 @@ extern int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 				      char __user *optval, int __user *optlen);
 extern int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 				      char __user *optval, unsigned int optlen);
+
+extern struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 #endif /* _INET_CONNECTION_SOCK_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 129ed8f74138..683902fcc8ed 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -161,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	if (sk->sk_state == DCCP_LISTEN)
 		return;
 
-	/* We don't check in the destentry if pmtu discovery is forbidden
-	 * on this route. We just assume that no packet_to_big packets
-	 * are send back when pmtu discovery is not active.
-	 * There is a small race when the user changes this flag in the
-	 * route, but I think that's acceptable.
-	 */
-	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+	dst = inet_csk_update_pmtu(sk, mtu);
+	if (!dst)
 		return;
 
-	dst->ops->update_pmtu(dst, mtu);
-
 	/* Something is about to be wrong... Remember soft error
 	 * for the case, if this connection will not able to recover.
 	 */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 76825be3b643..200d21809379 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -803,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 }
 EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 #endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	fl4 = &fl->u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+				   inet->inet_saddr, inet->inet_dport,
+				   inet->inet_sport, sk->sk_protocol,
+				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+	if (IS_ERR(rt))
+		rt = NULL;
+	if (rt)
+		sk_setup_caps(sk, &rt->dst);
+	rcu_read_unlock();
+
+	return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+	struct dst_entry *dst = __sk_dst_check(sk, 0);
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!dst) {
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+		if (!dst)
+			goto out;
+	}
+	dst->ops->update_pmtu(dst, mtu);
+
+	dst = __sk_dst_check(sk, 0);
+	if (!dst)
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+	return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7a0062cb4ed0..b8e7e0595407 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -289,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 	if (sk->sk_state == TCP_LISTEN)
 		return;
 
-	/* We don't check in the destentry if pmtu discovery is forbidden
-	 * on this route. We just assume that no packet_to_big packets
-	 * are send back when pmtu discovery is not active.
-	 * There is a small race when the user changes this flag in the
-	 * route, but I think that's acceptable.
-	 */
-	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+	dst = inet_csk_update_pmtu(sk, mtu);
+	if (!dst)
 		return;
 
-	dst->ops->update_pmtu(dst, mtu);
-
 	/* Something is about to be wrong... Remember soft error
 	 * for the case, if this connection will not able to recover.
 	 */
-- 
cgit v1.2.3


From a6df1ae9383697c4eb1365176002f154982325ad Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Jul 2012 01:41:36 +0000
Subject: tcp: add OFO snmp counters

Add three SNMP TCP counters, to better track TCP behavior
at global stage (netstat -s), when packets are received
Out Of Order (OFO)

TCPOFOQueue : Number of packets queued in OFO queue

TCPOFODrop  : Number of packets meant to be queued in OFO
              but dropped because socket rcvbuf limit hit.

TCPOFOMerge : Number of packets in OFO that were merged with
              other packets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h | 5 ++++-
 net/ipv4/proc.c      | 3 +++
 net/ipv4/tcp_input.c | 7 +++++--
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 2e68f5ba0389..6e4c51123828 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -233,7 +233,10 @@ enum
 	LINUX_MIB_TCPREQQFULLDOCOOKIES,		/* TCPReqQFullDoCookies */
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
-	LINUX_MIB_TCPRCVCOALESCE,			/* TCPRcvCoalesce */
+	LINUX_MIB_TCPRCVCOALESCE,		/* TCPRcvCoalesce */
+	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
+	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
+	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44e4e22..dae25e7622cf 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -258,6 +258,9 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
 	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
 	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 055ac49b8b40..cc4e12f1f2f7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4397,8 +4397,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (tcp_try_rmem_schedule(sk, skb->truesize)) {
-		/* TODO: should increment a counter */
+	if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		__kfree_skb(skb);
 		return;
 	}
@@ -4407,6 +4407,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
 
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
@@ -4460,6 +4461,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
 		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 			/* All the bits are present. Drop. */
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
 			__kfree_skb(skb);
 			skb = NULL;
 			tcp_dsack_set(sk, seq, end_seq);
@@ -4498,6 +4500,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		__skb_unlink(skb1, &tp->out_of_order_queue);
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
 		__kfree_skb(skb1);
 	}
 
-- 
cgit v1.2.3


From 51d7cccf07238f5236c5b9269231a30dd5f8e714 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Mon, 16 Jul 2012 04:28:49 +0000
Subject: net: make sock diag per-namespace

Before this patch sock_diag works for init_net only and dumps
information about sockets from all namespaces.

This patch expands sock_diag for all name-spaces.
It creates a netlink kernel socket for each netns and filters
data during dumping.

v2: filter accoding with netns in all places
    remove an unused variable.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Pavel Emelyanov <xemul@parallels.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sock_diag.h   |  1 -
 include/net/net_namespace.h |  1 +
 net/core/sock_diag.c        | 27 ++++++++++++++++++++-------
 net/ipv4/inet_diag.c        | 21 ++++++++++++++++-----
 net/ipv4/udp_diag.c         | 10 +++++++---
 net/unix/diag.c             |  9 +++++++--
 6 files changed, 51 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 6793fac5eab5..e3e395acc2fd 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -44,6 +44,5 @@ void sock_diag_save_cookie(void *sk, __u32 *cookie);
 
 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 
-extern struct sock *sock_diag_nlsk;
 #endif /* KERNEL */
 #endif
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index ac9195e6a062..ae1cd6c9ba52 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -101,6 +101,7 @@ struct net {
 	struct netns_xfrm	xfrm;
 #endif
 	struct netns_ipvs	*ipvs;
+	struct sock		*diag_nlsk;
 };
 
 
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 07a29eb34a41..9d8755e4a7a5 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -166,23 +166,36 @@ static void sock_diag_rcv(struct sk_buff *skb)
 	mutex_unlock(&sock_diag_mutex);
 }
 
-struct sock *sock_diag_nlsk;
-EXPORT_SYMBOL_GPL(sock_diag_nlsk);
-
-static int __init sock_diag_init(void)
+static int __net_init diag_net_init(struct net *net)
 {
 	struct netlink_kernel_cfg cfg = {
 		.input	= sock_diag_rcv,
 	};
 
-	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG,
+	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG,
 					       THIS_MODULE, &cfg);
-	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
+	return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+	netlink_kernel_release(net->diag_nlsk);
+	net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+	.init = diag_net_init,
+	.exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+	return register_pernet_subsys(&diag_net_ops);
 }
 
 static void __exit sock_diag_exit(void)
 {
-	netlink_kernel_release(sock_diag_nlsk);
+	unregister_pernet_subsys(&diag_net_ops);
 }
 
 module_init(sock_diag_init);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 38064a285cca..570e61f9611f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -272,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	int err;
 	struct sock *sk;
 	struct sk_buff *rep;
+	struct net *net = sock_net(in_skb->sk);
 
 	err = -EINVAL;
 	if (req->sdiag_family == AF_INET) {
-		sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
+		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
 				 req->id.idiag_dport, req->id.idiag_src[0],
 				 req->id.idiag_sport, req->id.idiag_if);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6) {
-		sk = inet6_lookup(&init_net, hashinfo,
+		sk = inet6_lookup(net, hashinfo,
 				  (struct in6_addr *)req->id.idiag_dst,
 				  req->id.idiag_dport,
 				  (struct in6_addr *)req->id.idiag_src,
@@ -317,7 +318,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -724,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 {
 	int i, num;
 	int s_i, s_num;
+	struct net *net = sock_net(skb->sk);
 
 	s_i = cb->args[1];
 	s_num = num = cb->args[2];
@@ -743,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
+				if (!net_eq(sock_net(sk), net))
+					continue;
+
 				if (num < s_num) {
 					num++;
 					continue;
@@ -813,6 +818,8 @@ skip_listen_ht:
 		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next_normal;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -839,6 +846,8 @@ next_normal:
 
 			inet_twsk_for_each(tw, node,
 				    &head->twchain) {
+				if (!net_eq(twsk_net(tw), net))
+					continue;
 
 				if (num < s_num)
 					goto next_dying;
@@ -943,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
 static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	int hdrlen = sizeof(struct inet_diag_req);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
 	    nlmsg_len(nlh) < hdrlen)
@@ -963,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump_compat,
 			};
-			return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c);
+			return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
 		}
 	}
 
@@ -973,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 {
 	int hdrlen = sizeof(struct inet_diag_req_v2);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlmsg_len(h) < hdrlen)
 		return -EINVAL;
@@ -991,7 +1002,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump,
 			};
-			return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 		}
 	}
 
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index a7f86a3cd502..16d0960062be 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 	int err = -EINVAL;
 	struct sock *sk;
 	struct sk_buff *rep;
+	struct net *net = sock_net(in_skb->sk);
 
 	if (req->sdiag_family == AF_INET)
-		sk = __udp4_lib_lookup(&init_net,
+		sk = __udp4_lib_lookup(net,
 				req->id.idiag_src[0], req->id.idiag_sport,
 				req->id.idiag_dst[0], req->id.idiag_dport,
 				req->id.idiag_if, tbl);
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6)
-		sk = __udp6_lib_lookup(&init_net,
+		sk = __udp6_lib_lookup(net,
 				(struct in6_addr *)req->id.idiag_src,
 				req->id.idiag_sport,
 				(struct in6_addr *)req->id.idiag_dst,
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
 		struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
 	int num, s_num, slot, s_slot;
+	struct net *net = sock_net(skb->sk);
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
 		sk_nulls_for_each(sk, node, &hslot->head) {
 			struct inet_sock *inet = inet_sk(sk);
 
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
diff --git a/net/unix/diag.c b/net/unix/diag.c
index a74864eedfcd..750b13408449 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -177,6 +177,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct unix_diag_req *req;
 	int num, s_num, slot, s_slot;
+	struct net *net = sock_net(skb->sk);
 
 	req = nlmsg_data(cb->nlh);
 
@@ -192,6 +193,8 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		num = 0;
 		sk_for_each(sk, node, &unix_socket_table[slot]) {
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next;
 			if (!(req->udiag_states & (1 << sk->sk_state)))
@@ -243,6 +246,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	struct sock *sk;
 	struct sk_buff *rep;
 	unsigned int extra_len;
+	struct net *net = sock_net(in_skb->sk);
 
 	if (req->udiag_ino == 0)
 		goto out_nosk;
@@ -273,7 +277,7 @@ again:
 
 		goto again;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -287,6 +291,7 @@ out_nosk:
 static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 {
 	int hdrlen = sizeof(struct unix_diag_req);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlmsg_len(h) < hdrlen)
 		return -EINVAL;
@@ -295,7 +300,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		struct netlink_dump_control c = {
 			.dump = unix_diag_dump,
 		};
-		return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 	} else
 		return unix_diag_get_exact(skb, h, nlmsg_data(h));
 }
-- 
cgit v1.2.3


From 282f23c6ee343126156dd41218b22ece96d747e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Jul 2012 10:13:05 +0200
Subject: tcp: implement RFC 5961 3.2

Implement the RFC 5691 mitigation against Blind
Reset attack using RST bit.

Idea is to validate incoming RST sequence,
to match RCV.NXT value, instead of previouly accepted
window : (RCV.NXT <= SEG.SEQ < RCV.NXT+RCV.WND)

If sequence is in window but not an exact match, send
a "challenge ACK", so that the other part can resend an
RST with the appropriate sequence.

Add a new sysctl, tcp_challenge_ack_limit, to limit
number of challenge ACK sent per second.

Add a new SNMP counter to count number of challenge acks sent.
(netstat -s | grep TCPChallengeACK)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  5 +++++
 include/linux/snmp.h                   |  1 +
 include/net/tcp.h                      |  1 +
 net/ipv4/proc.c                        |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_input.c                   | 31 ++++++++++++++++++++++++++++++-
 6 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e20c17a7d34e..e1e021594cff 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -565,6 +565,11 @@ tcp_limit_output_bytes - INTEGER
 	reduce the size of individual GSO packet (64KB being the max)
 	Default: 131072
 
+tcp_challenge_ack_limit - INTEGER
+	Limits number of Challenge ACK sent per second, as recommended
+	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
+	Default: 100
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 6e4c51123828..673e0e928b2b 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -237,6 +237,7 @@ enum
 	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
 	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
+	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 439984b9af49..85c5090bfe25 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -254,6 +254,7 @@ extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
+extern int sysctl_tcp_challenge_ack_limit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index dae25e7622cf..3e8e78f12a38 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -261,6 +261,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
 	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
 	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70730f7aeafe..3f6a1e762e9c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_challenge_ack_limit",
+		.data		= &sysctl_tcp_challenge_ack_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cc4e12f1f2f7..c841a8990377 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -5247,6 +5250,23 @@ out:
 }
 #endif /* CONFIG_NET_DMA */
 
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+	/* unprotected vars, we dont care of overwrites */
+	static u32 challenge_timestamp;
+	static unsigned int challenge_count;
+	u32 now = jiffies / HZ;
+
+	if (now != challenge_timestamp) {
+		challenge_timestamp = now;
+		challenge_count = 0;
+	}
+	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		tcp_send_ack(sk);
+	}
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5283,7 +5303,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		tcp_reset(sk);
+		/* RFC 5961 3.2 :
+		 * If sequence number exactly matches RCV.NXT, then
+		 *     RESET the connection
+		 * else
+		 *     Send a challenge ACK
+		 */
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+			tcp_reset(sk);
+		else
+			tcp_send_challenge_ack(sk);
 		goto discard;
 	}
 
-- 
cgit v1.2.3


From 6700c2709c08d74ae2c3c29b84a30da012dbc7f1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 03:29:28 -0700
Subject: net: Pass optional SKB and SK arguments to
 dst_ops->{update_pmtu,redirect}()

This will be used so that we can compose a full flow key.

Even though we have a route in this context, we need more.  In the
future the routes will be without destination address, source address,
etc. keying.  One ipv4 route will cover entire subnets, etc.

In this environment we have to have a way to possess persistent storage
for redirects and PMTU information.  This persistent storage will exist
in the FIB tables, and that's why we'll need to be able to rebuild a
full lookup flow key here.  Using that flow key will do a fib_lookup()
and create/update the persistent entry.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |  2 +-
 include/net/dst_ops.h                   |  6 ++++--
 net/bridge/br_netfilter.c               |  6 ++++--
 net/dccp/ipv4.c                         |  2 +-
 net/dccp/ipv6.c                         |  2 +-
 net/decnet/dn_route.c                   | 12 ++++++++----
 net/ipv4/inet_connection_sock.c         |  2 +-
 net/ipv4/ip_gre.c                       |  2 +-
 net/ipv4/ipip.c                         |  2 +-
 net/ipv4/route.c                        | 21 +++++++++++++--------
 net/ipv4/tcp_ipv4.c                     |  2 +-
 net/ipv4/xfrm4_policy.c                 | 10 ++++++----
 net/ipv6/inet6_connection_sock.c        |  2 +-
 net/ipv6/ip6_tunnel.c                   |  6 +++---
 net/ipv6/route.c                        | 21 +++++++++++++--------
 net/ipv6/sit.c                          |  2 +-
 net/ipv6/tcp_ipv6.c                     |  2 +-
 net/ipv6/xfrm6_policy.c                 | 10 ++++++----
 net/netfilter/ipvs/ip_vs_xmit.c         |  4 ++--
 net/sctp/input.c                        |  2 +-
 net/sctp/transport.c                    |  2 +-
 21 files changed, 71 insertions(+), 49 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 014504d8e43c..1ca732201f33 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1397,7 +1397,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 	int e = skb_queue_empty(&priv->cm.skb_queue);
 
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	skb_queue_tail(&priv->cm.skb_queue, skb);
 	if (e)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 085931fa7ce0..d079fc61c123 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -24,8 +24,10 @@ struct dst_ops {
 					  struct net_device *dev, int how);
 	struct dst_entry *	(*negative_advice)(struct dst_entry *);
 	void			(*link_failure)(struct sk_buff *);
-	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
-	void			(*redirect)(struct dst_entry *dst, struct sk_buff *skb);
+	void			(*update_pmtu)(struct dst_entry *dst, struct sock *sk,
+					       struct sk_buff *skb, u32 mtu);
+	void			(*redirect)(struct dst_entry *dst, struct sock *sk,
+					    struct sk_buff *skb);
 	int			(*local_out)(struct sk_buff *skb);
 	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
 						struct sk_buff *skb,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 81f76c402cf2..68e8f364bbf8 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -111,11 +111,13 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 	 pppoe_proto(skb) == htons(PPP_IPV6) && \
 	 brnf_filter_pppoe_tagged)
 
-static void fake_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			     struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb)
 {
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 683902fcc8ed..ab4f44c9bb21 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -193,7 +193,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3ee0342e1cec..56840b249f3b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -134,7 +134,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
 		if (dst)
-			dst->ops->redirect(dst, skb);
+			dst->ops->redirect(dst, sk, skb);
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e9c4e2e864c6..47de90d8fe94 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -117,8 +117,10 @@ static void dn_dst_destroy(struct dst_entry *);
 static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
 static void dn_dst_link_failure(struct sk_buff *);
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb , u32 mtu);
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+			    struct sk_buff *skb);
 static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
 					     struct sk_buff *skb,
 					     const void *daddr);
@@ -266,7 +268,8 @@ static int dn_dst_gc(struct dst_ops *ops)
  * We update both the mtu and the advertised mss (i.e. the segment size we
  * advertise to the other end).
  */
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
 {
 	struct dn_route *rt = (struct dn_route *) dst;
 	struct neighbour *n = rt->n;
@@ -294,7 +297,8 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
 	}
 }
 
-static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+			    struct sk_buff *skb)
 {
 }
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 200d21809379..3ea465286a39 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -840,7 +840,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
 		if (!dst)
 			goto out;
 	}
-	dst->ops->update_pmtu(dst, mtu);
+	dst->ops->update_pmtu(dst, sk, NULL, mtu);
 
 	dst = __sk_dst_check(sk, 0);
 	if (!dst)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0c3123566d76..42c44b1403c9 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -833,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	if (skb->protocol == htons(ETH_P_IP)) {
 		df |= (old_iph->frag_off&htons(IP_DF));
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c2d0e6d8baaf..2c2c35bace76 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -519,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 
 		if (skb_dst(skb))
-			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 		if ((old_iph->frag_off & htons(IP_DF)) &&
 		    mtu < ntohs(old_iph->tot_len)) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index aad21819316d..b35d3bfc66cd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -148,8 +148,10 @@ static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 static void		 ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
-static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void		 ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					   struct sk_buff *skb, u32 mtu);
+static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+					struct sk_buff *skb);
 static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -1273,7 +1275,7 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
@@ -1506,7 +1508,8 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
@@ -1531,7 +1534,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			   iph->daddr, iph->saddr, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_rt_update_pmtu(&rt->dst, mtu);
+		ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
 		ip_rt_put(rt);
 	}
 }
@@ -1559,7 +1562,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
 			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_do_redirect(&rt->dst, skb);
+		ip_do_redirect(&rt->dst, NULL, skb);
 		ip_rt_put(rt);
 	}
 }
@@ -2587,11 +2590,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
 	return mtu ? : dst->dev->mtu;
 }
 
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					  struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+				       struct sk_buff *skb)
 {
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b8e7e0595407..d9caf5c07aae 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -319,7 +319,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 737131cef375..fcf7678bc009 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -194,20 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
-static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->update_pmtu(path, mtu);
+	path->ops->update_pmtu(path, sk, skb, mtu);
 }
 
-static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+			   struct sk_buff *skb)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->redirect(path, skb);
+	path->ops->redirect(path, sk, skb);
 }
 
 static void xfrm4_dst_destroy(struct dst_entry *dst)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 62539a4b2dc7..4a0c4d2d8b05 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -269,7 +269,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
 
 	if (IS_ERR(dst))
 		return NULL;
-	dst->ops->update_pmtu(dst, mtu);
+	dst->ops->update_pmtu(dst, sk, NULL, mtu);
 
 	return inet6_csk_route_socket(sk);
 }
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 61d106597296..db3284667968 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -609,10 +609,10 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (rel_info > dst_mtu(skb_dst(skb2)))
 			goto out;
 
-		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info);
+		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info);
 	}
 	if (rel_type == ICMP_REDIRECT)
-		skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2);
+		skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2);
 
 	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
 
@@ -952,7 +952,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 	if (skb->len > mtu) {
 		*pmtu = mtu;
 		err = -EMSGSIZE;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a4c8d48977f..31af1ed6c1dc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -78,8 +78,10 @@ static int		 ip6_dst_gc(struct dst_ops *ops);
 static int		ip6_pkt_discard(struct sk_buff *skb);
 static int		ip6_pkt_discard_out(struct sk_buff *skb);
 static void		ip6_link_failure(struct sk_buff *skb);
-static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void		rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					   struct sk_buff *skb, u32 mtu);
+static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
+					struct sk_buff *skb);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -187,11 +189,13 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 	return mtu ? : dst->dev->mtu;
 }
 
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					 struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+				      struct sk_buff *skb)
 {
 }
 
@@ -1071,7 +1075,8 @@ static void ip6_link_failure(struct sk_buff *skb)
 	}
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
 {
 	struct rt6_info *rt6 = (struct rt6_info*)dst;
 
@@ -1108,7 +1113,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		ip6_rt_update_pmtu(dst, ntohl(mtu));
+		ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1136,7 +1141,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		rt6_do_redirect(dst, skb);
+		rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_redirect);
@@ -1639,7 +1644,7 @@ static int ip6_route_del(struct fib6_config *cfg)
 	return err;
 }
 
-static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	struct netevent_redirect netevent;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fbf1622fdeef..3bd1bfc01f85 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -807,7 +807,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		}
 
 		if (tunnel->parms.iph.daddr && skb_dst(skb))
-			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 		if (skb->len > mtu) {
 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ecdf241cad02..c9dabdd832d7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -367,7 +367,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
 		if (dst)
-			dst->ops->redirect(dst,skb);
+			dst->ops->redirect(dst, sk, skb);
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f5a9cb8257b9..ef39812107b1 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -207,20 +207,22 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
-static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->update_pmtu(path, mtu);
+	path->ops->update_pmtu(path, sk, skb, mtu);
 }
 
-static void xfrm6_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk,
+			   struct sk_buff *skb)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->redirect(path, skb);
+	path->ops->redirect(path, sk, skb);
 }
 
 static void xfrm6_dst_destroy(struct dst_entry *dst)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 71d6ecb65926..65b616ae1716 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -797,7 +797,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_put;
 	}
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	df |= (old_iph->frag_off & htons(IP_DF));
 
@@ -913,7 +913,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_put;
 	}
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
 	    !skb_is_gso(skb)) {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index a67bc31f49fd..c201b26879a1 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -432,7 +432,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
 		return;
 	dst = sctp_transport_dst_check(t);
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index e69e1a2175a4..a6b7ee9ce28a 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -249,7 +249,7 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
 		t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
 
 	if (dst) {
-		dst->ops->update_pmtu(dst, pmtu);
+		dst->ops->update_pmtu(dst, sk, NULL, pmtu);
 
 		dst = sctp_transport_dst_check(t);
 		if (!dst)
-- 
cgit v1.2.3


From 0c24604b68fc7810d429d6c3657b6f148270e528 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Jul 2012 01:41:30 +0000
Subject: tcp: implement RFC 5961 4.2

Implement the RFC 5691 mitigation against Blind
Reset attack using SYN bit.

Section 4.2 of RFC 5961 advises to send a Challenge ACK and drop
incoming packet, instead of resetting the session.

Add a new SNMP counter to count number of challenge acks sent
in response to SYN packets.
(netstat -s | grep TCPSYNChallenge)

Remove obsolete TCPAbortOnSyn, since we no longer abort a TCP session
because of a SYN flag.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h |  2 +-
 net/ipv4/proc.c      |  2 +-
 net/ipv4/tcp_input.c | 32 +++++++++++++++-----------------
 3 files changed, 17 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 673e0e928b2b..e5fcbd079e4a 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -208,7 +208,6 @@ enum
 	LINUX_MIB_TCPDSACKOFOSENT,		/* TCPDSACKOfoSent */
 	LINUX_MIB_TCPDSACKRECV,			/* TCPDSACKRecv */
 	LINUX_MIB_TCPDSACKOFORECV,		/* TCPDSACKOfoRecv */
-	LINUX_MIB_TCPABORTONSYN,		/* TCPAbortOnSyn */
 	LINUX_MIB_TCPABORTONDATA,		/* TCPAbortOnData */
 	LINUX_MIB_TCPABORTONCLOSE,		/* TCPAbortOnClose */
 	LINUX_MIB_TCPABORTONMEMORY,		/* TCPAbortOnMemory */
@@ -238,6 +237,7 @@ enum
 	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
 	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
+	LINUX_MIB_TCPSYNCHALLENGE,		/* TCPSYNChallenge */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3e8e78f12a38..2a5240b2ea61 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
 	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
 	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
-	SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
 	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
 	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
 	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -262,6 +261,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
 	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c841a8990377..8aaec5536111 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5270,8 +5270,8 @@ static void tcp_send_challenge_ack(struct sock *sk)
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
-static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
-			      const struct tcphdr *th, int syn_inerr)
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+				  const struct tcphdr *th, int syn_inerr)
 {
 	const u8 *hash_location;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -5323,20 +5323,22 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 	/* step 3: check security and precedence [ignored] */
 
-	/* step 4: Check for a SYN in window. */
-	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+	/* step 4: Check for a SYN
+	 * RFC 5691 4.2 : Send a challenge ack
+	 */
+	if (th->syn) {
 		if (syn_inerr)
 			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
-		tcp_reset(sk);
-		return -1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+		tcp_send_challenge_ack(sk);
+		goto discard;
 	}
 
-	return 1;
+	return true;
 
 discard:
 	__kfree_skb(skb);
-	return 0;
+	return false;
 }
 
 /*
@@ -5366,7 +5368,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			const struct tcphdr *th, unsigned int len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int res;
 
 	if (sk->sk_rx_dst) {
 		struct dst_entry *dst = sk->sk_rx_dst;
@@ -5555,9 +5556,8 @@ slow_path:
 	 *	Standard slow path.
 	 */
 
-	res = tcp_validate_incoming(sk, skb, th, 1);
-	if (res <= 0)
-		return -res;
+	if (!tcp_validate_incoming(sk, skb, th, 1))
+		return 0;
 
 step5:
 	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
@@ -5877,7 +5877,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int queued = 0;
-	int res;
 
 	tp->rx_opt.saw_tstamp = 0;
 
@@ -5932,9 +5931,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	res = tcp_validate_incoming(sk, skb, th, 0);
-	if (res <= 0)
-		return -res;
+	if (!tcp_validate_incoming(sk, skb, th, 0))
+		return 0;
 
 	/* step 5: check the ACK field */
 	if (th->ack) {
-- 
cgit v1.2.3


From 4895c771c7f006b4b90f9d6b1d2210939ba57b38 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 04:19:00 -0700
Subject: ipv4: Add FIB nexthop exceptions.

In a regime where we have subnetted route entries, we need a way to
store persistent storage about destination specific learned values
such as redirects and PMTU values.

This is implemented here via nexthop exceptions.

The initial implementation is a 2048 entry hash table with relaiming
starting at chain length 5.  A more sophisticated scheme can be
devised if that proves necessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  18 ++++
 net/ipv4/fib_semantics.c |  23 +++++
 net/ipv4/route.c         | 256 +++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 266 insertions(+), 31 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5697acefeba3..e9ee1ca07087 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -18,6 +18,7 @@
 
 #include <net/flow.h>
 #include <linux/seq_file.h>
+#include <linux/rcupdate.h>
 #include <net/fib_rules.h>
 #include <net/inetpeer.h>
 
@@ -46,6 +47,22 @@ struct fib_config {
 
 struct fib_info;
 
+struct fib_nh_exception {
+	struct fib_nh_exception __rcu	*fnhe_next;
+	__be32				fnhe_daddr;
+	u32				fnhe_pmtu;
+	u32				fnhe_gw;
+	unsigned long			fnhe_expires;
+	unsigned long			fnhe_stamp;
+};
+
+struct fnhe_hash_bucket {
+	struct fib_nh_exception __rcu	*chain;
+};
+
+#define FNHE_HASH_SIZE		2048
+#define FNHE_RECLAIM_DEPTH	5
+
 struct fib_nh {
 	struct net_device	*nh_dev;
 	struct hlist_node	nh_hash;
@@ -63,6 +80,7 @@ struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
+	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
 /*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d71bfbdc0bf4..1e09852df512 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
 	},
 };
 
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	int i;
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {
+		struct fib_nh_exception *fnhe;
+
+		fnhe = rcu_dereference(hash[i].chain);
+		while (fnhe) {
+			struct fib_nh_exception *next;
+			
+			next = rcu_dereference(fnhe->fnhe_next);
+			kfree(fnhe);
+
+			fnhe = next;
+		}
+	}
+	kfree(hash);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
 			dev_put(nexthop_nh->nh_dev);
+		if (nexthop_nh->nh_exceptions)
+			free_nh_exceptions(nexthop_nh);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b35d3bfc66cd..a5bd0b4acc61 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
+			     const struct iphdr *iph,
+			     int oif, u8 tos,
+			     u8 prot, u32 mark, int flow_flags)
+{
+	if (sk) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		oif = sk->sk_bound_dev_if;
+		mark = sk->sk_mark;
+		tos = RT_CONN_FLAGS(sk);
+		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+	}
+	flowi4_init_output(fl4, oif, mark, tos,
+			   RT_SCOPE_UNIVERSE, prot,
+			   flow_flags,
+			   iph->daddr, iph->saddr, 0, 0);
+}
+
+static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int oif = skb->dev->ifindex;
+	u8 tos = RT_TOS(iph->tos);
+	u8 prot = iph->protocol;
+	u32 mark = skb->mark;
+
+	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+}
+
+static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+			   inet_sk_flowi_flags(sk),
+			   daddr, inet->inet_saddr, 0, 0);
+	rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	if (skb)
+		build_skb_flow_key(fl4, skb, sk);
+	else
+		build_sk_flow_key(fl4, sk);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
+
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+{
+	struct fib_nh_exception *fnhe, *oldest;
+
+	oldest = rcu_dereference(hash->chain);
+	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
+	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+			oldest = fnhe;
+	}
+	return oldest;
+}
+
+static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	struct fib_nh_exception *fnhe;
+	int depth;
+	u32 hval;
+
+	if (!hash) {
+		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
+						   GFP_ATOMIC);
+		if (!hash)
+			return NULL;
+	}
+
+	hval = (__force u32) daddr;
+	hval ^= (hval >> 11) ^ (hval >> 22);
+	hash += hval;
+
+	depth = 0;
+	for (fnhe = rcu_dereference(hash->chain); fnhe;
+	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+		if (fnhe->fnhe_daddr == daddr)
+			goto out;
+		depth++;
+	}
+
+	if (depth > FNHE_RECLAIM_DEPTH) {
+		fnhe = fnhe_oldest(hash + hval, daddr);
+		goto out_daddr;
+	}
+	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+	if (!fnhe)
+		return NULL;
+
+	fnhe->fnhe_next = hash->chain;
+	rcu_assign_pointer(hash->chain, fnhe);
+
+out_daddr:
+	fnhe->fnhe_daddr = daddr;
+out:
+	fnhe->fnhe_stamp = jiffies;
+	return fnhe;
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev;
+	struct fib_result res;
 	struct neighbour *n;
-	struct rtable *rt;
 	struct net *net;
 
 	switch (icmp_hdr(skb)->code & 7) {
@@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 		return;
 	}
 
-	rt = (struct rtable *) dst;
 	if (rt->rt_gateway != old_gw)
 		return;
 
@@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 			goto reject_redirect;
 	}
 
-	n = ipv4_neigh_lookup(dst, NULL, &new_gw);
+	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 	if (n) {
 		if (!(n->nud_state & NUD_VALID)) {
 			neigh_event_send(n, NULL);
 		} else {
+			if (fib_lookup(net, fl4, &res) == 0) {
+				struct fib_nh *nh = &FIB_RES_NH(res);
+				struct fib_nh_exception *fnhe;
+
+				spin_lock_bh(&fnhe_lock);
+				fnhe = find_or_create_fnhe(nh, fl4->daddr);
+				if (fnhe)
+					fnhe->fnhe_gw = new_gw;
+				spin_unlock_bh(&fnhe_lock);
+			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
@@ -1349,6 +1474,17 @@ reject_redirect:
 	;
 }
 
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	rt = (struct rtable *) dst;
+
+	ip_rt_build_flow_key(&fl4, sk, skb);
+	__ip_do_redirect(rt, skb, &fl4);
+}
+
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
@@ -1508,33 +1644,51 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-			      struct sk_buff *skb, u32 mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
-	struct rtable *rt = (struct rtable *) dst;
-
-	dst_confirm(dst);
+	struct fib_result res;
 
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+		struct fib_nh *nh = &FIB_RES_NH(res);
+		struct fib_nh_exception *fnhe;
+
+		spin_lock_bh(&fnhe_lock);
+		fnhe = find_or_create_fnhe(nh, fl4->daddr);
+		if (fnhe) {
+			fnhe->fnhe_pmtu = mtu;
+			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
+		}
+		spin_unlock_bh(&fnhe_lock);
+	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }
 
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct flowi4 fl4;
+
+	ip_rt_build_flow_key(&fl4, sk, skb);
+	__ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 		      int oif, u32 mark, u8 protocol, int flow_flags)
 {
-	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags,
-			   iph->daddr, iph->saddr, 0, 0);
+	__build_flow_key(&fl4, NULL, iph, oif,
+			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
 		ip_rt_put(rt);
 	}
 }
@@ -1542,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 
 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 {
-	const struct inet_sock *inet = inet_sk(sk);
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
 
-	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
-				sk->sk_bound_dev_if, sk->sk_mark,
-				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-				inet_sk_flowi_flags(sk));
+	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
+		ip_rt_put(rt);
+	}
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
 void ipv4_redirect(struct sk_buff *skb, struct net *net,
 		   int oif, u32 mark, u8 protocol, int flow_flags)
 {
-	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
+	__build_flow_key(&fl4, NULL, iph, oif,
+			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_do_redirect(&rt->dst, NULL, skb);
+		__ip_do_redirect(rt, skb, &fl4);
 		ip_rt_put(rt);
 	}
 }
@@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
 
 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 {
-	const struct inet_sock *inet = inet_sk(sk);
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
 
-	return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
-			     sk->sk_mark,
-			     inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-			     inet_sk_flowi_flags(sk));
+	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_do_redirect(rt, skb, &fl4);
+		ip_rt_put(rt);
+	}
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
 
@@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
+static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+{
+	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	struct fib_nh_exception *fnhe;
+	u32 hval;
+
+	hval = (__force u32) daddr;
+	hval ^= (hval >> 11) ^ (hval >> 22);
+
+	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+		if (fnhe->fnhe_daddr == daddr) {
+			if (fnhe->fnhe_pmtu) {
+				unsigned long expires = fnhe->fnhe_expires;
+				unsigned long diff = jiffies - expires;
+
+				if (time_before(jiffies, expires)) {
+					rt->rt_pmtu = fnhe->fnhe_pmtu;
+					dst_set_expires(&rt->dst, diff);
+				}
+			}
+			if (fnhe->fnhe_gw)
+				rt->rt_gateway = fnhe->fnhe_gw;
+			fnhe->fnhe_stamp = jiffies;
+			break;
+		}
+	}
+}
+
 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 			   const struct fib_result *res,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
 	if (fi) {
-		if (FIB_RES_GW(*res) &&
-		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-			rt->rt_gateway = FIB_RES_GW(*res);
+		struct fib_nh *nh = &FIB_RES_NH(*res);
+
+		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+			rt->rt_gateway = nh->nh_gw;
+		if (unlikely(nh->nh_exceptions))
+			rt_bind_exception(rt, nh, fl4->daddr);
 		rt_init_metrics(rt, fl4, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
-- 
cgit v1.2.3


From d3a25c980fc231238256f8d80816367674e5caaf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 13:23:08 -0700
Subject: ipv4: Fix nexthop exception hash computation.

Need to mask it with (FNHE_HASH_SIZE - 1).

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a5bd0b4acc61..812e4447a223 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1347,6 +1347,16 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be3
 	return oldest;
 }
 
+static inline u32 fnhe_hashfun(__be32 daddr)
+{
+	u32 hval;
+
+	hval = (__force u32) daddr;
+	hval ^= (hval >> 11) ^ (hval >> 22);
+
+	return hval & (FNHE_HASH_SIZE - 1);
+}
+
 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -1361,8 +1371,7 @@ static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 da
 			return NULL;
 	}
 
-	hval = (__force u32) daddr;
-	hval ^= (hval >> 11) ^ (hval >> 22);
+	hval = fnhe_hashfun(daddr);
 	hash += hval;
 
 	depth = 0;
@@ -1890,8 +1899,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 	struct fib_nh_exception *fnhe;
 	u32 hval;
 
-	hval = (__force u32) daddr;
-	hval ^= (hval >> 11) ^ (hval >> 22);
+	hval = fnhe_hashfun(daddr);
 
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-- 
cgit v1.2.3


From 5abf7f7e0f6bdbfcac737f636497d7016d9507eb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Jul 2012 22:42:13 +0200
Subject: ipv4: fix rcu splat

free_nh_exceptions() should use rcu_dereference_protected(..., 1)
since its called after one RCU grace period.

Also add some const-ification in recent code.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c        |  4 ++--
 net/ipv4/inet_connection_sock.c |  4 ++--
 net/ipv4/route.c                | 13 +++++++------
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1e09852df512..2b57d768240d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -148,11 +148,11 @@ static void free_nh_exceptions(struct fib_nh *nh)
 	for (i = 0; i < FNHE_HASH_SIZE; i++) {
 		struct fib_nh_exception *fnhe;
 
-		fnhe = rcu_dereference(hash[i].chain);
+		fnhe = rcu_dereference_protected(hash[i].chain, 1);
 		while (fnhe) {
 			struct fib_nh_exception *next;
 			
-			next = rcu_dereference(fnhe->fnhe_next);
+			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
 			kfree(fnhe);
 
 			fnhe = next;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3ea465286a39..c7a4de05ca04 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -806,8 +806,8 @@ EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 
 static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
 {
-	struct inet_sock *inet = inet_sk(sk);
-	struct ip_options_rcu *inet_opt;
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ip_options_rcu *inet_opt;
 	__be32 daddr = inet->inet_daddr;
 	struct flowi4 *fl4;
 	struct rtable *rt;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 812e4447a223..f67e70236728 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,7 +1275,7 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
+static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 			     const struct iphdr *iph,
 			     int oif, u8 tos,
 			     u8 prot, u32 mark, int flow_flags)
@@ -1294,7 +1294,8 @@ static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
 			   iph->daddr, iph->saddr, 0, 0);
 }
 
-static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+			       const struct sock *sk)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	int oif = skb->dev->ifindex;
@@ -1305,10 +1306,10 @@ static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct s
 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 }
 
-static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	struct ip_options_rcu *inet_opt;
+	const struct ip_options_rcu *inet_opt;
 	__be32 daddr = inet->inet_daddr;
 
 	rcu_read_lock();
@@ -1323,8 +1324,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
 	rcu_read_unlock();
 }
 
-static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
-				 struct sk_buff *skb)
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+				 const struct sk_buff *skb)
 {
 	if (skb)
 		build_skb_flow_key(fl4, skb, sk);
-- 
cgit v1.2.3


From 89d7ae34cdda4195809a5a987f697a517a2a3177 Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Tue, 17 Jul 2012 11:07:47 +0000
Subject: cipso: don't follow a NULL pointer when setsockopt() is called

As reported by Alan Cox, and verified by Lin Ming, when a user
attempts to add a CIPSO option to a socket using the CIPSO_V4_TAG_LOCAL
tag the kernel dies a terrible death when it attempts to follow a NULL
pointer (the skb argument to cipso_v4_validate() is NULL when called via
the setsockopt() syscall).

This patch fixes this by first checking to ensure that the skb is
non-NULL before using it to find the incoming network interface.  In
the unlikely case where the skb is NULL and the user attempts to add
a CIPSO option with the _TAG_LOCAL tag we return an error as this is
not something we want to allow.

A simple reproducer, kindly supplied by Lin Ming, although you must
have the CIPSO DOI #3 configure on the system first or you will be
caught early in cipso_v4_validate():

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <linux/ip.h>
	#include <linux/in.h>
	#include <string.h>

	struct local_tag {
		char type;
		char length;
		char info[4];
	};

	struct cipso {
		char type;
		char length;
		char doi[4];
		struct local_tag local;
	};

	int main(int argc, char **argv)
	{
		int sockfd;
		struct cipso cipso = {
			.type = IPOPT_CIPSO,
			.length = sizeof(struct cipso),
			.local = {
				.type = 128,
				.length = sizeof(struct local_tag),
			},
		};

		memset(cipso.doi, 0, 4);
		cipso.doi[3] = 3;

		sockfd = socket(AF_INET, SOCK_DGRAM, 0);
		#define SOL_IP 0
		setsockopt(sockfd, SOL_IP, IP_OPTIONS,
			&cipso, sizeof(struct cipso));

		return 0;
	}

CC: Lin Ming <mlin@ss.pku.edu.cn>
Reported-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Paul Moore <pmoore@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/cipso_ipv4.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index c48adc565e92..667c1d4ca984 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1725,8 +1725,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
 		case CIPSO_V4_TAG_LOCAL:
 			/* This is a non-standard tag that we only allow for
 			 * local connections, so if the incoming interface is
-			 * not the loopback device drop the packet. */
-			if (!(skb->dev->flags & IFF_LOOPBACK)) {
+			 * not the loopback device drop the packet. Further,
+			 * there is no legitimate reason for setting this from
+			 * userspace so reject it if skb is NULL. */
+			if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
 				err_offset = opt_iter;
 				goto validate_return_locked;
 			}
-- 
cgit v1.2.3


From e371589917011efe6ff8c7dfb4e9e81934ac5855 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Jul 2012 12:29:30 +0000
Subject: tcp: refine SYN handling in tcp_validate_incoming

Followup of commit 0c24604b68fc (tcp: implement RFC 5961 4.2)

As reported by Vijay Subramanian, we should send a challenge ACK
instead of a dup ack if a SYN flag is set on a packet received out of
window.

This permits the ratelimiting to work as intended, and to increase
correct SNMP counters.

Suggested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8aaec5536111..fdd49f1b7a52 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5296,8 +5296,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		 * an acknowledgment should be sent in reply (unless the RST
 		 * bit is set, if so drop the segment and return)".
 		 */
-		if (!th->rst)
+		if (!th->rst) {
+			if (th->syn)
+				goto syn_challenge;
 			tcp_send_dupack(sk, skb);
+		}
 		goto discard;
 	}
 
@@ -5327,6 +5330,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	 * RFC 5691 4.2 : Send a challenge ack
 	 */
 	if (th->syn) {
+syn_challenge:
 		if (syn_inerr)
 			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
-- 
cgit v1.2.3


From eb8637cd4a0d651cf4fcc1559231facee829a0ac Mon Sep 17 00:00:00 2001
From: Saurabh <saurabh.mohan@vyatta.com>
Date: Tue, 17 Jul 2012 09:44:49 +0000
Subject: net/ipv4: VTI support rx-path hook in xfrm4_mode_tunnel.

Incorporated David and Steffen's comments.
Add hook for rx-path xfmr4_mode_tunnel for VTI tunnel module.

Signed-off-by: Saurabh Mohan <saurabh.mohan@vyatta.com>
Reviewed-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h           |  2 ++
 net/ipv4/xfrm4_mode_tunnel.c | 68 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

(limited to 'net/ipv4')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 17acbc92476d..d9509eb29b80 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1475,6 +1475,8 @@ extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_output_finish(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
+extern int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler);
+extern int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler);
 extern int xfrm6_extract_header(struct sk_buff *skb);
 extern int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb);
 extern int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ed4bf11ef9f4..ddee0a099a2c 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,6 +15,65 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
+/* Informational hook. The decap is still done here. */
+static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
+
+int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
+{
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
+	int ret = -EEXIST;
+	int priority = handler->priority;
+
+	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
+
+	for (pprev = &rcv_notify_handlers;
+	     (t = rcu_dereference_protected(*pprev,
+	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority > priority)
+			break;
+		if (t->priority == priority)
+			goto err;
+
+	}
+
+	handler->next = *pprev;
+	rcu_assign_pointer(*pprev, handler);
+
+	ret = 0;
+
+err:
+	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
+
+int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
+{
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
+	int ret = -ENOENT;
+
+	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
+	for (pprev = &rcv_notify_handlers;
+	     (t = rcu_dereference_protected(*pprev,
+	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
+			*pprev = handler->next;
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
+
 static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
 {
 	struct iphdr *inner_iph = ipip_hdr(skb);
@@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
+#define for_each_input_rcu(head, handler)	\
+	for (handler = rcu_dereference(head);	\
+	     handler != NULL;			\
+	     handler = rcu_dereference(handler->next))
+
 static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 {
+	struct xfrm_tunnel *handler;
 	int err = -EINVAL;
 
 	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto out;
 
+	for_each_input_rcu(rcv_notify_handlers, handler)
+		handler->handler(skb);
+
 	if (skb_cloned(skb) &&
 	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
 		goto out;
-- 
cgit v1.2.3


From 1181412c1a671ed4e8fb1736f17e6ec617c68059 Mon Sep 17 00:00:00 2001
From: Saurabh <saurabh.mohan@vyatta.com>
Date: Tue, 17 Jul 2012 09:44:54 +0000
Subject: net/ipv4: VTI support new module for ip_vti.

New VTI tunnel kernel module, Kconfig and Makefile changes.

Signed-off-by: Saurabh Mohan <saurabh.mohan@vyatta.com>
Reviewed-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_tunnel.h |  14 +
 net/ipv4/Kconfig          |  11 +
 net/ipv4/Makefile         |   1 +
 net/ipv4/ip_vti.c         | 956 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 982 insertions(+)
 create mode 100644 net/ipv4/ip_vti.c

(limited to 'net/ipv4')

diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 16b92d008bed..5efff60b6f56 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -80,4 +80,18 @@ enum {
 
 #define IFLA_GRE_MAX	(__IFLA_GRE_MAX - 1)
 
+/* VTI-mode i_flags */
+#define VTI_ISVTI 0x0001
+
+enum {
+	IFLA_VTI_UNSPEC,
+	IFLA_VTI_LINK,
+	IFLA_VTI_IKEY,
+	IFLA_VTI_OKEY,
+	IFLA_VTI_LOCAL,
+	IFLA_VTI_REMOTE,
+	__IFLA_VTI_MAX,
+};
+
+#define IFLA_VTI_MAX	(__IFLA_VTI_MAX - 1)
 #endif /* _IF_TUNNEL_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 20f1cb5c8aba..5a19aeb86094 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -310,6 +310,17 @@ config SYN_COOKIES
 
 	  If unsure, say N.
 
+config NET_IPVTI
+	tristate "Virtual (secure) IP: tunneling"
+	select INET_TUNNEL
+	depends on INET_XFRM_MODE_TUNNEL
+	---help---
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This can be used with xfrm mode tunnel to give
+	  the notion of a secure tunnel for IPSEC and then use routing protocol
+	  on top.
+
 config INET_AH
 	tristate "IP: AH transformation"
 	select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5a23e8b37106..a677d804e53e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
 obj-$(CONFIG_INET_ESP) += esp4.o
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000000..c41b5c359936
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,956 @@
+/*
+ *	Linux NET3: IP/IP protocol decoder modified to support
+ *		    virtual tunnel interface
+ *
+ *	Authors:
+ *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+   This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define HASH_SIZE  16
+#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static int vti_net_id __read_mostly;
+struct vti_net {
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_wc[1];
+	struct ip_tunnel **tunnels[4];
+
+	struct net_device *fb_tunnel_dev;
+};
+
+static int vti_fb_tunnel_init(struct net_device *dev);
+static int vti_tunnel_init(struct net_device *dev);
+static void vti_tunnel_setup(struct net_device *dev);
+static void vti_dev_free(struct net_device *dev);
+static int vti_tunnel_bind_dev(struct net_device *dev);
+
+/* Locking : hash tables are protected by RCU and RTNL */
+
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	tx_packets;
+	u64	tx_bytes;
+	struct	u64_stats_sync	syncp;
+};
+
+#define VTI_XMIT(stats1, stats2) do {				\
+	int err;						\
+	int pkt_len = skb->len;					\
+	err = dst_output(skb);					\
+	if (net_xmit_eval(err) == 0) {				\
+		u64_stats_update_begin(&(stats1)->syncp);	\
+		(stats1)->tx_bytes += pkt_len;			\
+		(stats1)->tx_packets++;				\
+		u64_stats_update_end(&(stats1)->syncp);		\
+	} else {						\
+		(stats2)->tx_errors++;				\
+		(stats2)->tx_aborted_errors++;			\
+	}							\
+} while (0)
+
+
+static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
+						 struct rtnl_link_stats64 *tot)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_bh(&tstats->syncp);
+			rx_packets = tstats->rx_packets;
+			tx_packets = tstats->tx_packets;
+			rx_bytes = tstats->rx_bytes;
+			tx_bytes = tstats->tx_bytes;
+		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+		tot->rx_packets += rx_packets;
+		tot->tx_packets += tx_packets;
+		tot->rx_bytes   += rx_bytes;
+		tot->tx_bytes   += tx_bytes;
+	}
+
+	tot->multicast = dev->stats.multicast;
+	tot->rx_crc_errors = dev->stats.rx_crc_errors;
+	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+	tot->rx_length_errors = dev->stats.rx_length_errors;
+	tot->rx_errors = dev->stats.rx_errors;
+	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+	tot->tx_dropped = dev->stats.tx_dropped;
+	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+	tot->tx_errors = dev->stats.tx_errors;
+
+	return tot;
+}
+
+static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
+					   __be32 remote, __be32 local)
+{
+	unsigned h0 = HASH(remote);
+	unsigned h1 = HASH(local);
+	struct ip_tunnel *t;
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
+		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
+		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+			return t;
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_wc[0])
+		if (t && (t->dev->flags&IFF_UP))
+			return t;
+	return NULL;
+}
+
+static struct ip_tunnel **__vti_bucket(struct vti_net *ipn,
+				       struct ip_tunnel_parm *parms)
+{
+	__be32 remote = parms->iph.daddr;
+	__be32 local = parms->iph.saddr;
+	unsigned h = 0;
+	int prio = 0;
+
+	if (remote) {
+		prio |= 2;
+		h ^= HASH(remote);
+	}
+	if (local) {
+		prio |= 1;
+		h ^= HASH(local);
+	}
+	return &ipn->tunnels[prio][h];
+}
+
+static inline struct ip_tunnel **vti_bucket(struct vti_net *ipn,
+					    struct ip_tunnel *t)
+{
+	return __vti_bucket(ipn, &t->parms);
+}
+
+static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
+{
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = vti_bucket(ipn, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
+			break;
+		}
+	}
+}
+
+static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
+{
+	struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
+
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+	rcu_assign_pointer(*tp, t);
+}
+
+static struct ip_tunnel *vti_tunnel_locate(struct net *net,
+					   struct ip_tunnel_parm *parms,
+					   int create)
+{
+	__be32 remote = parms->iph.daddr;
+	__be32 local = parms->iph.saddr;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+
+	for (tp = __vti_bucket(ipn, parms);
+	     (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next) {
+		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+			return t;
+	}
+	if (!create)
+		return NULL;
+
+	if (parms->name[0])
+		strlcpy(name, parms->name, IFNAMSIZ);
+	else
+		strcpy(name, "vti%d");
+
+	dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	nt = netdev_priv(dev);
+	nt->parms = *parms;
+	dev->rtnl_link_ops = &vti_link_ops;
+
+	vti_tunnel_bind_dev(dev);
+
+	if (register_netdevice(dev) < 0)
+		goto failed_free;
+
+	dev_hold(dev);
+	vti_tunnel_link(ipn, nt);
+	return nt;
+
+failed_free:
+	free_netdev(dev);
+	return NULL;
+}
+
+static void vti_tunnel_uninit(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+
+	vti_tunnel_unlink(ipn, netdev_priv(dev));
+	dev_put(dev);
+}
+
+static int vti_err(struct sk_buff *skb, u32 info)
+{
+
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means, that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 */
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct ip_tunnel *t;
+	int err;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return 0;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return 0;
+		default:
+			/* All others are translated to HOST_UNREACH. */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return 0;
+		break;
+	}
+
+	err = -ENOENT;
+
+	rcu_read_lock();
+	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
+	if (t == NULL)
+		goto out;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 t->parms.link, 0, IPPROTO_IPIP, 0);
+		err = 0;
+		goto out;
+	}
+
+	err = 0;
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		goto out;
+
+	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+		t->err_count++;
+	else
+		t->err_count = 1;
+	t->err_time = jiffies;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+/* We dont digest the packet therefore let the packet pass */
+static int vti_rcv(struct sk_buff *skb)
+{
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	rcu_read_lock();
+	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		u64_stats_update_begin(&tstats->syncp);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+		u64_stats_update_end(&tstats->syncp);
+
+		skb->dev = tunnel->dev;
+		rcu_read_unlock();
+		return 1;
+	}
+	rcu_read_unlock();
+
+	return -1;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct pcpu_tstats *tstats;
+	struct iphdr  *tiph = &tunnel->parms.iph;
+	u8     tos;
+	struct rtable *rt;		/* Route to the other host */
+	struct net_device *tdev;	/* Device to other host */
+	struct iphdr  *old_iph = ip_hdr(skb);
+	__be32 dst = tiph->daddr;
+	struct flowi4 fl4;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto tx_error;
+
+	tos = old_iph->tos;
+
+	memset(&fl4, 0, sizeof(fl4));
+	flowi4_init_output(&fl4, tunnel->parms.link,
+			   htonl(tunnel->parms.i_key), RT_TOS(tos),
+			   RT_SCOPE_UNIVERSE,
+			   IPPROTO_IPIP, 0,
+			   dst, tiph->saddr, 0, 0);
+	rt = ip_route_output_key(dev_net(dev), &fl4);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+	/* if there is no transform then this tunnel is not functional.
+	 * Or if the xfrm is not mode tunnel.
+	 */
+	if (!rt->dst.xfrm ||
+	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+	tdev = rt->dst.dev;
+
+	if (tdev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	if (tunnel->err_count > 0) {
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	nf_reset(skb);
+	skb->dev = skb_dst(skb)->dev;
+
+	tstats = this_cpu_ptr(dev->tstats);
+	VTI_XMIT(tstats, &dev->stats);
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev->stats.tx_errors++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int vti_tunnel_bind_dev(struct net_device *dev)
+{
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	struct iphdr *iph;
+
+	tunnel = netdev_priv(dev);
+	iph = &tunnel->parms.iph;
+
+	if (iph->daddr) {
+		struct rtable *rt;
+		struct flowi4 fl4;
+		memset(&fl4, 0, sizeof(fl4));
+		flowi4_init_output(&fl4, tunnel->parms.link,
+				   htonl(tunnel->parms.i_key),
+				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
+				   IPPROTO_IPIP, 0,
+				   iph->daddr, iph->saddr, 0, 0);
+		rt = ip_route_output_key(dev_net(dev), &fl4);
+		if (!IS_ERR(rt)) {
+			tdev = rt->dst.dev;
+			ip_rt_put(rt);
+		}
+		dev->flags |= IFF_POINTOPOINT;
+	}
+
+	if (!tdev && tunnel->parms.link)
+		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+	if (tdev) {
+		dev->hard_header_len = tdev->hard_header_len +
+				       sizeof(struct iphdr);
+		dev->mtu = tdev->mtu;
+	}
+	dev->iflink = tunnel->parms.link;
+	return dev->mtu;
+}
+
+static int
+vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *t;
+	struct net *net = dev_net(dev);
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		t = NULL;
+		if (dev == ipn->fb_tunnel_dev) {
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
+					   sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = vti_tunnel_locate(net, &p, 0);
+		}
+		if (t == NULL)
+			t = netdev_priv(dev);
+		memcpy(&p, &t->parms, sizeof(p));
+		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.o_flags |= GRE_KEY;
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			err = -EFAULT;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		err = -EINVAL;
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+		    p.iph.ihl != 5)
+			goto done;
+
+		t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
+
+		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				if (((dev->flags&IFF_POINTOPOINT) &&
+				    !p.iph.daddr) ||
+				    (!(dev->flags&IFF_POINTOPOINT) &&
+				    p.iph.daddr)) {
+					err = -EINVAL;
+					break;
+				}
+				t = netdev_priv(dev);
+				vti_tunnel_unlink(ipn, t);
+				synchronize_net();
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				t->parms.i_key = p.i_key;
+				t->parms.o_key = p.o_key;
+				t->parms.iph.protocol = IPPROTO_IPIP;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				vti_tunnel_link(ipn, t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.i_key = p.i_key;
+				t->parms.o_key = p.o_key;
+				if (t->parms.link != p.link) {
+					t->parms.link = p.link;
+					vti_tunnel_bind_dev(dev);
+					netdev_state_change(dev);
+				}
+			}
+			p.i_flags |= GRE_KEY | VTI_ISVTI;
+			p.o_flags |= GRE_KEY;
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
+					 sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == ipn->fb_tunnel_dev) {
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
+					   sizeof(p)))
+				goto done;
+			err = -ENOENT;
+
+			t = vti_tunnel_locate(net, &p, 0);
+			if (t == NULL)
+				goto done;
+			err = -EPERM;
+			if (t->dev == ipn->fb_tunnel_dev)
+				goto done;
+			dev = t->dev;
+		}
+		unregister_netdevice(dev);
+		err = 0;
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu < 68 || new_mtu > 0xFFF8)
+		return -EINVAL;
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+	.ndo_init	= vti_tunnel_init,
+	.ndo_uninit	= vti_tunnel_uninit,
+	.ndo_start_xmit	= vti_tunnel_xmit,
+	.ndo_do_ioctl	= vti_tunnel_ioctl,
+	.ndo_change_mtu	= vti_tunnel_change_mtu,
+	.ndo_get_stats64 = vti_get_stats64,
+};
+
+static void vti_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+	dev->netdev_ops		= &vti_netdev_ops;
+	dev->destructor		= vti_dev_free;
+
+	dev->type		= ARPHRD_TUNNEL;
+	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
+	dev->mtu		= ETH_DATA_LEN;
+	dev->flags		= IFF_NOARP;
+	dev->iflink		= 0;
+	dev->addr_len		= 4;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->features		|= NETIF_F_LLTX;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+}
+
+static int vti_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+	struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_IPIP;
+	iph->ihl		= 5;
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	dev_hold(dev);
+	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+	return 0;
+}
+
+static struct xfrm_tunnel vti_handler __read_mostly = {
+	.handler	=	vti_rcv,
+	.err_handler	=	vti_err,
+	.priority	=	1,
+};
+
+static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
+{
+	int prio;
+
+	for (prio = 1; prio < 4; prio++) {
+		int h;
+		for (h = 0; h < HASH_SIZE; h++) {
+			struct ip_tunnel *t;
+
+			t = rtnl_dereference(ipn->tunnels[prio][h]);
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = rtnl_dereference(t->next);
+			}
+		}
+	}
+}
+
+static int __net_init vti_init_net(struct net *net)
+{
+	int err;
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+
+	ipn->tunnels[0] = ipn->tunnels_wc;
+	ipn->tunnels[1] = ipn->tunnels_l;
+	ipn->tunnels[2] = ipn->tunnels_r;
+	ipn->tunnels[3] = ipn->tunnels_r_l;
+
+	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
+					  "ip_vti0",
+					  vti_tunnel_setup);
+	if (!ipn->fb_tunnel_dev) {
+		err = -ENOMEM;
+		goto err_alloc_dev;
+	}
+	dev_net_set(ipn->fb_tunnel_dev, net);
+
+	err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
+	if (err)
+		goto err_reg_dev;
+	ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
+
+	err = register_netdev(ipn->fb_tunnel_dev);
+	if (err)
+		goto err_reg_dev;
+	return 0;
+
+err_reg_dev:
+	vti_dev_free(ipn->fb_tunnel_dev);
+err_alloc_dev:
+	/* nothing */
+	return err;
+}
+
+static void __net_exit vti_exit_net(struct net *net)
+{
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+	LIST_HEAD(list);
+
+	rtnl_lock();
+	vti_destroy_tunnels(ipn, &list);
+	unregister_netdevice_many(&list);
+	rtnl_unlock();
+}
+
+static struct pernet_operations vti_net_ops = {
+	.init = vti_init_net,
+	.exit = vti_exit_net,
+	.id   = &vti_net_id,
+	.size = sizeof(struct vti_net),
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	return 0;
+}
+
+static void vti_netlink_parms(struct nlattr *data[],
+			      struct ip_tunnel_parm *parms)
+{
+	memset(parms, 0, sizeof(*parms));
+
+	parms->iph.protocol = IPPROTO_IPIP;
+
+	if (!data)
+		return;
+
+	if (data[IFLA_VTI_LINK])
+		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+	if (data[IFLA_VTI_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+	if (data[IFLA_VTI_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+	if (data[IFLA_VTI_LOCAL])
+		parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
+
+	if (data[IFLA_VTI_REMOTE])
+		parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
+
+}
+
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ip_tunnel *nt;
+	struct net *net = dev_net(dev);
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+	int mtu;
+	int err;
+
+	nt = netdev_priv(dev);
+	vti_netlink_parms(data, &nt->parms);
+
+	if (vti_tunnel_locate(net, &nt->parms, 0))
+		return -EEXIST;
+
+	mtu = vti_tunnel_bind_dev(dev);
+	if (!tb[IFLA_MTU])
+		dev->mtu = mtu;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto out;
+
+	dev_hold(dev);
+	vti_tunnel_link(ipn, nt);
+
+out:
+	return err;
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+			  struct nlattr *data[])
+{
+	struct ip_tunnel *t, *nt;
+	struct net *net = dev_net(dev);
+	struct vti_net *ipn = net_generic(net, vti_net_id);
+	struct ip_tunnel_parm p;
+	int mtu;
+
+	if (dev == ipn->fb_tunnel_dev)
+		return -EINVAL;
+
+	nt = netdev_priv(dev);
+	vti_netlink_parms(data, &p);
+
+	t = vti_tunnel_locate(net, &p, 0);
+
+	if (t) {
+		if (t->dev != dev)
+			return -EEXIST;
+	} else {
+		t = nt;
+
+		vti_tunnel_unlink(ipn, t);
+		t->parms.iph.saddr = p.iph.saddr;
+		t->parms.iph.daddr = p.iph.daddr;
+		t->parms.i_key = p.i_key;
+		t->parms.o_key = p.o_key;
+		if (dev->type != ARPHRD_ETHER) {
+			memcpy(dev->dev_addr, &p.iph.saddr, 4);
+			memcpy(dev->broadcast, &p.iph.daddr, 4);
+		}
+		vti_tunnel_link(ipn, t);
+		netdev_state_change(dev);
+	}
+
+	if (t->parms.link != p.link) {
+		t->parms.link = p.link;
+		mtu = vti_tunnel_bind_dev(dev);
+		if (!tb[IFLA_MTU])
+			dev->mtu = mtu;
+		netdev_state_change(dev);
+	}
+
+	return 0;
+}
+
+static size_t vti_get_size(const struct net_device *dev)
+{
+	return
+		/* IFLA_VTI_LINK */
+		nla_total_size(4) +
+		/* IFLA_VTI_IKEY */
+		nla_total_size(4) +
+		/* IFLA_VTI_OKEY */
+		nla_total_size(4) +
+		/* IFLA_VTI_LOCAL */
+		nla_total_size(4) +
+		/* IFLA_VTI_REMOTE */
+		nla_total_size(4) +
+		0;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+
+	nla_put_u32(skb, IFLA_VTI_LINK, p->link);
+	nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
+	nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
+	nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
+	nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+
+	return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
+	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
+	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
+	[IFLA_VTI_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+	.kind		= "vti",
+	.maxtype	= IFLA_VTI_MAX,
+	.policy		= vti_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= vti_tunnel_setup,
+	.validate	= vti_tunnel_validate,
+	.newlink	= vti_newlink,
+	.changelink	= vti_changelink,
+	.get_size	= vti_get_size,
+	.fill_info	= vti_fill_info,
+};
+
+static int __init vti_init(void)
+{
+	int err;
+
+	pr_info("IPv4 over IPSec tunneling driver\n");
+
+	err = register_pernet_device(&vti_net_ops);
+	if (err < 0)
+		return err;
+	err = xfrm4_mode_tunnel_input_register(&vti_handler);
+	if (err < 0) {
+		unregister_pernet_device(&vti_net_ops);
+		pr_info(KERN_INFO "vti init: can't register tunnel\n");
+	}
+
+	err = rtnl_link_register(&vti_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+	return err;
+
+rtnl_link_failed:
+	xfrm4_mode_tunnel_input_deregister(&vti_handler);
+	unregister_pernet_device(&vti_net_ops);
+	return err;
+}
+
+static void __exit vti_fini(void)
+{
+	rtnl_link_unregister(&vti_link_ops);
+	if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+		pr_info("vti close: can't deregister tunnel\n");
+
+	unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
-- 
cgit v1.2.3


From ddbe503203855939946430e39bae58de11b70b69 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Jul 2012 08:11:12 +0000
Subject: ipv6: add ipv6_addr_hash() helper

Introduce ipv6_addr_hash() helper doing a XOR on all bits
of an IPv6 address, with an optimized x86_64 version.

Use it in flow dissector, as suggested by Andrew McGregor,
to reduce hash collision probabilities in fq_codel (and other
users of flow dissector)

Use it in ip6_tunnel.c and use more bit shuffling, as suggested
by David Laight, as existing hash was ignoring most of them.

Use it in sunrpc and use more bit shuffling, using hash_32().

Use it in net/ipv6/addrconf.c, using hash_32() as well.

As a cleanup, use it in net/ipv4/tcp_metrics.c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrew McGregor <andrewmcgr@gmail.com>
Cc: Dave Taht <dave.taht@gmail.com>
Cc: Tom Herbert <therbert@google.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h    |  3 ++-
 include/net/ipv6.h        | 13 +++++++++++++
 net/core/flow_dissector.c |  5 +++--
 net/ipv4/tcp_metrics.c    | 15 +++------------
 net/ipv6/addrconf.c       | 21 ++++++++-------------
 net/ipv6/ip6_tunnel.c     | 20 ++++++++++++--------
 net/sunrpc/svcauth_unix.c | 22 ++++------------------
 7 files changed, 45 insertions(+), 54 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f2b801c4b555..089a09d001d1 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -46,7 +46,8 @@ struct prefix_info {
 #include <net/if_inet6.h>
 #include <net/ipv6.h>
 
-#define IN6_ADDR_HSIZE		16
+#define IN6_ADDR_HSIZE_SHIFT	4
+#define IN6_ADDR_HSIZE		(1 << IN6_ADDR_HSIZE_SHIFT)
 
 extern int			addrconf_init(void);
 extern void			addrconf_cleanup(void);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f695f39e8926..01c34b363a34 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -419,6 +419,19 @@ static inline bool ipv6_addr_any(const struct in6_addr *a)
 #endif
 }
 
+static inline u32 ipv6_addr_hash(const struct in6_addr *a)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	const unsigned long *ul = (const unsigned long *)a;
+	unsigned long x = ul[0] ^ ul[1];
+
+	return (u32)(x ^ (x >> 32));
+#else
+	return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
+			     a->s6_addr32[2] ^ a->s6_addr32[3]);
+#endif
+}
+
 static inline bool ipv6_addr_loopback(const struct in6_addr *a)
 {
 	return (a->s6_addr32[0] | a->s6_addr32[1] |
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a225089df5b6..466820b6e344 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -4,6 +4,7 @@
 #include <linux/ipv6.h>
 #include <linux/if_vlan.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
@@ -55,8 +56,8 @@ ipv6:
 			return false;
 
 		ip_proto = iph->nexthdr;
-		flow->src = iph->saddr.s6_addr32[3];
-		flow->dst = iph->daddr.s6_addr32[3];
+		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
 		nhoff += sizeof(struct ipv6hdr);
 		break;
 	}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 5a38a2d5a95b..1a115b665792 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -211,10 +211,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 		break;
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
 		break;
 	default:
 		return NULL;
@@ -251,10 +248,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 	case AF_INET6:
 		tw6 = inet6_twsk((struct sock *)tw);
 		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
 		break;
 	default:
 		return NULL;
@@ -291,10 +285,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		break;
 	case AF_INET6:
 		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
-		hash = ((__force unsigned int) addr.addr.a6[0] ^
-			(__force unsigned int) addr.addr.a6[1] ^
-			(__force unsigned int) addr.addr.a6[2] ^
-			(__force unsigned int) addr.addr.a6[3]);
+		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
 		break;
 	default:
 		return NULL;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8f6411c97189..79181819a24f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -63,6 +63,7 @@
 #include <linux/delay.h>
 #include <linux/notifier.h>
 #include <linux/string.h>
+#include <linux/hash.h>
 
 #include <net/net_namespace.h>
 #include <net/sock.h>
@@ -579,15 +580,9 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
 	list_add_tail(&ifp->if_list, p);
 }
 
-static u32 ipv6_addr_hash(const struct in6_addr *addr)
+static u32 inet6_addr_hash(const struct in6_addr *addr)
 {
-	/*
-	 * We perform the hash function over the last 64 bits of the address
-	 * This will include the IEEE address token on links that support it.
-	 */
-	return jhash_2words((__force u32)addr->s6_addr32[2],
-			    (__force u32)addr->s6_addr32[3], 0)
-		& (IN6_ADDR_HSIZE - 1);
+	return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT);
 }
 
 /* On success it returns ifp with increased reference count */
@@ -662,7 +657,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
 	in6_ifa_hold(ifa);
 
 	/* Add to big hash table */
-	hash = ipv6_addr_hash(addr);
+	hash = inet6_addr_hash(addr);
 
 	hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
 	spin_unlock(&addrconf_hash_lock);
@@ -1270,7 +1265,7 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 {
 	struct inet6_ifaddr *ifp;
 	struct hlist_node *node;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 
 	rcu_read_lock_bh();
 	hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) {
@@ -1293,7 +1288,7 @@ EXPORT_SYMBOL(ipv6_chk_addr);
 static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 			       struct net_device *dev)
 {
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 	struct inet6_ifaddr *ifp;
 	struct hlist_node *node;
 
@@ -1336,7 +1331,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
 				     struct net_device *dev, int strict)
 {
 	struct inet6_ifaddr *ifp, *result = NULL;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 	struct hlist_node *node;
 
 	rcu_read_lock_bh();
@@ -3223,7 +3218,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
 	int ret = 0;
 	struct inet6_ifaddr *ifp = NULL;
 	struct hlist_node *n;
-	unsigned int hash = ipv6_addr_hash(addr);
+	unsigned int hash = inet6_addr_hash(addr);
 
 	rcu_read_lock_bh();
 	hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index db3284667968..9a1d5fe6aef8 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -40,6 +40,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/netfilter_ipv6.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
@@ -70,11 +71,15 @@ MODULE_ALIAS_NETDEV("ip6tnl0");
 #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
 #define IPV6_TCLASS_SHIFT 20
 
-#define HASH_SIZE  32
+#define HASH_SIZE_SHIFT  5
+#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
 
-#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
-		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
-		    (HASH_SIZE - 1))
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+{
+	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
+
+	return hash_32(hash, HASH_SIZE_SHIFT);
+}
 
 static int ip6_tnl_dev_init(struct net_device *dev);
 static void ip6_tnl_dev_setup(struct net_device *dev);
@@ -166,12 +171,11 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
 static struct ip6_tnl *
 ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
 {
-	unsigned int h0 = HASH(remote);
-	unsigned int h1 = HASH(local);
+	unsigned int hash = HASH(remote, local);
 	struct ip6_tnl *t;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
-	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) {
+	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
 		if (ipv6_addr_equal(local, &t->parms.laddr) &&
 		    ipv6_addr_equal(remote, &t->parms.raddr) &&
 		    (t->dev->flags & IFF_UP))
@@ -205,7 +209,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p)
 
 	if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
 		prio = 1;
-		h = HASH(remote) ^ HASH(local);
+		h = HASH(remote, local);
 	}
 	return &ip6n->tnls[prio][h];
 }
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2777fa896645..4d0129203733 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -104,23 +104,9 @@ static void ip_map_put(struct kref *kref)
 	kfree(im);
 }
 
-#if IP_HASHBITS == 8
-/* hash_long on a 64 bit machine is currently REALLY BAD for
- * IP addresses in reverse-endian (i.e. on a little-endian machine).
- * So use a trivial but reliable hash instead
- */
-static inline int hash_ip(__be32 ip)
-{
-	int hash = (__force u32)ip ^ ((__force u32)ip>>16);
-	return (hash ^ (hash>>8)) & 0xff;
-}
-#endif
-static inline int hash_ip6(struct in6_addr ip)
+static inline int hash_ip6(const struct in6_addr *ip)
 {
-	return (hash_ip(ip.s6_addr32[0]) ^
-		hash_ip(ip.s6_addr32[1]) ^
-		hash_ip(ip.s6_addr32[2]) ^
-		hash_ip(ip.s6_addr32[3]));
+	return hash_32(ipv6_addr_hash(ip), IP_HASHBITS);
 }
 static int ip_map_match(struct cache_head *corig, struct cache_head *cnew)
 {
@@ -301,7 +287,7 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
 	ip.m_addr = *addr;
 	ch = sunrpc_cache_lookup(cd, &ip.h,
 				 hash_str(class, IP_HASHBITS) ^
-				 hash_ip6(*addr));
+				 hash_ip6(addr));
 
 	if (ch)
 		return container_of(ch, struct ip_map, h);
@@ -331,7 +317,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
 	ip.h.expiry_time = expiry;
 	ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
 				 hash_str(ipm->m_class, IP_HASHBITS) ^
-				 hash_ip6(ipm->m_addr));
+				 hash_ip6(&ipm->m_addr));
 	if (!ch)
 		return -ENOMEM;
 	cache_put(ch, cd);
-- 
cgit v1.2.3


From 6255e5ead00cf96554f623ba51e2ac4c8ac27276 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 18 Jul 2012 21:34:24 +0000
Subject: ipv4: optimize fib_compute_spec_dst call in ip_options_echo

Move fib_compute_spec_dst at the only place where it
is needed.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_options.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index a19d6471a318..1dc01f9793d5 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -93,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 	unsigned char *sptr, *dptr;
 	int soffset, doffset;
 	int	optlen;
-	__be32	daddr;
 
 	memset(dopt, 0, sizeof(struct ip_options));
 
@@ -105,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 	sptr = skb_network_header(skb);
 	dptr = dopt->__data;
 
-	daddr = fib_compute_spec_dst(skb);
-
 	if (sopt->rr) {
 		optlen  = sptr[sopt->rr+1];
 		soffset = sptr[sopt->rr+2];
@@ -180,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 				doffset -= 4;
 		}
 		if (doffset > 3) {
+			__be32 daddr = fib_compute_spec_dst(skb);
+
 			memcpy(&start[doffset-1], &daddr, 4);
 			dopt->faddr = faddr;
 			dptr[0] = start[0];
-- 
cgit v1.2.3


From 0cc535a29916c6a0e6e6af0f3d42c2fe3b0b145d Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 18 Jul 2012 21:35:03 +0000
Subject: ipv4: fix address selection in fib_compute_spec_dst

ip_options_compile can be called for forwarded packets,
make sure the specific-destionation address is a local one as
specified in RFC 1812, 4.2.2.2 Addresses in Options

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7a31194ec633..b83203658ee3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -206,7 +206,8 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	int scope;
 
 	rt = skb_rtable(skb);
-	if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
+	if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+	    RTCF_LOCAL)
 		return ip_hdr(skb)->daddr;
 
 	in_dev = __in_dev_get_rcu(dev);
-- 
cgit v1.2.3


From 7fed84f622ec2696087301199c2952b85e0cc3b4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 19 Jul 2012 08:46:59 -0700
Subject: ipv4: Fix time difference calculation in rt_bind_exception().

Reported-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e70236728..2c25581bf25c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1907,7 +1907,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 		if (fnhe->fnhe_daddr == daddr) {
 			if (fnhe->fnhe_pmtu) {
 				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
+				unsigned long diff = expires - jiffies;
 
 				if (time_before(jiffies, expires)) {
 					rt->rt_pmtu = fnhe->fnhe_pmtu;
-- 
cgit v1.2.3


From aee06da6726d4981c51928c2d6d1e2cabeec7a10 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 18 Jul 2012 10:15:35 +0000
Subject: ipv4: use seqlock for nh_exceptions

Use global seqlock for the nh_exceptions. Call
fnhe_oldest with the right hash chain. Correct the diff
value for dst_set_expires.

v2: after suggestions from Eric Dumazet:
* get rid of spin lock fnhe_lock, rearrange update_or_create_fnhe
* continue daddr search in rt_bind_exception

v3:
* remove the daddr check before seqlock in rt_bind_exception
* restart lookup in rt_bind_exception on detected seqlock change,
as suggested by David Miller

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h |   2 +-
 net/ipv4/route.c     | 118 +++++++++++++++++++++++++++++----------------------
 2 files changed, 69 insertions(+), 51 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e9ee1ca07087..2daf096dfc60 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -51,7 +51,7 @@ struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
-	u32				fnhe_gw;
+	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	unsigned long			fnhe_stamp;
 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2c25581bf25c..89e39dc5336b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1333,9 +1333,9 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 		build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SPINLOCK(fnhe_lock);
+static DEFINE_SEQLOCK(fnhe_seqlock);
 
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
 
@@ -1358,47 +1358,63 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 	return hval & (FNHE_HASH_SIZE - 1);
 }
 
-static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+				  u32 pmtu, unsigned long expires)
 {
-	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
 	int depth;
-	u32 hval;
+	u32 hval = fnhe_hashfun(daddr);
+
+	write_seqlock_bh(&fnhe_seqlock);
 
+	hash = nh->nh_exceptions;
 	if (!hash) {
-		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
-						   GFP_ATOMIC);
+		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 		if (!hash)
-			return NULL;
+			goto out_unlock;
+		nh->nh_exceptions = hash;
 	}
 
-	hval = fnhe_hashfun(daddr);
 	hash += hval;
 
 	depth = 0;
 	for (fnhe = rcu_dereference(hash->chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 		if (fnhe->fnhe_daddr == daddr)
-			goto out;
+			break;
 		depth++;
 	}
 
-	if (depth > FNHE_RECLAIM_DEPTH) {
-		fnhe = fnhe_oldest(hash + hval, daddr);
-		goto out_daddr;
+	if (fnhe) {
+		if (gw)
+			fnhe->fnhe_gw = gw;
+		if (pmtu) {
+			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_expires = expires;
+		}
+	} else {
+		if (depth > FNHE_RECLAIM_DEPTH)
+			fnhe = fnhe_oldest(hash);
+		else {
+			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+			if (!fnhe)
+				goto out_unlock;
+
+			fnhe->fnhe_next = hash->chain;
+			rcu_assign_pointer(hash->chain, fnhe);
+		}
+		fnhe->fnhe_daddr = daddr;
+		fnhe->fnhe_gw = gw;
+		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_expires = expires;
 	}
-	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
-	if (!fnhe)
-		return NULL;
-
-	fnhe->fnhe_next = hash->chain;
-	rcu_assign_pointer(hash->chain, fnhe);
 
-out_daddr:
-	fnhe->fnhe_daddr = daddr;
-out:
 	fnhe->fnhe_stamp = jiffies;
-	return fnhe;
+
+out_unlock:
+	write_sequnlock_bh(&fnhe_seqlock);
+	return;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
@@ -1452,13 +1468,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		} else {
 			if (fib_lookup(net, fl4, &res) == 0) {
 				struct fib_nh *nh = &FIB_RES_NH(res);
-				struct fib_nh_exception *fnhe;
 
-				spin_lock_bh(&fnhe_lock);
-				fnhe = find_or_create_fnhe(nh, fl4->daddr);
-				if (fnhe)
-					fnhe->fnhe_gw = new_gw;
-				spin_unlock_bh(&fnhe_lock);
+				update_or_create_fnhe(nh, fl4->daddr, new_gw,
+						      0, 0);
 			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
@@ -1663,15 +1675,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
-		struct fib_nh_exception *fnhe;
 
-		spin_lock_bh(&fnhe_lock);
-		fnhe = find_or_create_fnhe(nh, fl4->daddr);
-		if (fnhe) {
-			fnhe->fnhe_pmtu = mtu;
-			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
-		}
-		spin_unlock_bh(&fnhe_lock);
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+				      jiffies + ip_rt_mtu_expires);
 	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
@@ -1902,23 +1908,35 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 
 	hval = fnhe_hashfun(daddr);
 
+restart:
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = expires - jiffies;
-
-				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
-					dst_set_expires(&rt->dst, diff);
-				}
+		__be32 fnhe_daddr, gw;
+		unsigned long expires;
+		unsigned int seq;
+		u32 pmtu;
+
+		seq = read_seqbegin(&fnhe_seqlock);
+		fnhe_daddr = fnhe->fnhe_daddr;
+		gw = fnhe->fnhe_gw;
+		pmtu = fnhe->fnhe_pmtu;
+		expires = fnhe->fnhe_expires;
+		if (read_seqretry(&fnhe_seqlock, seq))
+			goto restart;
+		if (daddr != fnhe_daddr)
+			continue;
+		if (pmtu) {
+			unsigned long diff = jiffies - expires;
+
+			if (time_before(jiffies, expires)) {
+				rt->rt_pmtu = pmtu;
+				dst_set_expires(&rt->dst, diff);
 			}
-			if (fnhe->fnhe_gw)
-				rt->rt_gateway = fnhe->fnhe_gw;
-			fnhe->fnhe_stamp = jiffies;
-			break;
 		}
+		if (gw)
+			rt->rt_gateway = gw;
+		fnhe->fnhe_stamp = jiffies;
+		break;
 	}
 }
 
-- 
cgit v1.2.3


From be9f4a44e7d41cee50ddb5f038fc2391cbbb4046 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Jul 2012 07:34:03 +0000
Subject: ipv4: tcp: remove per net tcp_sock

tcp_v4_send_reset() and tcp_v4_send_ack() use a single socket
per network namespace.

This leads to bad behavior on multiqueue NICS, because many cpus
contend for the socket lock and once socket lock is acquired, extra
false sharing on various socket fields slow down the operations.

To better resist to attacks, we use a percpu socket. Each cpu can
run without contention, using appropriate memory (local node)

Additional features :

1) We also mirror the queue_mapping of the incoming skb, so that
answers use the same queue if possible.

2) Setting SOCK_USE_WRITE_QUEUE socket flag speedup sock_wfree()

3) We now limit the number of in-flight RST/ACK [1] packets
per cpu, instead of per namespace, and we honor the sysctl_wmem_default
limit dynamically. (Prior to this patch, sysctl_wmem_default value was
copied at boot time, so any further change would not affect tcp_sock
limit)

[1] These packets are only generated when no socket was matched for
the incoming packet.

Reported-by: Bill Sommerfeld <wsommerfeld@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         |  2 +-
 include/net/netns/ipv4.h |  1 -
 net/ipv4/ip_output.c     | 50 +++++++++++++++++++++++++++++++-----------------
 net/ipv4/tcp_ipv4.c      |  8 +++-----
 4 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index ec5cfde85e9a..bd5e444a19ce 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -158,7 +158,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			   __be32 saddr, const struct ip_reply_arg *arg,
 			   unsigned int len);
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2e089a99d603..d909c7fc3da1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -38,7 +38,6 @@ struct netns_ipv4 {
 	struct sock		*fibnl;
 
 	struct sock		**icmp_sk;
-	struct sock		*tcp_sock;
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_mask;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc52679790b2..c528f841ca4b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *	Generic function to send a packet as reply to another packet.
- *	Used to send TCP resets so far.
+ *	Used to send some TCP resets/acks so far.
  *
- *	Should run single threaded per socket because it uses the sock
- *     	structure to pass arguments.
+ *	Use a fake percpu inet socket to avoid false sharing and contention.
  */
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+	.sk = {
+		.__sk_common = {
+			.skc_refcnt = ATOMIC_INIT(1),
+		},
+		.sk_wmem_alloc	= ATOMIC_INIT(1),
+		.sk_allocation	= GFP_ATOMIC,
+		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
+	},
+	.pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
 			   __be32 saddr, const struct ip_reply_arg *arg,
 			   unsigned int len)
 {
-	struct inet_sock *inet = inet_sk(sk);
 	struct ip_options_data replyopts;
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct sk_buff *nskb;
+	struct sock *sk;
+	struct inet_sock *inet;
 
 	if (ip_options_echo(&replyopts.opt.opt, skb))
 		return;
@@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
 
 	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
 			   RT_TOS(arg->tos),
-			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
 			   daddr, saddr,
 			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
-	rt = ip_route_output_key(sock_net(sk), &fl4);
+	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		return;
 
-	/* And let IP do all the hard work.
+	inet = &get_cpu_var(unicast_sock);
 
-	   This chunk is not reenterable, hence spinlock.
-	   Note that it uses the fact, that this function is called
-	   with locally disabled BH and that sk cannot be already spinlocked.
-	 */
-	bh_lock_sock(sk);
 	inet->tos = arg->tos;
+	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
+	sock_net_set(sk, net);
+	__skb_queue_head_init(&sk->sk_write_queue);
+	sk->sk_sndbuf = sysctl_wmem_default;
 	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
 		       &ipc, &rt, MSG_DONTWAIT);
-	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+	nskb = skb_peek(&sk->sk_write_queue);
+	if (nskb) {
 		if (arg->csumoffset >= 0)
-			*((__sum16 *)skb_transport_header(skb) +
-			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
+			*((__sum16 *)skb_transport_header(nskb) +
+			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
-		skb->ip_summed = CHECKSUM_NONE;
+		nskb->ip_summed = CHECKSUM_NONE;
+		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
 
-	bh_unlock_sock(sk);
+	put_cpu_var(unicast_sock);
 
 	ip_rt_put(rt);
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9caf5c07aae..d7d2fa50f07f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -688,7 +688,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 
 	net = dev_net(skb_dst(skb)->dev);
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -771,7 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2624,13 +2624,11 @@ EXPORT_SYMBOL(tcp_prot);
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
-				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+	return 0;
 }
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
-- 
cgit v1.2.3


From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:05 +0000
Subject: net-tcp: Fast Open base

This patch impelements the common code for both the client and server.

1. TCP Fast Open option processing. Since Fast Open does not have an
   option number assigned by IANA yet, it shares the experiment option
   code 254 by implementing draft-ietf-tcpm-experimental-options
   with a 16 bits magic number 0xF989. This enables global experiments
   without clashing the scarce(2) experimental options available for TCP.

   When the draft status becomes standard (maybe), the client should
   switch to the new option number assigned while the server supports
   both numbers for transistion.

2. The new sysctl tcp_fastopen

3. A place holder init function

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        | 10 ++++++++++
 include/net/tcp.h          |  9 ++++++++-
 net/ipv4/Makefile          |  2 +-
 net/ipv4/syncookies.c      |  2 +-
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 net/ipv4/tcp_fastopen.c    | 11 +++++++++++
 net/ipv4/tcp_input.c       | 26 ++++++++++++++++++++++----
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_minisocks.c   |  4 ++--
 net/ipv4/tcp_output.c      | 25 +++++++++++++++++++++----
 net/ipv6/syncookies.c      |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 12 files changed, 86 insertions(+), 16 deletions(-)
 create mode 100644 net/ipv4/tcp_fastopen.c

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1888169e07c7..12948f543839 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 	return (tcp_hdr(skb)->doff - 5) * 4;
 }
 
+/* TCP Fast Open */
+#define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
+#define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
+
+/* TCP Fast Open Cookie as stored in memory */
+struct tcp_fastopen_cookie {
+	s8	len;
+	u8	val[TCP_FASTOPEN_COOKIE_MAX];
+};
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block_wire {
 	__be32	start_seq;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 85c5090bfe25..5aed3718fde8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
 #define TCPOPT_COOKIE		253	/* Cookie extension (experimental) */
+#define TCPOPT_EXP		254	/* Experimental */
+/* Magic number to be after the option value for sharing TCP
+ * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+ */
+#define TCPOPT_FASTOPEN_MAGIC	0xF989
 
 /*
  *     TCP option lengths
@@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERM      2
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
+#define TCPOLEN_EXP_FASTOPEN_BASE  4
 #define TCPOLEN_COOKIE_BASE    2	/* Cookie-less header extension */
 #define TCPOLEN_COOKIE_PAIR    3	/* Cookie pair header extension */
 #define TCPOLEN_COOKIE_MIN     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
@@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
 extern int sysctl_tcp_rfc1337;
@@ -418,7 +425,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t len, int nonblock, int flags, int *addr_len);
 extern void tcp_parse_options(const struct sk_buff *skb,
 			      struct tcp_options_received *opt_rx, const u8 **hvpp,
-			      int estab);
+			      int estab, struct tcp_fastopen_cookie *foc);
 extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a677d804e53e..ae2ccf2890e4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7fb15d1..650e1528e1e6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f6a1e762e9c..5840c3255721 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -366,6 +366,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #endif
+	{
+		.procname	= "tcp_fastopen",
+		.data		= &sysctl_tcp_fastopen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000000..a7f729c409d7
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+int sysctl_tcp_fastopen;
+
+static int __init tcp_fastopen_init(void)
+{
+	return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fdd49f1b7a52..a06bb8959e7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3732,7 +3732,8 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       const u8 **hvpp, int estab)
+		       const u8 **hvpp, int estab,
+		       struct tcp_fastopen_cookie *foc)
 {
 	const unsigned char *ptr;
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
 					break;
 				}
 				break;
-			}
 
+			case TCPOPT_EXP:
+				/* Fast Open option shares code 254 using a
+				 * 16 bits magic number. It's valid only in
+				 * SYN or SYN-ACK with an even size.
+				 */
+				if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+				    get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+				    foc == NULL || !th->syn || (opsize & 1))
+					break;
+				foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+				if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+				    foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+					memcpy(foc->val, ptr + 2, foc->len);
+				else if (foc->len != 0)
+					foc->len = -1;
+				break;
+
+			}
 			ptr += opsize-2;
 			length -= opsize;
 		}
@@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return true;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
+	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
 	return true;
 }
 
@@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_cookie_values *cvp = tp->cookie_values;
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d2fa50f07f..01aa77a97020 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1307,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c66f2ede160e..5912ac3fd240 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 15a7c7bc3e58..4849be76ccd6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5		(1 << 2)
 #define OPTION_WSCALE		(1 << 3)
 #define OPTION_COOKIE_EXTENSION	(1 << 4)
+#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
 
 struct tcp_out_options {
-	u8 options;		/* bit field of OPTION_* */
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
 	u8 ws;			/* window scale, 0 to disable */
 	u8 num_sack_blocks;	/* number of SACK blocks to include */
 	u8 hash_size;		/* bytes in hash_location */
-	u16 mss;		/* 0 to disable */
-	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
 };
 
 /* The sysctl int routines are generic, so check consistency here.
@@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 			      struct tcp_out_options *opts)
 {
-	u8 options = opts->options;	/* mungable copy */
+	u16 options = opts->options;	/* mungable copy */
 
 	/* Having both authentication and cookies for security is redundant,
 	 * and there's certainly not enough room.  Instead, the cookie-less
@@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 		tp->rx_opt.dsack = 0;
 	}
+
+	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+		*ptr++ = htonl((TCPOPT_EXP << 24) |
+			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+			       TCPOPT_FASTOPEN_MAGIC);
+
+		memcpy(ptr, foc->val, foc->len);
+		if ((foc->len & 3) == 2) {
+			u8 *align = ((u8 *)ptr) + foc->len;
+			align[0] = align[1] = TCPOPT_NOP;
+		}
+		ptr += (foc->len + 3) >> 2;
+	}
 }
 
 /* Compute TCP options for SYN packets. This is not the final
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7bf3cc427c28..bb46061c813a 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c9dabdd832d7..0302ec3fecfc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
-- 
cgit v1.2.3


From 1fe4c481ba637660793217769695c146a037bd54 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:06 +0000
Subject: net-tcp: Fast Open client - cookie cache

With help from Eric Dumazet, add Fast Open metrics in tcp metrics cache.
The basic ones are MSS and the cookies. Later patch will cache more to
handle unfriendly middleboxes.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  4 ++++
 net/ipv4/tcp_metrics.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5aed3718fde8..e601da197361 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -405,6 +405,10 @@ extern void tcp_metrics_init(void);
 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
 extern bool tcp_remember_stamp(struct sock *sk);
 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+				   struct tcp_fastopen_cookie *cookie);
+extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+				   struct tcp_fastopen_cookie *cookie);
 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 1a115b665792..d02ff3777785 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -30,6 +30,11 @@ enum tcp_metric_index {
 	TCP_METRIC_MAX,
 };
 
+struct tcp_fastopen_metrics {
+	u16	mss;
+	struct	tcp_fastopen_cookie	cookie;
+};
+
 struct tcp_metrics_block {
 	struct tcp_metrics_block __rcu	*tcpm_next;
 	struct inetpeer_addr		tcpm_addr;
@@ -38,6 +43,7 @@ struct tcp_metrics_block {
 	u32				tcpm_ts_stamp;
 	u32				tcpm_lock;
 	u32				tcpm_vals[TCP_METRIC_MAX];
+	struct tcp_fastopen_metrics	tcpm_fastopen;
 };
 
 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -118,6 +124,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
 	tm->tcpm_ts = 0;
 	tm->tcpm_ts_stamp = 0;
+	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.cookie.len = 0;
 }
 
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -633,6 +641,49 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 	return ret;
 }
 
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+	if (tm) {
+		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+		unsigned int seq;
+
+		do {
+			seq = read_seqbegin(&fastopen_seqlock);
+			if (tfom->mss)
+				*mss = tfom->mss;
+			*cookie = tfom->cookie;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+	}
+	rcu_read_unlock();
+}
+
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+	if (tm) {
+		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+		write_seqlock_bh(&fastopen_seqlock);
+		tfom->mss = mss;
+		if (cookie->len > 0)
+			tfom->cookie = *cookie;
+		write_sequnlock_bh(&fastopen_seqlock);
+	}
+	rcu_read_unlock();
+}
+
 static unsigned long tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
-- 
cgit v1.2.3


From 783237e8daf13481ee234997cbbbb823872ac388 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:07 +0000
Subject: net-tcp: Fast Open client - sending SYN-data

This patch implements sending SYN-data in tcp_connect(). The data is
from tcp_sendmsg() with flag MSG_FASTOPEN (implemented in a later patch).

The length of the cookie in tcp_fastopen_req, init'd to 0, controls the
type of the SYN. If the cookie is not cached (len==0), the host sends
data-less SYN with Fast Open cookie request option to solicit a cookie
from the remote. If cookie is not available (len > 0), the host sends
a SYN-data with Fast Open cookie option. If cookie length is negative,
  the SYN will not include any Fast Open option (for fall back operations).

To deal with middleboxes that may drop SYN with data or experimental TCP
option, the SYN-data is only sent once. SYN retransmits do not include
data or Fast Open options. The connection will fall back to regular TCP
handshake.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h  |   1 +
 include/linux/tcp.h   |   6 ++-
 include/net/tcp.h     |   9 ++++
 net/ipv4/af_inet.c    |  10 ++++-
 net/ipv4/proc.c       |   1 +
 net/ipv4/tcp_output.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 130 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index e5fcbd079e4a..00bc189cb395 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -238,6 +238,7 @@ enum
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
 	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	LINUX_MIB_TCPSYNCHALLENGE,		/* TCPSYNChallenge */
+	LINUX_MIB_TCPFASTOPENACTIVE,		/* TCPFastOpenActive */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 12948f543839..1edf96afab44 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -386,7 +386,8 @@ struct tcp_sock {
 		unused      : 1;
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-		early_retrans_delayed:1; /* Delayed ER timer installed */
+		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
@@ -500,6 +501,9 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+/* TCP fastopen related information */
+	struct tcp_fastopen_request *fastopen_req;
+
 	/* When the cookie options are generated and exchanged, then this
 	 * object holds a reference to them (cookie_values->kref).  Also
 	 * contains related tcp_cookie_transactions fields.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e601da197361..867557b4244a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1289,6 +1289,15 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
 extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
 			    const struct tcp_md5sig_key *key);
 
+struct tcp_fastopen_request {
+	/* Fast Open cookie. Size 0 means a cookie request */
+	struct tcp_fastopen_cookie	cookie;
+	struct msghdr			*data;  /* data in MSG_FASTOPEN */
+	u16				copied;	/* queued in tcp_connect() */
+};
+
+void tcp_free_fastopen_req(struct tcp_sock *tp);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 07a02f6e9696..edc414625be2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -556,11 +556,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
 {
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	sk->sk_write_pending += writebias;
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
 	 * change state of the socket from TCP_SYN_*.
@@ -576,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
 	finish_wait(sk_sleep(sk), &wait);
+	sk->sk_write_pending -= writebias;
 	return timeo;
 }
 
@@ -634,8 +636,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+				tcp_sk(sk)->fastopen_req &&
+				tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
 		/* Error code is set above */
-		if (!timeo || !inet_wait_for_connect(sk, timeo))
+		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
 			goto out;
 
 		err = sock_intr_errno(timeo);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5240b2ea61..957acd12250b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4849be76ccd6..88693281da4c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -596,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
 			 tcp_cookie_size_check(cvp->cookie_desired) :
 			 0;
+	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -636,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
+	if (fastopen && fastopen->cookie.len >= 0) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = &fastopen->cookie;
+			remaining -= need;
+			tp->syn_fastopen = 1;
+		}
+	}
 	/* Note that timestamps are required by the specification.
 	 *
 	 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -2824,6 +2835,96 @@ void tcp_connect_init(struct sock *sk)
 	tcp_clear_retrans(tp);
 }
 
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	tcb->end_seq += skb->len;
+	skb_header_release(skb);
+	__tcp_add_write_queue_tail(sk, skb);
+	sk->sk_wmem_queued += skb->truesize;
+	sk_mem_charge(sk, skb->truesize);
+	tp->write_seq = tcb->end_seq;
+	tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_request *fo = tp->fastopen_req;
+	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	struct sk_buff *syn_data = NULL, *data;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
+	if (fo->cookie.len <= 0)
+		goto fallback;
+
+	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+	 * user-MSS. Reserve maximum option space for middleboxes that add
+	 * private TCP options. The cost is reduced data space in SYN :(
+	 */
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+		MAX_TCP_OPTION_SPACE;
+
+	syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+				   sk->sk_allocation);
+	if (syn_data == NULL)
+		goto fallback;
+
+	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+		struct iovec *iov = &fo->data->msg_iov[i];
+		unsigned char __user *from = iov->iov_base;
+		int len = iov->iov_len;
+
+		if (syn_data->len + len > space)
+			len = space - syn_data->len;
+		else if (i + 1 == iovlen)
+			/* No more data pending in inet_wait_for_connect() */
+			fo->data = NULL;
+
+		if (skb_add_data(syn_data, from, len))
+			goto fallback;
+	}
+
+	/* Queue a data-only packet after the regular SYN for retransmission */
+	data = pskb_copy(syn_data, sk->sk_allocation);
+	if (data == NULL)
+		goto fallback;
+	TCP_SKB_CB(data)->seq++;
+	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+	tcp_connect_queue_skb(sk, data);
+	fo->copied = data->len;
+
+	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		goto done;
+	}
+	syn_data = NULL;
+
+fallback:
+	/* Send a regular SYN with Fast Open cookie request option */
+	if (fo->cookie.len > 0)
+		fo->cookie.len = 0;
+	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+	if (err)
+		tp->syn_fastopen = 0;
+	kfree_skb(syn_data);
+done:
+	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
+	return err;
+}
+
 /* Build a SYN and send it off. */
 int tcp_connect(struct sock *sk)
 {
@@ -2841,17 +2942,13 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_connect_queue_skb(sk, buff);
 	TCP_ECN_send_syn(sk, buff);
 
-	/* Send it off. */
-	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
-	skb_header_release(buff);
-	__tcp_add_write_queue_tail(sk, buff);
-	sk->sk_wmem_queued += buff->truesize;
-	sk_mem_charge(sk, buff->truesize);
-	tp->packets_out += tcp_skb_pcount(buff);
-	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	/* Send off SYN; include data in Fast Open. */
+	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
 	if (err == -ECONNREFUSED)
 		return err;
 
-- 
cgit v1.2.3


From 8e4178c1c7b52f7c99f5fd22ef7af6b2bff409e3 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:08 +0000
Subject: net-tcp: Fast Open client - receiving SYN-ACK

On receiving the SYN-ACK after SYN-data, the client needs to
a) update the cached MSS and cookie (if included in SYN-ACK)
b) retransmit the data not yet acknowledged by the SYN-ACK in the final ACK of
   the handshake.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a06bb8959e7e..38b6a811edfc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5646,6 +5646,34 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+				    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *data = tcp_write_queue_head(sk);
+	u16 mss = tp->rx_opt.mss_clamp;
+
+	if (mss == tp->rx_opt.user_mss) {
+		struct tcp_options_received opt;
+		const u8 *hash_location;
+
+		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
+		tcp_clear_options(&opt);
+		opt.user_mss = opt.mss_clamp = 0;
+		tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+		mss = opt.mss_clamp;
+	}
+
+	tcp_fastopen_cache_set(sk, mss, cookie);
+
+	if (data) { /* Retransmit unacked data in SYN */
+		tcp_retransmit_skb(sk, data);
+		tcp_rearm_rto(sk);
+		return true;
+	}
+	return false;
+}
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 const struct tcphdr *th, unsigned int len)
 {
@@ -5653,9 +5681,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_cookie_values *cvp = tp->cookie_values;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
 
 	if (th->ack) {
 		/* rfc793:
@@ -5665,11 +5694,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
 		 *        a reset (unless the RST bit is set, if so drop
 		 *        the segment and return)"
-		 *
-		 *  We do not send data with SYN, so that RFC-correct
-		 *  test reduces to:
 		 */
-		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
 			goto reset_and_undo;
 
 		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5781,6 +5808,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_finish_connect(sk, skb);
 
+		if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+			return -1;
+
 		if (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
 		    icsk->icsk_ack.pingpong) {
-- 
cgit v1.2.3


From cf60af03ca4e71134206809ea892e49b92a88896 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:09 +0000
Subject: net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)

sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2)
and write(2). The application should replace connect() with it to
send data in the opening SYN packet.

For blocking socket, sendmsg() blocks until all the data are buffered
locally and the handshake is completed like connect() call. It
returns similar errno like connect() if the TCP handshake fails.

For non-blocking socket, it returns the number of bytes queued (and
transmitted in the SYN-data packet) if cookie is available. If cookie
is not available, it transmits a data-less SYN packet with Fast Open
cookie request option and returns -EINPROGRESS like connect().

Using MSG_FASTOPEN on connecting or connected socket will result in
simlar errno like repeating connect() calls. Therefore the application
should only use this flag on new sockets.

The buffer size of sendmsg() is independent of the MSS of the connection.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 11 ++++++
 include/linux/socket.h                 |  1 +
 include/net/inet_common.h              |  6 ++--
 include/net/tcp.h                      |  3 ++
 net/ipv4/af_inet.c                     | 19 ++++++++---
 net/ipv4/tcp.c                         | 61 +++++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c                    |  3 ++
 7 files changed, 92 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e1e021594cff..03964e088180 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN
 	SYN flood warnings in logs not being really flooded, your server
 	is seriously misconfigured.
 
+tcp_fastopen - INTEGER
+	Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
+	in the opening SYN packet. To use this feature, the client application
+	must not use connect(). Instead, it should use sendmsg() or sendto()
+	with MSG_FASTOPEN flag which performs a TCP handshake automatically.
+
+	The values (bitmap) are:
+	1: Enables sending data in the opening SYN on the client
+
+	Default: 0
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 255. Default value
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322fb635..ba7b2e817cfa 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -268,6 +268,7 @@ struct ucred {
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_EOF         MSG_FIN
 
+#define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exit for file
 					   descriptor received through
 					   SCM_RIGHTS */
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac9892b16..234008782c8c 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -14,9 +14,11 @@ struct sockaddr;
 struct socket;
 
 extern int inet_release(struct socket *sock);
-extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			       int addr_len, int flags);
-extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+				 int addr_len, int flags);
+extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 			      int addr_len, int flags);
 extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
 extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 867557b4244a..c0258100d70c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
 #define TCP_INIT_CWND		10
 
+/* Bit Flags for sysctl_tcp_fastopen */
+#define	TFO_CLIENT_ENABLE	1
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index edc414625be2..fe4582ca969a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -585,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
  *	Connect to a remote host. There is regrettably still a little
  *	TCP 'magic' in here.
  */
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			  int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	int err;
@@ -595,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (addr_len < sizeof(uaddr->sa_family))
 		return -EINVAL;
 
-	lock_sock(sk);
-
 	if (uaddr->sa_family == AF_UNSPEC) {
 		err = sk->sk_prot->disconnect(sk, flags);
 		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -663,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	sock->state = SS_CONNECTED;
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 
 sock_error:
@@ -673,6 +670,18 @@ sock_error:
 		sock->state = SS_DISCONNECTING;
 	goto out;
 }
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	int err;
+
+	lock_sock(sock->sk);
+	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+	release_sock(sock->sk);
+	return err;
+}
 EXPORT_SYMBOL(inet_stream_connect);
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4252cd8f39fd..581ecf02c6b5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
 #include <linux/slab.h>
 
 #include <net/icmp.h>
+#include <net/inet_common.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
 	return tmp;
 }
 
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+	if (tp->fastopen_req != NULL) {
+		kfree(tp->fastopen_req);
+		tp->fastopen_req = NULL;
+	}
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err, flags;
+
+	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+		return -EOPNOTSUPP;
+	if (tp->fastopen_req != NULL)
+		return -EALREADY; /* Another Fast Open is in progress */
+
+	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+				   sk->sk_allocation);
+	if (unlikely(tp->fastopen_req == NULL))
+		return -ENOBUFS;
+	tp->fastopen_req->data = msg;
+
+	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+				    msg->msg_namelen, flags);
+	*size = tp->fastopen_req->copied;
+	tcp_free_fastopen_req(tp);
+	return err;
+}
+
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
 	struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int iovlen, flags, err, copied;
-	int mss_now = 0, size_goal;
+	int iovlen, flags, err, copied = 0;
+	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
 	bool sg;
 	long timeo;
 
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
+	if (flags & MSG_FASTOPEN) {
+		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+		if (err == -EINPROGRESS && copied_syn > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+		offset = copied_syn;
+	}
+
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	/* Wait for a connection to finish. */
 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
-			goto out_err;
+			goto do_error;
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		unsigned char __user *from = iov->iov_base;
 
 		iov++;
+		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
+			if (offset >= seglen) {
+				offset -= seglen;
+				continue;
+			}
+			seglen -= offset;
+			from += offset;
+			offset = 0;
+		}
 
 		while (seglen > 0) {
 			int copy = 0;
@@ -1199,7 +1250,7 @@ out:
 	if (copied && likely(!tp->repair))
 		tcp_push(sk, flags, mss_now, tp->nonagle);
 	release_sock(sk);
-	return copied;
+	return copied + copied_syn;
 
 do_fault:
 	if (!skb->len) {
@@ -1212,7 +1263,7 @@ do_fault:
 	}
 
 do_error:
-	if (copied)
+	if (copied + copied_syn)
 		goto out;
 out_err:
 	err = sk_stream_error(sk, flags, err);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 01aa77a97020..1d8b75a58981 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1952,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
+	/* If socket is aborted during connect operation */
+	tcp_free_fastopen_req(tp);
+
 	sk_sockets_allocated_dec(sk);
 	sock_release_memcg(sk);
 }
-- 
cgit v1.2.3


From aab4874355679c70f93993cf3b3fd74643b9ac33 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:10 +0000
Subject: net-tcp: Fast Open client - detecting SYN-data drops

On paths with firewalls dropping SYN with data or experimental TCP options,
Fast Open connections will have experience SYN timeout and bad performance.
The solution is to track such incidents in the cookie cache and disables
Fast Open temporarily.

Since only the original SYN includes data and/or Fast Open option, the
SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect
such drops. If a path has recurring Fast Open SYN drops, Fast Open is
disabled for 2^(recurring_losses) minutes starting from four minutes up to
roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but
it behaves as connect() then write().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  6 ++++--
 net/ipv4/tcp_input.c   | 10 +++++++++-
 net/ipv4/tcp_metrics.c | 16 +++++++++++++---
 net/ipv4/tcp_output.c  | 13 +++++++++++--
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c0258100d70c..e07878d246aa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,9 +409,11 @@ extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
 extern bool tcp_remember_stamp(struct sock *sk);
 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   int *syn_loss, unsigned long *last_syn_loss);
 extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   bool syn_lost);
 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38b6a811edfc..c49a4fc175bd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5652,6 +5652,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *data = tcp_write_queue_head(sk);
 	u16 mss = tp->rx_opt.mss_clamp;
+	bool syn_drop;
 
 	if (mss == tp->rx_opt.user_mss) {
 		struct tcp_options_received opt;
@@ -5664,7 +5665,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
-	tcp_fastopen_cache_set(sk, mss, cookie);
+	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+	 * the remote receives only the retransmitted (regular) SYNs: either
+	 * the original SYN-data or the corresponding SYN-ACK is lost.
+	 */
+	syn_drop = (cookie->len <= 0 && data &&
+		    inet_csk(sk)->icsk_retransmits);
+
+	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
 
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_retransmit_skb(sk, data);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d02ff3777785..99779ae44f64 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -32,6 +32,8 @@ enum tcp_metric_index {
 
 struct tcp_fastopen_metrics {
 	u16	mss;
+	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
+	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
 	struct	tcp_fastopen_cookie	cookie;
 };
 
@@ -125,6 +127,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_ts = 0;
 	tm->tcpm_ts_stamp = 0;
 	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.syn_loss = 0;
 	tm->tcpm_fastopen.cookie.len = 0;
 }
 
@@ -644,7 +647,8 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 static DEFINE_SEQLOCK(fastopen_seqlock);
 
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie,
+			    int *syn_loss, unsigned long *last_syn_loss)
 {
 	struct tcp_metrics_block *tm;
 
@@ -659,14 +663,15 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			if (tfom->mss)
 				*mss = tfom->mss;
 			*cookie = tfom->cookie;
+			*syn_loss = tfom->syn_loss;
+			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 		} while (read_seqretry(&fastopen_seqlock, seq));
 	}
 	rcu_read_unlock();
 }
 
-
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
 {
 	struct tcp_metrics_block *tm;
 
@@ -679,6 +684,11 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 		tfom->mss = mss;
 		if (cookie->len > 0)
 			tfom->cookie = *cookie;
+		if (syn_lost) {
+			++tfom->syn_loss;
+			tfom->last_syn_loss = jiffies;
+		} else
+			tfom->syn_loss = 0;
 		write_sequnlock_bh(&fastopen_seqlock);
 	}
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 88693281da4c..c5cfd5ec3184 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2860,10 +2860,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
 	struct sk_buff *syn_data = NULL, *data;
+	unsigned long last_syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+			       &syn_loss, &last_syn_loss);
+	/* Recurring FO SYN losses: revert to regular handshake temporarily */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		fo->cookie.len = -1;
+		goto fallback;
+	}
 
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
 	if (fo->cookie.len <= 0)
 		goto fallback;
 
-- 
cgit v1.2.3


From 67da22d23fa6f3324e03bcd0580b914b2e4afbf3 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:11 +0000
Subject: net-tcp: Fast Open client - cookie-less mode

In trusted networks, e.g., intranet, data-center, the client does not
need to use Fast Open cookie to mitigate DoS attacks. In cookie-less
mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless
of cookie availability.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 2 ++
 include/linux/tcp.h                    | 1 +
 include/net/tcp.h                      | 1 +
 net/ipv4/tcp_input.c                   | 8 ++++++--
 net/ipv4/tcp_output.c                  | 6 +++++-
 5 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 03964e088180..5f3ef7f7fcec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -476,6 +476,8 @@ tcp_fastopen - INTEGER
 
 	The values (bitmap) are:
 	1: Enables sending data in the opening SYN on the client
+	5: Enables sending data in the opening SYN on the client regardless
+	   of cookie availability.
 
 	Default: 0
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1edf96afab44..9febfb685c33 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -387,6 +387,7 @@ struct tcp_sock {
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e07878d246aa..bc7c134ec054 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -214,6 +214,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* Bit Flags for sysctl_tcp_fastopen */
 #define	TFO_CLIENT_ENABLE	1
+#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */
 
 extern struct inet_timewait_death_row tcp_death_row;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c49a4fc175bd..e67d685a6c0e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5650,7 +5650,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 				    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *data = tcp_write_queue_head(sk);
+	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
 	u16 mss = tp->rx_opt.mss_clamp;
 	bool syn_drop;
 
@@ -5665,6 +5665,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
+	if (!tp->syn_fastopen)  /* Ignore an unsolicited cookie */
+		cookie->len = -1;
+
 	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
 	 * the remote receives only the retransmitted (regular) SYNs: either
 	 * the original SYN-data or the corresponding SYN-ACK is lost.
@@ -5816,7 +5819,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_finish_connect(sk, skb);
 
-		if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+		if ((tp->syn_fastopen || tp->syn_data) &&
+		    tcp_rcv_fastopen_synack(sk, skb, &foc))
 			return -1;
 
 		if (sk->sk_write_pending ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5cfd5ec3184..27a32acfdb62 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2864,6 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	struct sk_buff *syn_data = NULL, *data;
 	unsigned long last_syn_loss = 0;
 
+	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
 			       &syn_loss, &last_syn_loss);
 	/* Recurring FO SYN losses: revert to regular handshake temporarily */
@@ -2873,7 +2874,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 		goto fallback;
 	}
 
-	if (fo->cookie.len <= 0)
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+		fo->cookie.len = -1;
+	else if (fo->cookie.len <= 0)
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
@@ -2916,6 +2919,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	fo->copied = data->len;
 
 	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
 		goto done;
 	}
-- 
cgit v1.2.3


From f31fd383821555cbd77ee83e17837f7060825395 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 19 Jul 2012 23:02:45 +0300
Subject: ipv4: Fix again the time difference calculation

	Fix again the diff value in rt_bind_exception
after collision of two latest patches, my original commit
actually fixed the same problem.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 89e39dc5336b..9f7ffbe201c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1926,7 +1926,7 @@ restart:
 		if (daddr != fnhe_daddr)
 			continue;
 		if (pmtu) {
-			unsigned long diff = jiffies - expires;
+			unsigned long diff = expires - jiffies;
 
 			if (time_before(jiffies, expires)) {
 				rt->rt_pmtu = pmtu;
-- 
cgit v1.2.3


From 67b95bd78f0de85793bf30835913f6ef784a39b6 Mon Sep 17 00:00:00 2001
From: Vijay Subramanian <subramanian.vijay@gmail.com>
Date: Thu, 19 Jul 2012 21:32:18 +0000
Subject: tcp: Return bool instead of int where appropriate

Applied to a set of static inline functions in tcp_input.c

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e67d685a6c0e..21d7f8f3a7a5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2521,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
-static inline int tcp_packet_delayed(const struct tcp_sock *tp)
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 {
 	return !tp->retrans_stamp ||
 		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2582,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static inline int tcp_may_undo(const struct tcp_sock *tp)
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
 {
 	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
 }
@@ -3371,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk)
 	}
 }
 
-static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
 	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
 		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
-static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
@@ -3387,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
-static inline int tcp_may_update_window(const struct tcp_sock *tp,
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
 					const u32 ack, const u32 ack_seq,
 					const u32 nwin)
 {
@@ -4006,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
 		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
 }
 
-static inline int tcp_paws_discard(const struct sock *sk,
+static inline bool tcp_paws_discard(const struct sock *sk,
 				   const struct sk_buff *skb)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -4028,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk,
  * (borrowed from freebsd)
  */
 
-static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 {
 	return	!before(end_seq, tp->rcv_wup) &&
 		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
@@ -5214,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
 	return result;
 }
 
-static inline int tcp_checksum_complete_user(struct sock *sk,
+static inline bool tcp_checksum_complete_user(struct sock *sk,
 					     struct sk_buff *skb)
 {
 	return !skb_csum_unnecessary(skb) &&
-- 
cgit v1.2.3


From 5815d5e7aae3cc9c5e85af83094d4d6498c3f4fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Jul 2012 23:02:34 +0000
Subject: tcp: use hash_32() in tcp_metrics

Fix a missing roundup_pow_of_two(), since tcpmhash_entries is not
guaranteed to be a power of two.

Uses hash_32() instead of custom hash.

tcpmhash_entries should be an unsigned int.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h |  2 +-
 net/ipv4/tcp_metrics.c   | 25 ++++++++++---------------
 2 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d909c7fc3da1..0ffb8e31f3cd 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -40,7 +40,7 @@ struct netns_ipv4 {
 	struct sock		**icmp_sk;
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
-	unsigned int		tcp_metrics_hash_mask;
+	unsigned int		tcp_metrics_hash_log;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 99779ae44f64..992f1bff4fc6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/tcp.h>
+#include <linux/hash.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/net_namespace.h>
@@ -228,10 +229,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 		return NULL;
 	}
 
-	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
-
 	net = dev_net(dst->dev);
-	hash &= net->ipv4.tcp_metrics_hash_mask;
+	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
 
 	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
@@ -265,10 +264,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 		return NULL;
 	}
 
-	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
-
 	net = twsk_net(tw);
-	hash &= net->ipv4.tcp_metrics_hash_mask;
+	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
 
 	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
@@ -302,10 +299,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		return NULL;
 	}
 
-	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
-
 	net = dev_net(dst->dev);
-	hash &= net->ipv4.tcp_metrics_hash_mask;
+	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
 
 	tm = __tcp_get_metrics(&addr, net, hash);
 	reclaim = false;
@@ -694,7 +689,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 	rcu_read_unlock();
 }
 
-static unsigned long tcpmhash_entries;
+static unsigned int tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
 	ssize_t ret;
@@ -702,7 +697,7 @@ static int __init set_tcpmhash_entries(char *str)
 	if (!str)
 		return 0;
 
-	ret = kstrtoul(str, 0, &tcpmhash_entries);
+	ret = kstrtouint(str, 0, &tcpmhash_entries);
 	if (ret)
 		return 0;
 
@@ -712,7 +707,8 @@ __setup("tcpmhash_entries=", set_tcpmhash_entries);
 
 static int __net_init tcp_net_metrics_init(struct net *net)
 {
-	int slots, size;
+	size_t size;
+	unsigned int slots;
 
 	slots = tcpmhash_entries;
 	if (!slots) {
@@ -722,14 +718,13 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 			slots = 8 * 1024;
 	}
 
-	size = slots * sizeof(struct tcpm_hash_bucket);
+	net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
+	size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
 
 	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
 	if (!net->ipv4.tcp_metrics_hash)
 		return -ENOMEM;
 
-	net->ipv4.tcp_metrics_hash_mask = (slots - 1);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9dc274151a548ffd215caecec5a8872db8799447 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jul 2012 05:02:33 +0000
Subject: tcp: fix ABC in tcp_slow_start()

When/if sysctl_tcp_abc > 1, we expect to increase cwnd by 2 if the
received ACK acknowledges more than 2*MSS bytes, in tcp_slow_start()

Problem is this RFC 3465 statement is not correctly coded, as
the while () loop increases snd_cwnd one by one.

Add a new variable to avoid this off-by one error.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: John Heffner <johnwheffner@gmail.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cong.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 04dbd7ae7c62..4d4db16e336e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
 void tcp_slow_start(struct tcp_sock *tp)
 {
 	int cnt; /* increase in packets */
+	unsigned int delta = 0;
 
 	/* RFC3465: ABC Slow start
 	 * Increase only after a full MSS of bytes is acked
@@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp)
 	tp->snd_cwnd_cnt += cnt;
 	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
 		tp->snd_cwnd_cnt -= tp->snd_cwnd;
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
+		delta++;
 	}
+	tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
 }
 EXPORT_SYMBOL_GPL(tcp_slow_start);
 
-- 
cgit v1.2.3


From 6f458dfb409272082c9bfa412f77ff2fc21c626f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jul 2012 05:45:50 +0000
Subject: tcp: improve latencies of timer triggered events

Modern TCP stack highly depends on tcp_write_timer() having a small
latency, but current implementation doesn't exactly meet the
expectations.

When a timer fires but finds the socket is owned by the user, it rearms
itself for an additional delay hoping next run will be more
successful.

tcp_write_timer() for example uses a 50ms delay for next try, and it
defeats many attempts to get predictable TCP behavior in term of
latencies.

Use the recently introduced tcp_release_cb(), so that the user owning
the socket will call various handlers right before socket release.

This will permit us to post a followup patch to address the
tcp_tso_should_defer() syndrome (some deferred packets have to wait
RTO timer to be transmitted, while cwnd should allow us to send them
sooner)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: H.K. Jerry Chu <hkchu@google.com>
Cc: John Heffner <johnwheffner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  4 ++-
 include/net/tcp.h     |  2 ++
 net/ipv4/tcp_output.c | 46 ++++++++++++++++++++-------------
 net/ipv4/tcp_timer.c  | 70 +++++++++++++++++++++++++++------------------------
 4 files changed, 71 insertions(+), 51 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 9febfb685c33..2761856987b2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -515,7 +515,9 @@ struct tcp_sock {
 enum tsq_flags {
 	TSQ_THROTTLED,
 	TSQ_QUEUED,
-	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
+	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
+	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bc7c134ec054..e19124b84cd2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -350,6 +350,8 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
 extern void tcp_release_cb(struct sock *sk);
+extern void tcp_write_timer_handler(struct sock *sk);
+extern void tcp_delack_timer_handler(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 27a32acfdb62..950aebfd9967 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -837,6 +837,13 @@ struct tsq_tasklet {
 };
 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 
+static void tcp_tsq_handler(struct sock *sk)
+{
+	if ((1 << sk->sk_state) &
+	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
+		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+}
 /*
  * One tasklest per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
@@ -864,16 +871,10 @@ static void tcp_tasklet_func(unsigned long data)
 		bh_lock_sock(sk);
 
 		if (!sock_owned_by_user(sk)) {
-			if ((1 << sk->sk_state) &
-			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-			     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-				tcp_write_xmit(sk,
-					       tcp_current_mss(sk),
-					       0, 0,
-					       GFP_ATOMIC);
+			tcp_tsq_handler(sk);
 		} else {
 			/* defer the work to tcp_release_cb() */
-			set_bit(TSQ_OWNED, &tp->tsq_flags);
+			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
 		}
 		bh_unlock_sock(sk);
 
@@ -882,6 +883,9 @@ static void tcp_tasklet_func(unsigned long data)
 	}
 }
 
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
+			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
+			  (1UL << TCP_DELACK_TIMER_DEFERRED))
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -892,16 +896,24 @@ static void tcp_tasklet_func(unsigned long data)
 void tcp_release_cb(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nflags;
 
-	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
-		if ((1 << sk->sk_state) &
-		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-		     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-			tcp_write_xmit(sk,
-				       tcp_current_mss(sk),
-				       0, 0,
-				       GFP_ATOMIC);
-	}
+	/* perform an atomic operation only if at least one flag is set */
+	do {
+		flags = tp->tsq_flags;
+		if (!(flags & TCP_DEFERRED_ALL))
+			return;
+		nflags = flags & ~TCP_DEFERRED_ALL;
+	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+	if (flags & (1UL << TCP_TSQ_DEFERRED))
+		tcp_tsq_handler(sk);
+
+	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+		tcp_write_timer_handler(sk);
+
+	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+		tcp_delack_timer_handler(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e911e6c523ec..6df36ad55a38 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
-static void tcp_write_timer(unsigned long);
-static void tcp_delack_timer(unsigned long);
-static void tcp_keepalive_timer (unsigned long data);
-
-void tcp_init_xmit_timers(struct sock *sk)
-{
-	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
-				  &tcp_keepalive_timer);
-}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-
 static void tcp_write_err(struct sock *sk)
 {
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk)
 	return 0;
 }
 
-static void tcp_delack_timer(unsigned long data)
+void tcp_delack_timer_handler(struct sock *sk)
 {
-	struct sock *sk = (struct sock *)data;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk)) {
-		/* Try again later. */
-		icsk->icsk_ack.blocked = 1;
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
-		goto out_unlock;
-	}
-
 	sk_mem_reclaim_partial(sk);
 
 	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data)
 out:
 	if (sk_under_memory_pressure(sk))
 		sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_delack_timer_handler(sk);
+	} else {
+		inet_csk(sk)->icsk_ack.blocked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+		/* deleguate our work to tcp_release_cb() */
+		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
@@ -450,19 +443,11 @@ out_reset_timer:
 out:;
 }
 
-static void tcp_write_timer(unsigned long data)
+void tcp_write_timer_handler(struct sock *sk)
 {
-	struct sock *sk = (struct sock *)data;
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
 
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk)) {
-		/* Try again later */
-		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
-		goto out_unlock;
-	}
-
 	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 		goto out;
 
@@ -485,7 +470,19 @@ static void tcp_write_timer(unsigned long data)
 
 out:
 	sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_write_timer_handler(sk);
+	} else {
+		/* deleguate our work to tcp_release_cb() */
+		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
@@ -602,3 +599,10 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
-- 
cgit v1.2.3


From 521f549097a79dc55e18c3bc752ef2127ad70ac5 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 20 Jul 2012 12:02:08 +0300
Subject: ipv4: show pmtu in route list

Override the metrics with rt_pmtu

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9f7ffbe201c9..d547f6fae20d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2909,6 +2909,7 @@ static int rt_fill_info(struct net *net,
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
 	u32 error;
+	u32 metrics[RTAX_MAX];
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
@@ -2953,7 +2954,10 @@ static int rt_fill_info(struct net *net,
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
-	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+	if (rt->rt_pmtu)
+		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
 	if (rt->rt_mark &&
-- 
cgit v1.2.3


From 89aef8921bfbac22f00e04f8450f6e447db13e42 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 11:00:09 -0700
Subject: ipv4: Delete routing cache.

The ipv4 routing cache is non-deterministic, performance wise, and is
subject to reasonably easy to launch denial of service attacks.

The routing cache works great for well behaved traffic, and the world
was a much friendlier place when the tradeoffs that led to the routing
cache's design were considered.

What it boils down to is that the performance of the routing cache is
a product of the traffic patterns seen by a system rather than being a
product of the contents of the routing tables.  The former of which is
controllable by external entitites.

Even for "well behaved" legitimate traffic, high volume sites can see
hit rates in the routing cache of only ~%10.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |   1 -
 net/ipv4/fib_frontend.c |   5 -
 net/ipv4/route.c        | 940 +-----------------------------------------------
 3 files changed, 13 insertions(+), 933 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index ace3cb442519..5dcfeb621e06 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 struct in_device;
 extern int		ip_rt_init(void);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
 					   struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b83203658ee3..f277cf0e6321 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		/* The batch unregister is only called on the first
-		 * device in the list of devices being unregistered.
-		 * Therefore we should not pass dev_net(dev) in here.
-		 */
-		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20d..6d6146d31f22 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int rt_chain_length_max __read_mostly	= 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  *	Interface to generic destination cache.
@@ -152,7 +148,6 @@ static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 			    int how)
@@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
-	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
@@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-	struct rtable __rcu	*chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ	256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ	4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ	2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ	1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ	512
-# else
-#  define RT_HASH_LOCK_SZ	256
-# endif
-#endif
-
-static spinlock_t	*rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-	int i;
-
-	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-			GFP_KERNEL);
-	if (!rt_hash_locks)
-		panic("IP: failed to allocate rt_hash_locks\n");
-
-	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-		spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned int		rt_hash_mask __read_mostly;
-static unsigned int		rt_hash_log  __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-				   int genid)
-{
-	return jhash_3words((__force u32)daddr, (__force u32)saddr,
-			    idx, genid)
-		& rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-	struct seq_net_private p;
-	int bucket;
-	int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	struct rtable *r = NULL;
-
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-			continue;
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-		while (r) {
-			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-			    r->rt_genid == st->genid)
-				return r;
-			r = rcu_dereference_bh(r->dst.rt_next);
-		}
-		rcu_read_unlock_bh();
-	}
-	return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-					  struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-
-	r = rcu_dereference_bh(r->dst.rt_next);
-	while (!r) {
-		rcu_read_unlock_bh();
-		do {
-			if (--st->bucket < 0)
-				return NULL;
-		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-					struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->dst.dev) != seq_file_net(seq))
-			continue;
-		if (r->rt_genid == st->genid)
-			break;
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct rtable *r = rt_cache_get_first(seq);
-
-	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
-			--pos;
-	return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct rt_cache_iter_state *st = seq->private;
 	if (*pos)
-		return rt_cache_get_idx(seq, *pos - 1);
-	st->genid = rt_genid(seq_file_net(seq));
+		return NULL;
 	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r;
-
-	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
-	else
-		r = rt_cache_get_next(seq, v);
 	++*pos;
-	return r;
+	return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 			   "HHUptod\tSpecDst");
-	else {
-		struct rtable *r = v;
-		int len;
-
-		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			   r->dst.dev ? r->dst.dev->name : "*",
-			   (__force u32)r->rt_dst,
-			   (__force u32)r->rt_gateway,
-			   r->rt_flags, atomic_read(&r->dst.__refcnt),
-			   r->dst.__use, 0, (__force u32)r->rt_src,
-			   dst_metric_advmss(&r->dst) + 40,
-			   dst_metric(&r->dst, RTAX_WINDOW), 0,
-			   r->rt_key_tos,
-			   -1, 0, 0, &len);
-
-		seq_printf(seq, "%*s\n", 127 - len, "");
-	}
 	return 0;
 }
 
@@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &rt_cache_seq_ops,
-			sizeof(struct rt_cache_iter_state));
+	return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = {
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_net,
+	.release = seq_release,
 };
 
 
@@ -625,262 +446,11 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-	ip_rt_put(rt);
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-	/* Kill broadcast/multicast entries very aggresively, if they
-	   collide in hash table with more useful entries */
-	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-	unsigned long age;
-	int ret = 0;
-
-	if (atomic_read(&rth->dst.__refcnt))
-		goto out;
-
-	age = jiffies - rth->dst.lastuse;
-	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-	    (age <= tmo2 && rt_valuable(rth)))
-		goto out;
-	ret = 1;
-out:	return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (rt_is_output_route(rt) ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-	return net->ipv4.current_rt_cache_rebuild_count <=
-		net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-				       const struct rtable *rt2)
-{
-	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-		(rt1->rt_mark ^ rt2->rt_mark) |
-		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
-		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
-		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
-/*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-	unsigned int i;
-	struct rtable *rth, *next;
-
-	for (i = 0; i <= rt_hash_mask; i++) {
-		struct rtable __rcu **pprev;
-		struct rtable *list;
-
-		if (process_context && need_resched())
-			cond_resched();
-		rth = rcu_access_pointer(rt_hash_table[i].chain);
-		if (!rth)
-			continue;
-
-		spin_lock_bh(rt_hash_lock_addr(i));
-
-		list = NULL;
-		pprev = &rt_hash_table[i].chain;
-		rth = rcu_dereference_protected(*pprev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-
-		while (rth) {
-			next = rcu_dereference_protected(rth->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-
-			if (!net ||
-			    net_eq(dev_net(rth->dst.dev), net)) {
-				rcu_assign_pointer(*pprev, next);
-				rcu_assign_pointer(rth->dst.rt_next, list);
-				list = rth;
-			} else {
-				pprev = &rth->dst.rt_next;
-			}
-			rth = next;
-		}
-
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; list; list = next) {
-			next = rcu_dereference_protected(list->dst.rt_next, 1);
-			rt_free(list);
-		}
-	}
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-	const struct rtable *aux = head;
-
-	while (aux != rth) {
-		if (compare_hash_inputs(aux, rth))
-			return 0;
-		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-	}
-	return ONE;
-}
-
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth) ||
-			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-
-			/* We only count entries on a chain with equal
-			 * hash inputs once so that entries for
-			 * different QOS levels, and other non-hash
-			 * input attributes don't unfairly skew the
-			 * length computation
-			 */
-			tmo >>= 1;
-			rthp = &rth->dst.rt_next;
-			length += has_noalias(rt_hash_table[i].chain, rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
-	if (delay >= 0)
-		rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-	rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-	net_warn_ratelimited("Route hash chain too long!\n");
-	rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-	static unsigned long expire = RT_GC_TIMEOUT;
-	static unsigned long last_gc;
-	static int rover;
-	static int equilibrium;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long now = jiffies;
-	int goal;
-	int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-	/*
-	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
-	 */
-
-	RT_CACHE_STAT_INC(gc_total);
-
-	if (now - last_gc < ip_rt_gc_min_interval &&
-	    entries < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
-		goto out;
-	}
-
-	entries = dst_entries_get_slow(&ipv4_dst_ops);
-	/* Calculate number of entries, which we want to expire now. */
-	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-	if (goal <= 0) {
-		if (equilibrium < ipv4_dst_ops.gc_thresh)
-			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = entries - equilibrium;
-		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = entries - equilibrium;
-		}
-	} else {
-		/* We are in dangerous area. Try to reduce cache really
-		 * aggressively.
-		 */
-		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = entries - goal;
-	}
-
-	if (now - last_gc >= ip_rt_gc_min_interval)
-		last_gc = now;
-
-	if (goal <= 0) {
-		equilibrium += goal;
-		goto work_done;
-	}
-
-	do {
-		int i, k;
-
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-			unsigned long tmo = expire;
-
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-				if (!rt_is_expired(rth) &&
-					!rt_may_expire(rth, tmo, expire)) {
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				goal--;
-			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
-			if (goal <= 0)
-				break;
-		}
-		rover = k;
-
-		if (goal <= 0)
-			goto work_done;
-
-		/* Goal is not achieved. We stop process if:
-
-		   - if expire reduced to zero. Otherwise, expire is halfed.
-		   - if table is not full.
-		   - if we are called from interrupt.
-		   - jiffies check is just fallback/debug loop breaker.
-		     We will not spin here for long time in any case.
-		 */
-
-		RT_CACHE_STAT_INC(gc_goal_miss);
-
-		if (expire == 0)
-			break;
-
-		expire >>= 1;
-
-		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-			goto out;
-	} while (!in_softirq() && time_before_eq(jiffies, now));
-
-	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	net_warn_ratelimited("dst cache overflow\n");
-	RT_CACHE_STAT_INC(gc_dst_overflow);
-	return 1;
-
-work_done:
-	expire += ip_rt_gc_min_interval;
-	if (expire > ip_rt_gc_timeout ||
-	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-		expire = ip_rt_gc_timeout;
-out:	return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-	int length = 0;
-	const struct rtable *rth = head;
-
-	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-	}
-	return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-				     struct sk_buff *skb, int ifindex)
-{
-	struct rtable	*rth, *cand;
-	struct rtable __rcu **rthp, **candp;
-	unsigned long	now;
-	u32 		min_score;
-	int		chain_length;
-
-restart:
-	chain_length = 0;
-	min_score = ~(u32)0;
-	cand = NULL;
-	candp = NULL;
-	now = jiffies;
-
-	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-		/*
-		 * If we're not caching, just tell the caller we
-		 * were successful and don't touch the route.  The
-		 * caller hold the sole reference to the cache entry, and
-		 * it will be released when the caller is done with it.
-		 * If we drop it here, the callers have no way to resolve routes
-		 * when we're not caching.  Instead, just point *rp at rt, so
-		 * the caller gets a single use out of the route
-		 * Note that we do rt_free on this new route entry, so that
-		 * once its refcount hits zero, we are still able to reap it
-		 * (Thanks Alexey)
-		 * Note: To avoid expensive rcu stuff for this uncached dst,
-		 * we set DST_NOCACHE so that dst_release() can free dst without
-		 * waiting a grace period.
-		 */
-
-		rt->dst.flags |= DST_NOCACHE;
-		goto skip_hashing;
-	}
-
-	rthp = &rt_hash_table[hash].chain;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-			continue;
-		}
-		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-			/* Put it first */
-			*rthp = rth->dst.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			rt_drop(rt);
-			if (skb)
-				skb_dst_set(skb, &rth->dst);
-			return rth;
-		}
-
-		if (!atomic_read(&rth->dst.__refcnt)) {
-			u32 score = rt_score(rth);
-
-			if (score <= min_score) {
-				cand = rth;
-				candp = rthp;
-				min_score = score;
-			}
-		}
-
-		chain_length++;
-
-		rthp = &rth->dst.rt_next;
-	}
-
-	if (cand) {
-		/* ip_rt_gc_elasticity used to be average length of chain
-		 * length, when exceeded gc becomes really aggressive.
-		 *
-		 * The second limit is less certain. At the moment it allows
-		 * only 2 entries per bucket. We will see.
-		 */
-		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
-			rt_free(cand);
-		}
-	} else {
-		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->dst.dev);
-			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(net)) {
-				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->dst.dev->name, num);
-			}
-			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-					ifindex, rt_genid(net));
-			goto restart;
-		}
-	}
-
-	rt->dst.rt_next = rt_hash_table[hash].chain;
-
-	/*
-	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are committed to memory
-	 * before making rt visible to other CPUS.
-	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
-	if (skb)
-		skb_dst_set(skb, &rt->dst);
-	return rt;
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions.  However
  * we still can generate some output.
@@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-	struct rtable __rcu **rthp;
-	struct rtable *aux;
-
-	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	while ((aux = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
-			rt_free(aux);
-			continue;
-		}
-		rthp = &aux->dst.rt_next;
-	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 			     const struct iphdr *iph,
 			     int oif, u8 tos,
@@ -1518,10 +774,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->dst.expires) {
-			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-						rt->rt_oif,
-						rt_genid(dev_net(dst->dev)));
-			rt_del(hash, rt);
+			ip_rt_put(rt);
 			ret = NULL;
 		}
 	}
@@ -1969,7 +1222,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-			 DST_HOST |
+			 DST_HOST | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1978,7 +1231,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned int hash;
 	struct rtable *rth;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
@@ -2042,9 +1294,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
-	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+	skb_dst_set(skb, &rth->dst);
+	return 0;
 
 e_nobufs:
 	return -ENOBUFS;
@@ -2176,7 +1427,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 	struct rtable *rth = NULL;
 	int err;
-	unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,12 +1438,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	if (err)
 		return err;
 
-	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-		       rt_genid(dev_net(rth->dst.dev)));
-	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
-	if (IS_ERR(rth))
-		return PTR_ERR(rth);
+	skb_dst_set(skb, &rth->dst);
 	return 0;
 }
 
@@ -2217,7 +1462,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned int	flags = 0;
 	u32		itag = 0;
 	struct rtable	*rth;
-	unsigned int	hash;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
 
@@ -2339,11 +1583,8 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+	skb_dst_set(skb, &rth->dst);
 	err = 0;
-	if (IS_ERR(rth))
-		err = PTR_ERR(rth);
 	goto out;
 
 no_route:
@@ -2382,46 +1623,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			   u8 tos, struct net_device *dev, bool noref)
 {
-	struct rtable	*rth;
-	unsigned int	hash;
-	int iif = dev->ifindex;
-	struct net *net;
 	int res;
 
-	net = dev_net(dev);
-
 	rcu_read_lock();
 
-	if (!rt_caching(net))
-		goto skip_cache;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->dst.rt_next)) {
-		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-		     (rth->rt_route_iif ^ iif) |
-		     (rth->rt_key_tos ^ tos)) == 0 &&
-		    rth->rt_mark == skb->mark &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			if (noref) {
-				dst_use_noref(&rth->dst, jiffies);
-				skb_dst_set_noref(skb, &rth->dst);
-			} else {
-				dst_use(&rth->dst, jiffies);
-				skb_dst_set(skb, &rth->dst);
-			}
-			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
-			return 0;
-		}
-		RT_CACHE_STAT_INC(in_hlist_search);
-	}
-
-skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2563,10 +1768,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
 	struct net_device *dev_out = NULL;
 	__u8 tos = RT_FL_TOS(fl4);
@@ -2746,57 +1950,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
 			       tos, dev_out, flags);
-	if (!IS_ERR(rth)) {
-		unsigned int hash;
-
-		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-			       rt_genid(dev_net(dev_out)));
-		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-	}
 
 out:
 	rcu_read_unlock();
 	return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-	struct rtable *rth;
-	unsigned int hash;
-
-	if (!rt_caching(net))
-		goto slow_output;
-
-	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->dst.rt_next)) {
-		if (rth->rt_key_dst == flp4->daddr &&
-		    rth->rt_key_src == flp4->saddr &&
-		    rt_is_output_route(rth) &&
-		    rth->rt_oif == flp4->flowi4_oif &&
-		    rth->rt_mark == flp4->flowi4_mark &&
-		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			dst_use(&rth->dst, jiffies);
-			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
-			if (!flp4->saddr)
-				flp4->saddr = rth->rt_src;
-			if (!flp4->daddr)
-				flp4->daddr = rth->rt_dst;
-			return rth;
-		}
-		RT_CACHE_STAT_INC(out_hlist_search);
-	}
-	rcu_read_unlock_bh();
-
-slow_output:
-	return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -3106,43 +2264,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 {
-	struct rtable *rt;
-	int h, s_h;
-	int idx, s_idx;
-	struct net *net;
-
-	net = sock_net(skb->sk);
-
-	s_h = cb->args[0];
-	if (s_h < 0)
-		s_h = 0;
-	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-		if (!rt_hash_table[h].chain)
-			continue;
-		rcu_read_lock_bh();
-		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-				continue;
-			if (rt_is_expired(rt))
-				continue;
-			skb_dst_set_noref(skb, &rt->dst);
-			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					 1, NLM_F_MULTI) <= 0) {
-				skb_dst_drop(skb);
-				rcu_read_unlock_bh();
-				goto done;
-			}
-			skb_dst_drop(skb);
-		}
-		rcu_read_unlock_bh();
-	}
-
-done:
-	cb->args[0] = h;
-	cb->args[1] = idx;
 	return skb->len;
 }
 
@@ -3376,22 +2497,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-	ssize_t ret;
-
-	if (!str)
-		return 0;
-
-	ret = kstrtoul(str, 0, &rhash_entries);
-	if (ret)
-		return 0;
-
-	return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3414,31 +2519,12 @@ int __init ip_rt_init(void)
 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0,
-					rhash_entries ? 0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
 
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
-- 
cgit v1.2.3


From 38a424e4657462fe9f8b76f01a0e879abde99ab4 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 1 Jul 2012 02:02:53 +0000
Subject: ipv4: Kill ip_route_input_noref().

The "noref" argument to ip_route_input_common() is now always ignored
because we do not cache routes, and in that case we must always grab
a reference to the resulting 'dst'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h    | 16 ++--------------
 net/ipv4/arp.c         |  2 +-
 net/ipv4/ip_fragment.c |  4 ++--
 net/ipv4/ip_input.c    |  4 ++--
 net/ipv4/route.c       |  6 +++---
 net/ipv4/xfrm4_input.c |  4 ++--
 6 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 5dcfeb621e06..5c86c4773b2b 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	return ip_route_output_key(net, fl4);
 }
 
-extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
-
-static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
-}
-
-static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
-}
+extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+			  u8 tos, struct net_device *devin);
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			     int oif, u32 mark, u8 protocol, int flow_flags);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757d..c38293f38161 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..7ad88e5e7110 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
 		/* skb dst is stale, drop it, and perform route lookup again */
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+		err = ip_route_input(head, iph->daddr, iph->saddr,
+				     iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d4440f523..4ebc6feee250 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
+		int err = ip_route_input(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EXDEV)
 				NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6d6146d31f22..55eb4634ed60 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1620,8 +1620,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1664,7 +1664,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..58d23a572509 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.3


From 1a00fee4ffb22312a0ac40045ecd6f222b55eb3d Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 1 Jul 2012 02:02:56 +0000
Subject: ipv4: Remove rt_key_{src,dst,tos} from struct rtable.

They are always used in contexts where they can be reconstituted,
or where the finally resolved rt->rt_{src,dst} is semantically
equivalent.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |  5 -----
 net/ipv4/route.c        | 39 +++++++++------------------------------
 net/ipv4/xfrm4_policy.c |  3 ---
 3 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 5c86c4773b2b..935fa59630b0 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -44,14 +44,9 @@ struct fib_info;
 struct rtable {
 	struct dst_entry	dst;
 
-	/* Lookup key. */
-	__be32			rt_key_dst;
-	__be32			rt_key_src;
-
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u8			rt_key_tos;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 55eb4634ed60..c89d690acdfa 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1268,12 +1268,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	rth->dst.output = ip_rt_bug;
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1392,12 +1389,9 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
@@ -1563,12 +1557,9 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 
-	rth->rt_key_dst	= daddr;
-	rth->rt_key_src	= saddr;
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_key_tos	= tos;
 	rth->rt_dst	= daddr;
 	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
@@ -1668,9 +1659,7 @@ EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-				       const struct flowi4 *fl4,
-				       __be32 orig_daddr, __be32 orig_saddr,
-				       int orig_oif, __u8 orig_rtos,
+				       const struct flowi4 *fl4, int orig_oif,
 				       struct net_device *dev_out,
 				       unsigned int flags)
 {
@@ -1721,12 +1710,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rth->dst.output = ip_output;
 
-	rth->rt_key_dst	= orig_daddr;
-	rth->rt_key_src	= orig_saddr;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_key_tos	= orig_rtos;
 	rth->rt_dst	= fl4->daddr;
 	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
@@ -1777,16 +1763,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
-	__be32 orig_daddr;
-	__be32 orig_saddr;
 	int orig_oif;
 
 	res.tclassid	= 0;
 	res.fi		= NULL;
 	res.table	= NULL;
 
-	orig_daddr = fl4->daddr;
-	orig_saddr = fl4->saddr;
 	orig_oif = fl4->flowi4_oif;
 
 	fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -1948,8 +1930,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-			       tos, dev_out, flags);
+	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 
 out:
 	rcu_read_unlock();
@@ -2014,9 +1995,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_key_dst = ort->rt_key_dst;
-		rt->rt_key_src = ort->rt_key_src;
-		rt->rt_key_tos = ort->rt_key_tos;
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
@@ -2058,7 +2036,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
+static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2077,7 +2055,7 @@ static int rt_fill_info(struct net *net,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_tos	= tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2090,9 +2068,9 @@ static int rt_fill_info(struct net *net,
 
 	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
 		goto nla_put_failure;
-	if (rt->rt_key_src) {
+	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+		if (nla_put_be32(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2104,7 +2082,7 @@ static int rt_fill_info(struct net *net,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != rt->rt_key_src) {
+	    rt->rt_src != src) {
 		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
 			goto nla_put_failure;
 	}
@@ -2248,7 +2226,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index fcf7678bc009..2a8d5cfc340f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_key_dst = fl4->daddr;
-	xdst->u.rt.rt_key_src = fl4->saddr;
-	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-- 
cgit v1.2.3


From d6c0a4f609847d6e65658913f9ccbcb1c137cff3 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 1 Jul 2012 02:02:59 +0000
Subject: ipv4: Kill 'rt_src' from 'struct rtable'

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |  1 -
 net/ipv4/route.c        | 34 +++++++++++++++-------------------
 net/ipv4/xfrm4_policy.c |  1 -
 3 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 935fa59630b0..85d1093e42de 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -49,7 +49,6 @@ struct rtable {
 	__u16			rt_type;
 
 	__be32			rt_dst;	/* Path destination	*/
-	__be32			rt_src;	/* Path source		*/
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c89d690acdfa..fc1199dc23e7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1272,7 +1272,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1393,7 +1392,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
@@ -1561,7 +1559,6 @@ local_input:
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1714,7 +1711,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_dst	= fl4->daddr;
-	rth->rt_src	= fl4->saddr;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
@@ -2005,7 +2001,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
-		rt->rt_src = ort->rt_src;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->fi = ort->fi;
 		if (rt->fi)
@@ -2036,7 +2031,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
+static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
@@ -2055,7 +2050,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
-	r->rtm_tos	= tos;
+	r->rtm_tos	= fl4->flowi4_tos;
 	r->rtm_table	= RT_TABLE_MAIN;
 	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
 		goto nla_put_failure;
@@ -2082,11 +2077,11 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		goto nla_put_failure;
 #endif
 	if (!rt_is_input_route(rt) &&
-	    rt->rt_src != src) {
-		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+	    fl4->saddr != src) {
+		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_dst != rt->rt_gateway &&
+	if (fl4->daddr != rt->rt_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
@@ -2116,7 +2111,7 @@ static int rt_fill_info(struct net *net,  __be32 src, u8 tos,
 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
 			int err = ipmr_get_route(net, skb,
-						 rt->rt_src, rt->rt_dst,
+						 fl4->saddr, fl4->daddr,
 						 r, nowait);
 			if (err <= 0) {
 				if (!nowait) {
@@ -2151,6 +2146,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
 	struct rtable *rt = NULL;
+	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
 	u32 iif;
@@ -2185,6 +2181,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = dst;
+	fl4.saddr = src;
+	fl4.flowi4_tos = rtm->rtm_tos;
+	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+	fl4.flowi4_mark = mark;
+
 	if (iif) {
 		struct net_device *dev;
 
@@ -2205,13 +2208,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 		if (err == 0 && rt->dst.error)
 			err = -rt->dst.error;
 	} else {
-		struct flowi4 fl4 = {
-			.daddr = dst,
-			.saddr = src,
-			.flowi4_tos = rtm->rtm_tos,
-			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
-			.flowi4_mark = mark,
-		};
 		rt = ip_route_output_key(net, &fl4);
 
 		err = 0;
@@ -2226,7 +2222,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, src, rtm->rtm_tos, skb,
+	err = rt_fill_info(net, src, &fl4, skb,
 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 2a8d5cfc340f..00d49e415113 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,7 +92,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_src = rt->rt_src;
 	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
-- 
cgit v1.2.3


From b48698895de86e07b685f8e4b8db0f1cd5a97e9a Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Sun, 1 Jul 2012 02:03:01 +0000
Subject: ipv4: Remove 'rt_mark' from 'struct rtable'

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     | 1 -
 net/ipv4/ipmr.c         | 2 +-
 net/ipv4/route.c        | 9 ++-------
 net/ipv4/xfrm4_policy.c | 1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 85d1093e42de..757fe40b6cea 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -52,7 +52,6 @@ struct rtable {
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
-	__u32			rt_mark;
 
 	/* Info on neighbour */
 	__be32			rt_gateway;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5716c6b808d6..eee3bf6676fe 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1797,7 +1797,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.flowi4_tos = RT_TOS(iph->tos),
 		.flowi4_oif = rt->rt_oif,
 		.flowi4_iif = rt->rt_iif,
-		.flowi4_mark = rt->rt_mark,
+		.flowi4_mark = skb->mark,
 	};
 	struct mr_table *mrt;
 	int err;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fc1199dc23e7..264617c98c25 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,7 +1275,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1395,7 +1394,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1562,7 +1560,6 @@ local_input:
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
-	rth->rt_mark    = skb->mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= daddr;
 	rth->fi = NULL;
@@ -1714,7 +1711,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
-	rth->rt_mark    = fl4->flowi4_mark;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = fl4->daddr;
 	rth->fi = NULL;
@@ -1994,7 +1990,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_oif = ort->rt_oif;
-		rt->rt_mark = ort->rt_mark;
 		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
@@ -2091,8 +2086,8 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
-	if (rt->rt_mark &&
-	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+	if (fl4->flowi4_mark &&
+	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
 		goto nla_put_failure;
 
 	error = rt->dst.error;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 00d49e415113..f73ba8210bd3 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -82,7 +82,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_oif = fl4->flowi4_oif;
-	xdst->u.rt.rt_mark = fl4->flowi4_mark;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
-- 
cgit v1.2.3


From f1ce3062c53809d862d8a04e7a0566c3cc4e0bda Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 12 Jul 2012 10:10:17 -0700
Subject: ipv4: Remove 'rt_dst' from 'struct rtable'

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |  1 -
 net/ipv4/route.c        | 45 +++++++++------------------------------------
 net/ipv4/xfrm4_policy.c |  1 -
 3 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 757fe40b6cea..6d111bceb160 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -48,7 +48,6 @@ struct rtable {
 	unsigned int		rt_flags;
 	__u16			rt_type;
 
-	__be32			rt_dst;	/* Path destination	*/
 	int			rt_route_iif;
 	int			rt_iif;
 	int			rt_oif;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 264617c98c25..85d103fee88e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -850,7 +850,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		    peer->rate_tokens == ip_rt_redirect_number)
 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 					     &ip_hdr(skb)->saddr, rt->rt_iif,
-					     &rt->rt_dst, &rt->rt_gateway);
+					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
 #endif
 	}
 out_put_peer:
@@ -1132,8 +1132,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = dst->dev->mtu;
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-
-		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+		if (rt->rt_gateway != 0 && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1271,7 +1270,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1390,7 +1388,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
@@ -1556,7 +1553,6 @@ local_input:
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_dst	= daddr;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
@@ -1707,7 +1703,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_dst	= fl4->daddr;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
@@ -1995,7 +1990,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
-		rt->rt_dst = ort->rt_dst;
 		rt->rt_gateway = ort->rt_gateway;
 		rt->fi = ort->fi;
 		if (rt->fi)
@@ -2026,9 +2020,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
-			struct sk_buff *skb, u32 pid, u32 seq, int event,
-			int nowait, unsigned int flags)
+static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
+			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
+			u32 seq, int event, int nowait, unsigned int flags)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
@@ -2056,7 +2050,7 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	if (rt->rt_flags & RTCF_NOTIFY)
 		r->rtm_flags |= RTM_F_NOTIFY;
 
-	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+	if (nla_put_be32(skb, RTA_DST, dst))
 		goto nla_put_failure;
 	if (src) {
 		r->rtm_src_len = 32;
@@ -2100,29 +2094,8 @@ static int rt_fill_info(struct net *net,  __be32 src, struct flowi4 *fl4,
 	}
 
 	if (rt_is_input_route(rt)) {
-#ifdef CONFIG_IP_MROUTE
-		__be32 dst = rt->rt_dst;
-
-		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
-		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
-			int err = ipmr_get_route(net, skb,
-						 fl4->saddr, fl4->daddr,
-						 r, nowait);
-			if (err <= 0) {
-				if (!nowait) {
-					if (err == 0)
-						return 0;
-					goto nla_put_failure;
-				} else {
-					if (err == -EMSGSIZE)
-						goto nla_put_failure;
-					error = err;
-				}
-			}
-		} else
-#endif
-			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
-				goto nla_put_failure;
+		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+			goto nla_put_failure;
 	}
 
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -2217,7 +2190,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(net, src, &fl4, skb,
+	err = rt_fill_info(net, dst, src, &fl4, skb,
 			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index f73ba8210bd3..6074b694f118 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-	xdst->u.rt.rt_dst = rt->rt_dst;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 
-- 
cgit v1.2.3


From f8126f1d5136be1ca1a3536d43ad7a710b5620f8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 13 Jul 2012 05:03:45 -0700
Subject: ipv4: Adjust semantics of rt->rt_gateway.

In order to allow prefixed routes, we have to adjust how rt_gateway
is set and interpreted.

The new interpretation is:

1) rt_gateway == 0, destination is on-link, nexthop is iph->daddr

2) rt_gateway != 0, destination requires a nexthop gateway

Abstract the fetching of the proper nexthop value using a new
inline helper, rt_nexthop(), as suggested by Joe Perches.

Signed-off-by: David S. Miller <davem@davemloft.net>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 include/net/route.h                 |  7 +++++++
 net/ipv4/arp.c                      |  3 +--
 net/ipv4/inet_connection_sock.c     |  4 ++--
 net/ipv4/ip_gre.c                   |  2 +-
 net/ipv4/ip_output.c                |  2 +-
 net/ipv4/ipip.c                     |  2 +-
 net/ipv4/netfilter/ipt_MASQUERADE.c |  5 +++--
 net/ipv4/route.c                    | 17 +++++++++--------
 8 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 6d111bceb160..3c1eeab9749b 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -70,6 +70,13 @@ static inline bool rt_is_output_route(const struct rtable *rt)
 	return rt->rt_route_iif == 0;
 }
 
+static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
+{
+	if (rt->rt_gateway)
+		return rt->rt_gateway;
+	return daddr;
+}
+
 struct ip_rt_acct {
 	__u32 	o_bytes;
 	__u32 	o_packets;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c38293f38161..a0124eb7dbea 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 		return 1;
 	}
 
-	paddr = skb_rtable(skb)->rt_gateway;
-
+	paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
 	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
 			       paddr, dev))
 		return 0;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c7a4de05ca04..0a290d719bc7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -389,7 +389,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
 	return &rt->dst;
 
@@ -422,7 +422,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
 	return &rt->dst;
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 42c44b1403c9..b062a98574f2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -766,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 
 		if (skb->protocol == htons(ETH_P_IP)) {
 			rt = skb_rtable(skb);
-			dst = rt->rt_gateway;
+			dst = rt_nexthop(rt, old_iph->daddr);
 		}
 #if IS_ENABLED(CONFIG_IPV6)
 		else if (skb->protocol == htons(ETH_P_IPV6)) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c528f841ca4b..4494015f7e32 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2c2c35bace76..99af1f0cc658 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -487,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			dev->stats.tx_fifo_errors++;
 			goto tx_error;
 		}
-		dst = rt->rt_gateway;
+		dst = rt_nexthop(rt, old_iph->daddr);
 	}
 
 	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c79dc87..cbb6a1a6f6f7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	struct nf_nat_ipv4_range newrange;
 	const struct nf_nat_ipv4_multi_range_compat *mr;
 	const struct rtable *rt;
-	__be32 newsrc;
+	__be32 newsrc, nh;
 
 	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
 
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	mr = par->targinfo;
 	rt = skb_rtable(skb);
-	newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+	nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+	newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
 	if (!newsrc) {
 		pr_info("%s ate my IP address\n", par->out->name);
 		return NF_DROP;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85d103fee88e..d1d579638092 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1085,8 +1085,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
 		else
-			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
-					RT_SCOPE_UNIVERSE);
+			src = inet_select_addr(rt->dst.dev,
+					       rt_nexthop(rt, iph->daddr),
+					       RT_SCOPE_UNIVERSE);
 		rcu_read_unlock();
 	}
 	memcpy(addr, &src, 4);
@@ -1132,7 +1133,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = dst->dev->mtu;
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-		if (rt->rt_gateway != 0 && mtu > 576)
+		if (rt->rt_gateway && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1274,7 +1275,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -1392,7 +1393,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_oif 	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
@@ -1557,7 +1558,7 @@ local_input:
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway	= daddr;
+	rth->rt_gateway	= 0;
 	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1707,7 +1708,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_oif	= orig_oif;
 	rth->rt_pmtu	= 0;
-	rth->rt_gateway = fl4->daddr;
+	rth->rt_gateway = 0;
 	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2070,7 +2071,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (fl4->daddr != rt->rt_gateway &&
+	if (rt->rt_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From f5b0a8743601a4477419171f5046bd07d1c080a0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 19 Jul 2012 12:31:33 -0700
Subject: net: Document dst->obsolete better.

Add a big comment explaining how the field works, and use defines
instead of magic constants for the values assigned to it.

Suggested by Joe Perches.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h      | 14 +++++++++++++-
 net/core/dst.c         |  4 ++--
 net/decnet/dn_route.c  |  4 ++--
 net/ipv4/route.c       |  5 +++--
 net/ipv6/route.c       |  4 ++--
 net/sctp/transport.c   |  2 +-
 net/xfrm/xfrm_policy.c | 23 ++++++++++++-----------
 7 files changed, 35 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst.h b/include/net/dst.h
index 51610468c63d..0df661a0c295 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -65,7 +65,19 @@ struct dst_entry {
 	unsigned short		pending_confirm;
 
 	short			error;
+
+	/* A non-zero value of dst->obsolete forces by-hand validation
+	 * of the route entry.  Positive values are set by the generic
+	 * dst layer to indicate that the entry has been forcefully
+	 * destroyed.
+	 *
+	 * Negative values are used by the implementation layer code to
+	 * force invocation of the dst_ops->check() method.
+	 */
 	short			obsolete;
+#define DST_OBSOLETE_NONE	0
+#define DST_OBSOLETE_DEAD	2
+#define DST_OBSOLETE_FORCE_CHK	-1
 	unsigned short		header_len;	/* more space at head required */
 	unsigned short		trailer_len;	/* space to reserve at tail */
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -359,7 +371,7 @@ extern struct dst_entry *dst_destroy(struct dst_entry *dst);
 
 static inline void dst_free(struct dst_entry *dst)
 {
-	if (dst->obsolete > 1)
+	if (dst->obsolete > 0)
 		return;
 	if (!atomic_read(&dst->__refcnt)) {
 		dst = dst_destroy(dst);
diff --git a/net/core/dst.c b/net/core/dst.c
index 07bacff84aa4..069d51d29414 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -94,7 +94,7 @@ loop:
 			 * But we do not have state "obsoleted, but
 			 * referenced by parent", so it is right.
 			 */
-			if (dst->obsolete > 1)
+			if (dst->obsolete > 0)
 				continue;
 
 			___dst_free(dst);
@@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst)
 	 */
 	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
 		dst->input = dst->output = dst_discard;
-	dst->obsolete = 2;
+	dst->obsolete = DST_OBSOLETE_DEAD;
 }
 
 void __dst_free(struct dst_entry *dst)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 47de90d8fe94..23cc11dd4e40 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1176,7 +1176,7 @@ make_route:
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
-	rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST);
+	rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
 	if (rt == NULL)
 		goto e_nobufs;
 
@@ -1444,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
 	}
 
 make_route:
-	rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST);
+	rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
 	if (rt == NULL)
 		goto e_nobufs;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d1d579638092..50d2498c9284 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1221,7 +1221,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 static struct rtable *rt_dst_alloc(struct net_device *dev,
 				   bool nopolicy, bool noxfrm)
 {
-	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
+	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
 			 DST_HOST | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
@@ -1969,9 +1969,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 
 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
-	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
 	struct rtable *ort = (struct rtable *) dst_orig;
+	struct rtable *rt;
 
+	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
 	if (rt) {
 		struct dst_entry *new = &rt->dst;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 84f6564dd372..cf02cb97bbdd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -281,7 +281,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
 					     struct fib6_table *table)
 {
 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
-					0, 0, flags);
+					0, DST_OBSOLETE_NONE, flags);
 
 	if (rt) {
 		struct dst_entry *dst = &rt->dst;
@@ -985,7 +985,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 	struct dst_entry *new = NULL;
 
-	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
+	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
 	if (rt) {
 		new = &rt->dst;
 
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a6b7ee9ce28a..ec3a12b9b802 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -216,7 +216,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport,
 void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 {
 	/* If we don't have a fresh route, look one up */
-	if (!transport->dst || transport->dst->obsolete > 1) {
+	if (!transport->dst || transport->dst->obsolete) {
 		dst_release(transport->dst);
 		transport->af_specific->get_dst(transport, &transport->saddr,
 						&transport->fl, sk);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 65bd1ca51517..c5a5165a5927 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1350,7 +1350,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	default:
 		BUG();
 	}
-	xdst = dst_alloc(dst_ops, NULL, 0, 0, 0);
+	xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);
 
 	if (likely(xdst)) {
 		struct dst_entry *dst = &xdst->u.dst;
@@ -1477,7 +1477,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst1->xfrm = xfrm[i];
 		xdst->xfrm_genid = xfrm[i]->genid;
 
-		dst1->obsolete = -1;
+		dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
 		dst1->flags |= DST_HOST;
 		dst1->lastuse = now;
 
@@ -2219,12 +2219,13 @@ EXPORT_SYMBOL(__xfrm_route_forward);
 static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
-	 * to "-1" to force all XFRM destinations to get validated by
-	 * dst_ops->check on every use.  We do this because when a
-	 * normal route referenced by an XFRM dst is obsoleted we do
-	 * not go looking around for all parent referencing XFRM dsts
-	 * so that we can invalidate them.  It is just too much work.
-	 * Instead we make the checks here on every use.  For example:
+	 * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
+	 * get validated by dst_ops->check on every use.  We do this
+	 * because when a normal route referenced by an XFRM dst is
+	 * obsoleted we do not go looking around for all parent
+	 * referencing XFRM dsts so that we can invalidate them.  It
+	 * is just too much work.  Instead we make the checks here on
+	 * every use.  For example:
 	 *
 	 *	XFRM dst A --> IPv4 dst X
 	 *
@@ -2234,9 +2235,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 	 * stale_bundle() check.
 	 *
 	 * When a policy's bundle is pruned, we dst_free() the XFRM
-	 * dst which causes it's ->obsolete field to be set to a
-	 * positive non-zero integer.  If an XFRM dst has been pruned
-	 * like this, we want to force a new route lookup.
+	 * dst which causes it's ->obsolete field to be set to
+	 * DST_OBSOLETE_DEAD.  If an XFRM dst has been pruned like
+	 * this, we want to force a new route lookup.
 	 */
 	if (dst->obsolete < 0 && !stale_bundle(dst))
 		return dst;
-- 
cgit v1.2.3


From ceb3320610d6f15ff20dd4c042b36473d77de76f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 11:31:28 -0700
Subject: ipv4: Kill routes during PMTU/redirect updates.

Mark them obsolete so there will be a re-lookup to fetch the
FIB nexthop exception info.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h |  1 +
 net/ipv4/route.c  | 41 +++++++++++++++++++++++++++++------------
 2 files changed, 30 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst.h b/include/net/dst.h
index 0df661a0c295..baf597890064 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -78,6 +78,7 @@ struct dst_entry {
 #define DST_OBSOLETE_NONE	0
 #define DST_OBSOLETE_DEAD	2
 #define DST_OBSOLETE_FORCE_CHK	-1
+#define DST_OBSOLETE_KILL	-2
 	unsigned short		header_len;	/* more space at head required */
 	unsigned short		trailer_len;	/* space to reserve at tail */
 #ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 50d2498c9284..d52f7699c2fa 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -673,7 +673,8 @@ out_unlock:
 	return;
 }
 
-static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+			     bool kill_route)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
@@ -728,8 +729,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 						      0, 0);
 			}
-			rt->rt_gateway = new_gw;
-			rt->rt_flags |= RTCF_REDIRECTED;
+			if (kill_route)
+				rt->dst.obsolete = DST_OBSOLETE_KILL;
 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 		}
 		neigh_release(n);
@@ -760,7 +761,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 	rt = (struct rtable *) dst;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
-	__ip_do_redirect(rt, skb, &fl4);
+	__ip_do_redirect(rt, skb, &fl4, true);
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -919,7 +920,7 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct fib_result res;
 
@@ -932,8 +933,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
-	rt->rt_pmtu = mtu;
-	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+	return mtu;
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -943,7 +943,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 	struct flowi4 fl4;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
-	__ip_rt_update_pmtu(rt, &fl4, mtu);
+	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
+
+	if (!rt->rt_pmtu) {
+		dst->obsolete = DST_OBSOLETE_KILL;
+	} else {
+		rt->rt_pmtu = mtu;
+		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+	}
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -989,7 +996,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
 			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		__ip_do_redirect(rt, skb, &fl4);
+		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
 	}
 }
@@ -1004,7 +1011,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 	rt = __ip_route_output_key(sock_net(sk), &fl4);
 	if (!IS_ERR(rt)) {
-		__ip_do_redirect(rt, skb, &fl4);
+		__ip_do_redirect(rt, skb, &fl4, false);
 		ip_rt_put(rt);
 	}
 }
@@ -1014,7 +1021,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (rt_is_expired(rt))
+	/* All IPV4 dsts are created with ->obsolete set to the value
+	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+	 * into this function always.
+	 *
+	 * When a PMTU/redirect information update invalidates a
+	 * route, this is indicated by setting obsolete to
+	 * DST_OBSOLETE_KILL.
+	 */
+	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
 		return NULL;
 	return dst;
 }
@@ -1186,8 +1201,10 @@ restart:
 				dst_set_expires(&rt->dst, diff);
 			}
 		}
-		if (gw)
+		if (gw) {
+			rt->rt_flags |= RTCF_REDIRECTED;
 			rt->rt_gateway = gw;
+		}
 		fnhe->fnhe_stamp = jiffies;
 		break;
 	}
-- 
cgit v1.2.3


From f2bb4bedf35d5167a073dcdddf16543f351ef3ae Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 12:20:47 -0700
Subject: ipv4: Cache output routes in fib_info nexthops.

If we have an output route that lacks nexthop exceptions, we can cache
it in the FIB info nexthop.

Such routes will have DST_HOST cleared because such routes refer to a
family of destinations, rather than just one.

The sequence of the handling of exceptions during route lookup is
adjusted to make the logic work properly.

Before we allocate the route, we lookup the exception.

Then we know if we will cache this route or not, and therefore whether
DST_HOST should be set on the allocated route.

Then we use DST_HOST to key off whether we should store the resulting
route, during rt_set_nexthop(), in the FIB nexthop cache.

With help from Eric Dumazet.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |   2 +
 net/ipv4/fib_semantics.c |   2 +
 net/ipv4/route.c         | 140 ++++++++++++++++++++++++++++++++---------------
 3 files changed, 101 insertions(+), 43 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 2daf096dfc60..fb62c590360e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -46,6 +46,7 @@ struct fib_config {
  };
 
 struct fib_info;
+struct rtable;
 
 struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
@@ -80,6 +81,7 @@ struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
+	struct rtable		*nh_rth_output;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b57d768240d..83d0f42b619a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -171,6 +171,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			dev_put(nexthop_nh->nh_dev);
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
+		if (nexthop_nh->nh_rth_output)
+			dst_release(&nexthop_nh->nh_rth_output->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d52f7699c2fa..8a0260010ea1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1158,8 +1158,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	return mtu;
 }
 
-static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
-			    struct fib_info *fi)
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
 {
 	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 		rt->fi = fi;
@@ -1168,50 +1167,83 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
-static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
 	struct fib_nh_exception *fnhe;
 	u32 hval;
 
+	if (!hash)
+		return NULL;
+
 	hval = fnhe_hashfun(daddr);
 
-restart:
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		__be32 fnhe_daddr, gw;
-		unsigned long expires;
-		unsigned int seq;
-		u32 pmtu;
-
-		seq = read_seqbegin(&fnhe_seqlock);
-		fnhe_daddr = fnhe->fnhe_daddr;
-		gw = fnhe->fnhe_gw;
-		pmtu = fnhe->fnhe_pmtu;
-		expires = fnhe->fnhe_expires;
-		if (read_seqretry(&fnhe_seqlock, seq))
-			goto restart;
-		if (daddr != fnhe_daddr)
-			continue;
-		if (pmtu) {
-			unsigned long diff = expires - jiffies;
+		if (fnhe->fnhe_daddr == daddr)
+			return fnhe;
+	}
+	return NULL;
+}
 
-			if (time_before(jiffies, expires)) {
-				rt->rt_pmtu = pmtu;
-				dst_set_expires(&rt->dst, diff);
-			}
-		}
-		if (gw) {
-			rt->rt_flags |= RTCF_REDIRECTED;
-			rt->rt_gateway = gw;
+static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+			      __be32 daddr)
+{
+	__be32 fnhe_daddr, gw;
+	unsigned long expires;
+	unsigned int seq;
+	u32 pmtu;
+
+restart:
+	seq = read_seqbegin(&fnhe_seqlock);
+	fnhe_daddr = fnhe->fnhe_daddr;
+	gw = fnhe->fnhe_gw;
+	pmtu = fnhe->fnhe_pmtu;
+	expires = fnhe->fnhe_expires;
+	if (read_seqretry(&fnhe_seqlock, seq))
+		goto restart;
+
+	if (daddr != fnhe_daddr)
+		return;
+
+	if (pmtu) {
+		unsigned long diff = expires - jiffies;
+
+		if (time_before(jiffies, expires)) {
+			rt->rt_pmtu = pmtu;
+			dst_set_expires(&rt->dst, diff);
 		}
-		fnhe->fnhe_stamp = jiffies;
-		break;
+	}
+	if (gw) {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		rt->rt_gateway = gw;
+	}
+	fnhe->fnhe_stamp = jiffies;
+}
+
+static inline void rt_release_rcu(struct rcu_head *head)
+{
+	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+	dst_release(dst);
+}
+
+static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
+
+	orig = *p;
+
+	prev = cmpxchg(p, orig, rt);
+	if (prev == orig) {
+		dst_clone(&rt->dst);
+		if (orig)
+			call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
 	}
 }
 
-static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			   const struct fib_result *res,
+			   struct fib_nh_exception *fnhe,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
 	if (fi) {
@@ -1219,12 +1251,15 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 
 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = nh->nh_gw;
-		if (unlikely(nh->nh_exceptions))
-			rt_bind_exception(rt, nh, fl4->daddr);
-		rt_init_metrics(rt, fl4, fi);
+		if (unlikely(fnhe))
+			rt_bind_exception(rt, fnhe, daddr);
+		rt_init_metrics(rt, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = nh->nh_tclassid;
 #endif
+		if (!(rt->dst.flags & DST_HOST) &&
+		    rt_is_output_route(rt))
+			rt_cache_route(nh, rt);
 	}
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1236,10 +1271,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 }
 
 static struct rtable *rt_dst_alloc(struct net_device *dev,
-				   bool nopolicy, bool noxfrm)
+				   bool nopolicy, bool noxfrm, bool will_cache)
 {
 	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
-			 DST_HOST | DST_NOCACHE |
+			 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
 			 (nopolicy ? DST_NOPOLICY : 0) |
 			 (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1276,7 +1311,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			goto e_err;
 	}
 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1349,6 +1384,7 @@ static int __mkroute_input(struct sk_buff *skb,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
+	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
@@ -1395,9 +1431,13 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
+	fnhe = NULL;
+	if (res->fi)
+		fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+
 	rth = rt_dst_alloc(out_dev->dev,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(out_dev, NOXFRM));
+			   IN_DEV_CONF_GET(out_dev, NOXFRM), false);
 	if (!rth) {
 		err = -ENOBUFS;
 		goto cleanup;
@@ -1416,7 +1456,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
+	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
 
 	*result = rth;
 	err = 0;
@@ -1558,7 +1598,7 @@ brd_input:
 
 local_input:
 	rth = rt_dst_alloc(net->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1672,6 +1712,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 				       unsigned int flags)
 {
 	struct fib_info *fi = res->fi;
+	struct fib_nh_exception *fnhe;
 	struct in_device *in_dev;
 	u16 type = res->type;
 	struct rtable *rth;
@@ -1710,9 +1751,22 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 			fi = NULL;
 	}
 
+	fnhe = NULL;
+	if (fi) {
+		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+		if (!fnhe) {
+			rth = FIB_RES_NH(*res).nh_rth_output;
+			if (rth &&
+			    rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) {
+				dst_use(&rth->dst, jiffies);
+				return rth;
+			}
+		}
+	}
 	rth = rt_dst_alloc(dev_out,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(in_dev, NOXFRM));
+			   IN_DEV_CONF_GET(in_dev, NOXFRM),
+			   fi && !fnhe);
 	if (!rth)
 		return ERR_PTR(-ENOBUFS);
 
@@ -1749,7 +1803,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 #endif
 	}
 
-	rt_set_nexthop(rth, fl4, res, fi, type, 0);
+	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
 
 	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
 		rth->dst.flags |= DST_NOCACHE;
-- 
cgit v1.2.3


From d2d68ba9fe8b38eb03124b3176a013bb8aa2b5e5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 12:58:50 -0700
Subject: ipv4: Cache input routes in fib_info nexthops.

Caching input routes is slightly simpler than output routes, since we
don't need to be concerned with nexthop exceptions.  (locally
destined, and routed packets, never trigger PMTU events or redirects
that will be processed by us).

However, we have to elide caching for the DIRECTSRC and non-zero itag
cases.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  1 +
 net/ipv4/fib_semantics.c |  2 ++
 net/ipv4/route.c         | 55 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 46 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index fb62c590360e..e69c3a47153d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -82,6 +82,7 @@ struct fib_nh {
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable		*nh_rth_output;
+	struct rtable		*nh_rth_input;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 83d0f42b619a..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -173,6 +173,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			free_nh_exceptions(nexthop_nh);
 		if (nexthop_nh->nh_rth_output)
 			dst_release(&nexthop_nh->nh_rth_output->dst);
+		if (nexthop_nh->nh_rth_input)
+			dst_release(&nexthop_nh->nh_rth_input->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8a0260010ea1..97cca8a03d94 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1231,6 +1231,9 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
 	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
 
+	if (rt_is_input_route(rt))
+		p = &nh->nh_rth_input;
+
 	orig = *p;
 
 	prev = cmpxchg(p, orig, rt);
@@ -1241,6 +1244,11 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 	}
 }
 
+static bool rt_cache_valid(struct rtable *rt)
+{
+	return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
+}
+
 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			   const struct fib_result *res,
 			   struct fib_nh_exception *fnhe,
@@ -1257,8 +1265,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
-		if (!(rt->dst.flags & DST_HOST) &&
-		    rt_is_output_route(rt))
+		if (!(rt->dst.flags & DST_HOST))
 			rt_cache_route(nh, rt);
 	}
 
@@ -1384,11 +1391,11 @@ static int __mkroute_input(struct sk_buff *skb,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
-	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
 	unsigned int flags = 0;
+	bool do_cache;
 	u32 itag;
 
 	/* get a working reference to the output device */
@@ -1431,13 +1438,21 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	fnhe = NULL;
-	if (res->fi)
-		fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+	do_cache = false;
+	if (res->fi) {
+		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+			rth = FIB_RES_NH(*res).nh_rth_input;
+			if (rt_cache_valid(rth)) {
+				dst_use(&rth->dst, jiffies);
+				goto out;
+			}
+			do_cache = true;
+		}
+	}
 
 	rth = rt_dst_alloc(out_dev->dev,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
-			   IN_DEV_CONF_GET(out_dev, NOXFRM), false);
+			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
 	if (!rth) {
 		err = -ENOBUFS;
 		goto cleanup;
@@ -1456,8 +1471,8 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
-
+	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+out:
 	*result = rth;
 	err = 0;
  cleanup:
@@ -1509,6 +1524,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct rtable	*rth;
 	int		err = -EINVAL;
 	struct net    *net = dev_net(dev);
+	bool do_cache;
 
 	/* IP on this device is disabled. */
 
@@ -1522,6 +1538,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
 		goto martian_source;
 
+	res.fi = NULL;
 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
 		goto brd_input;
 
@@ -1597,8 +1614,20 @@ brd_input:
 	RT_CACHE_STAT_INC(in_brd);
 
 local_input:
+	do_cache = false;
+	if (res.fi) {
+		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+			rth = FIB_RES_NH(res).nh_rth_input;
+			if (rt_cache_valid(rth)) {
+				dst_use(&rth->dst, jiffies);
+				goto set_and_out;
+			}
+			do_cache = true;
+		}
+	}
+
 	rth = rt_dst_alloc(net->loopback_dev,
-			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
 	if (!rth)
 		goto e_nobufs;
 
@@ -1622,6 +1651,9 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+	if (do_cache)
+		rt_cache_route(&FIB_RES_NH(res), rth);
+set_and_out:
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;
@@ -1756,8 +1788,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
 		if (!fnhe) {
 			rth = FIB_RES_NH(*res).nh_rth_output;
-			if (rth &&
-			    rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) {
+			if (rt_cache_valid(rth)) {
 				dst_use(&rth->dst, jiffies);
 				return rth;
 			}
-- 
cgit v1.2.3


From ba3f7f04ef2b19aace38f855aedd17fe43035d50 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 14:02:46 -0700
Subject: ipv4: Kill FLOWI_FLAG_RT_NOCACHE and associated code.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h                 | 1 -
 include/net/inet_connection_sock.h | 3 +--
 net/dccp/ipv4.c                    | 2 +-
 net/ipv4/inet_connection_sock.c    | 5 +----
 net/ipv4/route.c                   | 3 ---
 net/ipv4/tcp_ipv4.c                | 4 ++--
 6 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/flow.h b/include/net/flow.h
index ce9cb7656b47..e1dd5082ec7e 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -21,7 +21,6 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_CAN_SLEEP		0x02
-#define FLOWI_FLAG_RT_NOCACHE		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 2cf44b4ed2e6..5ee66f517b4f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 extern struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    struct flowi4 *fl4,
-					    const struct request_sock *req,
-					    bool nocache);
+					    const struct request_sock *req);
 extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk,
 						   struct sock *newsk,
 						   const struct request_sock *req);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ab4f44c9bb21..25428d0c50c9 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -508,7 +508,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 	struct dst_entry *dst;
 	struct flowi4 fl4;
 
-	dst = inet_csk_route_req(sk, &fl4, req, false);
+	dst = inet_csk_route_req(sk, &fl4, req);
 	if (dst == NULL)
 		goto out;
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0a290d719bc7..db0cf17c00f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
 				     struct flowi4 *fl4,
-				     const struct request_sock *req,
-				     bool nocache)
+				     const struct request_sock *req)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	struct net *net = sock_net(sk);
 	int flags = inet_sk_flowi_flags(sk);
 
-	if (nocache)
-		flags |= FLOWI_FLAG_RT_NOCACHE;
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 97cca8a03d94..7e1c0ed0ef70 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1836,9 +1836,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
 
-	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
-		rth->dst.flags |= DST_NOCACHE;
-
 	return rth;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1d8b75a58981..59110caeb074 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -824,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	struct sk_buff * skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
 	skb = tcp_make_synack(sk, dst, req, rvp);
@@ -1378,7 +1378,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
+		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
 		    fl4.daddr == saddr) {
 			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
-- 
cgit v1.2.3


From 93ac53410a82a4f1bf2baf9d65d95cc29f2774ca Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 14:09:39 -0700
Subject: ipv4: Dirty less cache lines in route caching paths.

Don't bother incrementing dst->__use and setting dst->lastuse,
they are completely pointless and just slow things down.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7e1c0ed0ef70..b8707779b85d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1443,7 +1443,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		if (!(flags & RTCF_DIRECTSRC) && !itag) {
 			rth = FIB_RES_NH(*res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				goto out;
 			}
 			do_cache = true;
@@ -1619,7 +1619,7 @@ local_input:
 		if (!(flags & RTCF_DIRECTSRC) && !itag) {
 			rth = FIB_RES_NH(res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				goto set_and_out;
 			}
 			do_cache = true;
@@ -1789,7 +1789,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 		if (!fnhe) {
 			rth = FIB_RES_NH(*res).nh_rth_output;
 			if (rt_cache_valid(rth)) {
-				dst_use(&rth->dst, jiffies);
+				dst_hold(&rth->dst);
 				return rth;
 			}
 		}
-- 
cgit v1.2.3


From 4fd551d7bed93af60af61c5a324b8f5dff37953a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 14:39:44 -0700
Subject: ipv4: Kill rt->rt_oif

Never actually used.

It was being set on output routes to the original OIF specified in the
flow key used for the lookup.

Adjust the only user, ipmr_rt_fib_lookup(), for greater correctness of
the flowi4_oif and flowi4_iif values, thanks to feedback from Julian
Anastasov.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     | 1 -
 net/ipv4/ipmr.c         | 7 +++++--
 net/ipv4/route.c        | 5 -----
 net/ipv4/xfrm4_policy.c | 1 -
 4 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 3c1eeab9749b..e789a92fd602 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -50,7 +50,6 @@ struct rtable {
 
 	int			rt_route_iif;
 	int			rt_iif;
-	int			rt_oif;
 
 	/* Info on neighbour */
 	__be32			rt_gateway;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index eee3bf6676fe..8eec8f4a0536 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1795,8 +1795,11 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
 		.daddr = iph->daddr,
 		.saddr = iph->saddr,
 		.flowi4_tos = RT_TOS(iph->tos),
-		.flowi4_oif = rt->rt_oif,
-		.flowi4_iif = rt->rt_iif,
+		.flowi4_oif = (rt_is_output_route(rt) ?
+			       skb->dev->ifindex : 0),
+		.flowi4_iif = (rt_is_output_route(rt) ?
+			       net->loopback_dev->ifindex :
+			       skb->dev->ifindex),
 		.flowi4_mark = skb->mark,
 	};
 	struct mr_table *mrt;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b8707779b85d..a280b6ac8eb2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1332,7 +1332,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
-	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1463,7 +1462,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_type = res->type;
 	rth->rt_route_iif = in_dev->dev->ifindex;
 	rth->rt_iif 	= in_dev->dev->ifindex;
-	rth->rt_oif 	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1642,7 +1640,6 @@ local_input:
 	rth->rt_type	= res.type;
 	rth->rt_route_iif = dev->ifindex;
 	rth->rt_iif	= dev->ifindex;
-	rth->rt_oif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	rth->fi = NULL;
@@ -1808,7 +1805,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_type	= type;
 	rth->rt_route_iif = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
-	rth->rt_oif	= orig_oif;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
 	rth->fi = NULL;
@@ -2085,7 +2081,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 
 		rt->rt_route_iif = ort->rt_route_iif;
 		rt->rt_iif = ort->rt_iif;
-		rt->rt_oif = ort->rt_oif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
 		rt->rt_genid = rt_genid(net);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6074b694f118..3c99b4cdd290 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -81,7 +81,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
-	xdst->u.rt.rt_oif = fl4->flowi4_oif;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
-- 
cgit v1.2.3


From 9917e1e8762745191eba5a3bf2040278cbddbee1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 14:44:26 -0700
Subject: ipv4: Turn rt->rt_route_iif into rt->rt_is_input.

That is this value's only use, as a boolean to indicate whether
a route is an input route or not.

So implement it that way, using a u16 gap present in the struct
already.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |  6 +++---
 net/ipv4/route.c        | 10 +++++-----
 net/ipv4/xfrm4_policy.c |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index e789a92fd602..4bafe0bfe829 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -47,8 +47,8 @@ struct rtable {
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
+	__u16			rt_is_input;
 
-	int			rt_route_iif;
 	int			rt_iif;
 
 	/* Info on neighbour */
@@ -61,12 +61,12 @@ struct rtable {
 
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
-	return rt->rt_route_iif != 0;
+	return rt->rt_is_input != 0;
 }
 
 static inline bool rt_is_output_route(const struct rtable *rt)
 {
-	return rt->rt_route_iif == 0;
+	return rt->rt_is_input == 0;
 }
 
 static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a280b6ac8eb2..fac4c4acdbac 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1330,7 +1330,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
-	rth->rt_route_iif = dev->ifindex;
+	rth->rt_is_input= 1;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1460,7 +1460,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
-	rth->rt_route_iif = in_dev->dev->ifindex;
+	rth->rt_is_input = 1;
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1638,7 +1638,7 @@ local_input:
 	rth->rt_genid = rt_genid(net);
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
-	rth->rt_route_iif = dev->ifindex;
+	rth->rt_is_input = 1;
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
@@ -1803,7 +1803,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
-	rth->rt_route_iif = 0;
+	rth->rt_is_input = 0;
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
@@ -2079,7 +2079,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		if (new->dev)
 			dev_hold(new->dev);
 
-		rt->rt_route_iif = ort->rt_route_iif;
+		rt->rt_is_input = ort->rt_is_input;
 		rt->rt_iif = ort->rt_iif;
 		rt->rt_pmtu = ort->rt_pmtu;
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3c99b4cdd290..c6281847f16a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,7 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	struct rtable *rt = (struct rtable *)xdst->route;
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
 
 	xdst->u.dst.dev = dev;
@@ -87,6 +86,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 
 	/* Sheit... I remember I did this right. Apparently,
 	 * it was magically lost, so this code needs audit */
+	xdst->u.rt.rt_is_input = rt->rt_is_input;
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
-- 
cgit v1.2.3


From 2860583fe840d972573363dfa190b2149a604534 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Jul 2012 14:55:59 -0700
Subject: ipv4: Kill rt->fi

It's not really needed.

We only grabbed a reference to the fib_info for the sake of fib_info
local metrics.

However, fib_info objects are freed using RCU, as are therefore their
private metrics (if any).

We would have triggered a route cache flush if we eliminated a
reference to a fib_info object in the routing tables.

Therefore, any existing cached routes will first check and see that
they have been invalidated before an errant reference to these
metric values would occur.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  1 -
 net/ipv4/route.c    | 32 +-------------------------------
 2 files changed, 1 insertion(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 4bafe0bfe829..60d611dc5cee 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -56,7 +56,6 @@ struct rtable {
 
 	/* Miscellaneous cached information */
 	u32			rt_pmtu;
-	struct fib_info		*fi; /* for client ref to shared metrics */
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fac4c4acdbac..9add08869c75 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -141,7 +141,6 @@ static int ip_rt_min_advmss __read_mostly	= 256;
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
-static void		 ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -171,7 +170,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
 	.cow_metrics =		ipv4_cow_metrics,
-	.destroy =		ipv4_dst_destroy,
 	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
@@ -1034,17 +1032,6 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	return dst;
 }
 
-static void ipv4_dst_destroy(struct dst_entry *dst)
-{
-	struct rtable *rt = (struct rtable *) dst;
-
-	if (rt->fi) {
-		fib_info_put(rt->fi);
-		rt->fi = NULL;
-	}
-}
-
-
 static void ipv4_link_failure(struct sk_buff *skb)
 {
 	struct rtable *rt;
@@ -1158,15 +1145,6 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	return mtu;
 }
 
-static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
-{
-	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
-		rt->fi = fi;
-		atomic_inc(&fi->fib_clntref);
-	}
-	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
-}
-
 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -1261,7 +1239,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			rt->rt_gateway = nh->nh_gw;
 		if (unlikely(fnhe))
 			rt_bind_exception(rt, fnhe, daddr);
-		rt_init_metrics(rt, fi);
+		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
@@ -1334,7 +1312,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
-	rth->fi = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1464,7 +1441,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= in_dev->dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
-	rth->fi = NULL;
 
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
@@ -1642,7 +1618,6 @@ local_input:
 	rth->rt_iif	= dev->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
-	rth->fi = NULL;
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
 		rth->dst.error= -err;
@@ -1807,7 +1782,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
-	rth->fi = NULL;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2052,7 +2026,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
 static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family			=	AF_INET,
 	.protocol		=	cpu_to_be16(ETH_P_IP),
-	.destroy		=	ipv4_dst_destroy,
 	.check			=	ipv4_blackhole_dst_check,
 	.mtu			=	ipv4_blackhole_mtu,
 	.default_advmss		=	ipv4_default_advmss,
@@ -2087,9 +2060,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_gateway = ort->rt_gateway;
-		rt->fi = ort->fi;
-		if (rt->fi)
-			atomic_inc(&rt->fi->fib_clntref);
 
 		dst_free(new);
 	}
-- 
cgit v1.2.3


From 0980e56e506b1e45022fad00bca8c8a974fda4e6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 20 Jul 2012 22:28:51 +0000
Subject: ipv4: tcp: set unicast_sock uc_ttl to -1

Set unicast_sock uc_ttl to -1 so that we select the right ttl,
instead of sending packets with a 0 ttl.

Bug added in commit be9f4a44e7d4 (ipv4: tcp: remove per net tcp_sock)

Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c528f841ca4b..665abbb7122a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1476,7 +1476,8 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
 		.sk_allocation	= GFP_ATOMIC,
 		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
 	},
-	.pmtudisc = IP_PMTUDISC_WANT,
+	.pmtudisc	= IP_PMTUDISC_WANT,
+	.uc_ttl		= -1,
 };
 
 void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
-- 
cgit v1.2.3


From 9a0a9502cbf19d31f7387e3066f3d1a491bef616 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 23 Jul 2012 10:46:38 +0300
Subject: tcp: avoid oops in tcp_metrics and reset tcpm_stamp

	In tcp_tw_remember_stamp we incorrectly checked tw
instead of tm, it can lead to oops if the cached entry is
not found.

	tcpm_stamp was not updated in tcpm_check_stamp when
tcpm_suck_dst was called, move the update into tcpm_suck_dst,
so that we do not call it infinitely on every next cache hit
after TCP_METRICS_TIMEOUT.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 992f1bff4fc6..2288a6399e1e 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -107,6 +107,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 {
 	u32 val;
 
+	tm->tcpm_stamp = jiffies;
+
 	val = 0;
 	if (dst_metric_locked(dst, RTAX_RTT))
 		val |= 1 << TCP_METRIC_RTT;
@@ -158,7 +160,6 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
 			goto out_unlock;
 	}
 	tm->tcpm_addr = *addr;
-	tm->tcpm_stamp = jiffies;
 
 	tcpm_suck_dst(tm, dst);
 
@@ -621,7 +622,7 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 
 	rcu_read_lock();
 	tm = __tcp_get_metrics_tw(tw);
-	if (tw) {
+	if (tm) {
 		const struct tcp_timewait_sock *tcptw;
 		struct sock *sk = (struct sock *) tw;
 
-- 
cgit v1.2.3


From 563d34d05786263893ba4a1042eb9b9374127cf5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2012 09:48:52 +0200
Subject: tcp: dont drop MTU reduction indications
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ICMP messages generated in output path if frame length is bigger than
mtu are actually lost because socket is owned by user (doing the xmit)

One example is the ipgre_tunnel_xmit() calling
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));

We had a similar case fixed in commit a34a101e1e6 (ipv6: disable GSO on
sockets hitting dst_allfrag).

Problem of such fix is that it relied on retransmit timers, so short tcp
sessions paid a too big latency increase price.

This patch uses the tcp_release_cb() infrastructure so that MTU
reduction messages (ICMP messages) are not lost, and no extra delay
is added in TCP transmits.

Reported-by: Maciej Żenczykowski <maze@google.com>
Diagnosed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Tore Anderson <tore@fud.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  6 ++++++
 include/net/sock.h    |  1 +
 net/ipv4/tcp_ipv4.c   | 19 +++++++++++++++----
 net/ipv4/tcp_output.c |  6 +++++-
 net/ipv6/tcp_ipv6.c   | 40 ++++++++++++++++++++++++----------------
 5 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2761856987b2..eb125a4c30b3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -493,6 +493,9 @@ struct tcp_sock {
 		u32		  probe_seq_start;
 		u32		  probe_seq_end;
 	} mtu_probe;
+	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
+			   * while socket was owned by user.
+			   */
 
 #ifdef CONFIG_TCP_MD5SIG
 /* TCP AF-Specific parts; only used by MD5 Signature support so far */
@@ -518,6 +521,9 @@ enum tsq_flags {
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
 	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
 	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
+	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
+				    * tcp_v{4|6}_mtu_reduced()
+				    */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff --git a/include/net/sock.h b/include/net/sock.h
index 88de092df50f..e067f8c18f88 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -859,6 +859,7 @@ struct proto {
 						struct sk_buff *skb);
 
 	void		(*release_cb)(struct sock *sk);
+	void		(*mtu_reduced)(struct sock *sk);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 59110caeb074..bc5432e3c778 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -275,12 +275,15 @@ failure:
 EXPORT_SYMBOL(tcp_v4_connect);
 
 /*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
  */
-static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
+static void tcp_v4_mtu_reduced(struct sock *sk)
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
+	u32 mtu = tcp_sk(sk)->mtu_info;
 
 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
@@ -373,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
 	 * servers this needs to be solved differently.
+	 * We do take care of PMTU discovery (RFC1191) special case :
+	 * we can receive locally generated ICMP messages while socket is held.
 	 */
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) &&
+	    type != ICMP_DEST_UNREACH &&
+	    code != ICMP_FRAG_NEEDED)
 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
 	if (sk->sk_state == TCP_CLOSE)
@@ -409,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			goto out;
 
 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			tp->mtu_info = info;
 			if (!sock_owned_by_user(sk))
-				do_pmtu_discovery(sk, iph, info);
+				tcp_v4_mtu_reduced(sk);
+			else
+				set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
 			goto out;
 		}
 
@@ -2596,6 +2606,7 @@ struct proto tcp_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v4_mtu_reduced,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 950aebfd9967..33cd065cfbd8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -885,7 +885,8 @@ static void tcp_tasklet_func(unsigned long data)
 
 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
 			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED))
+			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
+			  (1UL << TCP_MTU_REDUCED_DEFERRED))
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -914,6 +915,9 @@ void tcp_release_cb(struct sock *sk)
 
 	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
 		tcp_delack_timer_handler(sk);
+
+	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+		sk->sk_prot->mtu_reduced(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0302ec3fecfc..f49476e2d884 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -315,6 +315,23 @@ failure:
 	return err;
 }
 
+static void tcp_v6_mtu_reduced(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+		return;
+
+	dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+	if (!dst)
+		return;
+
+	if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+		tcp_sync_mss(sk, dst_mtu(dst));
+		tcp_simple_retransmit(sk);
+	}
+}
+
 static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		u8 type, u8 code, int offset, __be32 info)
 {
@@ -342,7 +359,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
 	if (sk->sk_state == TCP_CLOSE)
@@ -371,21 +388,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dst_entry *dst;
-
-		if (sock_owned_by_user(sk))
-			goto out;
-		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
-			goto out;
-
-		dst = inet6_csk_update_pmtu(sk, ntohl(info));
-		if (!dst)
-			goto out;
-
-		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
-			tcp_sync_mss(sk, dst_mtu(dst));
-			tcp_simple_retransmit(sk);
-		}
+		tp->mtu_info = ntohl(info);
+		if (!sock_owned_by_user(sk))
+			tcp_v6_mtu_reduced(sk);
+		else
+			set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
 		goto out;
 	}
 
@@ -1949,6 +1956,7 @@ struct proto tcpv6_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v6_mtu_reduced,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
-- 
cgit v1.2.3


From 8fe5cb873b7ef7f4fa49477455e8f2e3d555230e Mon Sep 17 00:00:00 2001
From: Lin Ming <mlin@ss.pku.edu.cn>
Date: Mon, 23 Jul 2012 04:11:21 +0000
Subject: ipv4: Remove redundant assignment

It is redundant to set no_addr and accept_local to 0 and then set them
with other values just after that.

Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f277cf0e6321..8732cc7920ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -258,7 +258,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fl4.flowi4_tos = tos;
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
-	no_addr = accept_local = 0;
 	no_addr = idev->ifa_list == NULL;
 
 	accept_local = IN_DEV_ACCEPT_LOCAL(idev);
-- 
cgit v1.2.3


From e7d4b18cbebc635fafd634688bbf66c59912879f Mon Sep 17 00:00:00 2001
From: Saurabh <saurabh.mohan@vyatta.com>
Date: Mon, 23 Jul 2012 07:52:04 +0000
Subject: net/ipv4/ip_vti.c: Fix __rcu warnings detected by sparse.

With CONFIG_SPARSE_RCU_POINTER=y sparse identified references which did not
specificy __rcu in ip_vti.c

Signed-off-by: Saurabh Mohan <saurabh.mohan@vyatta.com>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index c41b5c359936..3511ffba7bd4 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -55,7 +55,7 @@ struct vti_net {
 	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
 	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
 	struct ip_tunnel __rcu *tunnels_wc[1];
-	struct ip_tunnel **tunnels[4];
+	struct ip_tunnel __rcu **tunnels[4];
 
 	struct net_device *fb_tunnel_dev;
 };
@@ -160,8 +160,8 @@ static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
 	return NULL;
 }
 
-static struct ip_tunnel **__vti_bucket(struct vti_net *ipn,
-				       struct ip_tunnel_parm *parms)
+static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
+					     struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
@@ -179,8 +179,8 @@ static struct ip_tunnel **__vti_bucket(struct vti_net *ipn,
 	return &ipn->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **vti_bucket(struct vti_net *ipn,
-					    struct ip_tunnel *t)
+static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
+						  struct ip_tunnel *t)
 {
 	return __vti_bucket(ipn, &t->parms);
 }
-- 
cgit v1.2.3


From 838942a594017817d33b2d914152305054e255af Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 Jul 2012 13:20:26 -0700
Subject: ipv4: Really ignore ICMP address requests/replies.

Alexey removed kernel side support for requests, and the
only thing we do for replies is log a message if something
doesn't look right.

As Alexey's comment indicates, this belongs in userspace (if
anywhere), and thus we can safely just get rid of this code.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 84 ++-------------------------------------------------------
 1 file changed, 2 insertions(+), 82 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index ea3a996de95b..f2a06beffbd3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -837,86 +837,6 @@ out_err:
 	goto out;
 }
 
-
-/*
- *	Handle ICMP_ADDRESS_MASK requests.  (RFC950)
- *
- * RFC1122 (3.2.2.9).  A host MUST only send replies to
- * ADDRESS_MASK requests if it's been configured as an address mask
- * agent.  Receiving a request doesn't constitute implicit permission to
- * act as one. Of course, implementing this correctly requires (SHOULD)
- * a way to turn the functionality on and off.  Another one for sysctl(),
- * I guess. -- MS
- *
- * RFC1812 (4.3.3.9).	A router MUST implement it.
- *			A router SHOULD have switch turning it on/off.
- *		      	This switch MUST be ON by default.
- *
- * Gratuitous replies, zero-source replies are not implemented,
- * that complies with RFC. DO NOT implement them!!! All the idea
- * of broadcast addrmask replies as specified in RFC950 is broken.
- * The problem is that it is not uncommon to have several prefixes
- * on one physical interface. Moreover, addrmask agent can even be
- * not aware of existing another prefixes.
- * If source is zero, addrmask agent cannot choose correct prefix.
- * Gratuitous mask announcements suffer from the same problem.
- * RFC1812 explains it, but still allows to use ADDRMASK,
- * that is pretty silly. --ANK
- *
- * All these rules are so bizarre, that I removed kernel addrmask
- * support at all. It is wrong, it is obsolete, nobody uses it in
- * any case. --ANK
- *
- * Furthermore you can do it with a usermode address agent program
- * anyway...
- */
-
-static void icmp_address(struct sk_buff *skb)
-{
-#if 0
-	net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
-#endif
-}
-
-/*
- * RFC1812 (4.3.3.9).	A router SHOULD listen all replies, and complain
- *			loudly if an inconsistency is found.
- * called with rcu_read_lock()
- */
-
-static void icmp_address_reply(struct sk_buff *skb)
-{
-	struct rtable *rt = skb_rtable(skb);
-	struct net_device *dev = skb->dev;
-	struct in_device *in_dev;
-	struct in_ifaddr *ifa;
-
-	if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
-		return;
-
-	in_dev = __in_dev_get_rcu(dev);
-	if (!in_dev)
-		return;
-
-	if (in_dev->ifa_list &&
-	    IN_DEV_LOG_MARTIANS(in_dev) &&
-	    IN_DEV_FORWARD(in_dev)) {
-		__be32 _mask, *mp;
-
-		mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
-		BUG_ON(mp == NULL);
-		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-			if (*mp == ifa->ifa_mask &&
-			    inet_ifa_match(ip_hdr(skb)->saddr, ifa))
-				break;
-		}
-		if (!ifa)
-			net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
-					     mp,
-					     dev->name, &ip_hdr(skb)->saddr);
-	}
-}
-
 static void icmp_discard(struct sk_buff *skb)
 {
 }
@@ -1080,10 +1000,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
 		.handler = icmp_discard,
 	},
 	[ICMP_ADDRESS] = {
-		.handler = icmp_address,
+		.handler = icmp_discard,
 	},
 	[ICMP_ADDRESSREPLY] = {
-		.handler = icmp_address_reply,
+		.handler = icmp_discard,
 	},
 };
 
-- 
cgit v1.2.3


From fe3edf45792a7d2f0edff4e2fcdd9a84c1a388a0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 Jul 2012 13:22:20 -0700
Subject: ipv4: Remove all RTCF_DIRECTSRC handliing.

The last and final kernel user, ICMP address replies,
has been removed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9add08869c75..34017be87c85 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1391,9 +1391,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	if (err)
-		flags |= RTCF_DIRECTSRC;
-
 	if (out_dev == in_dev && err &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
@@ -1416,7 +1413,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	do_cache = false;
 	if (res->fi) {
-		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+		if (!itag) {
 			rth = FIB_RES_NH(*res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
 				dst_hold(&rth->dst);
@@ -1558,8 +1555,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 					  dev, in_dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
-		if (err)
-			flags |= RTCF_DIRECTSRC;
 		goto local_input;
 	}
 
@@ -1580,8 +1575,6 @@ brd_input:
 					  in_dev, &itag);
 		if (err < 0)
 			goto martian_source_keep_err;
-		if (err)
-			flags |= RTCF_DIRECTSRC;
 	}
 	flags |= RTCF_BROADCAST;
 	res.type = RTN_BROADCAST;
@@ -1590,7 +1583,7 @@ brd_input:
 local_input:
 	do_cache = false;
 	if (res.fi) {
-		if (!(flags & RTCF_DIRECTSRC) && !itag) {
+		if (!itag) {
 			rth = FIB_RES_NH(res).nh_rth_input;
 			if (rt_cache_valid(rth)) {
 				dst_hold(&rth->dst);
-- 
cgit v1.2.3


From 92101b3b2e3178087127709a556b091dae314e9e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 Jul 2012 16:29:00 -0700
Subject: ipv4: Prepare for change of rt->rt_iif encoding.

Use inet_iif() consistently, and for TCP record the input interface of
cached RX dst in inet sock.

rt->rt_iif is going to be encoded differently, so that we can
legitimately cache input routes in the FIB info more aggressively.

When the input interface is "use SKB device index" the rt->rt_iif will
be set to zero.

This forces us to move the TCP RX dst cache installation into the ipv4
specific code, and as well it should since doing the route caching for
ipv6 is pointless at the moment since it is not inspected in the ipv6
input paths yet.

Also, remove the unlikely on dst->obsolete, all ipv4 dsts have
obsolete set to a non-zero value to force invocation of the check
callback.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h |  1 +
 net/dccp/ipv4.c         |  2 +-
 net/ipv4/icmp.c         |  2 +-
 net/ipv4/ip_sockglue.c  |  5 ++---
 net/ipv4/route.c        |  2 +-
 net/ipv4/tcp_input.c    | 12 ------------
 net/ipv4/tcp_ipv4.c     | 24 ++++++++++++++++++------
 net/sched/cls_route.c   |  2 +-
 net/sched/em_meta.c     |  2 +-
 net/sctp/protocol.c     |  2 +-
 10 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 924d7b98ab60..613cfa401672 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -172,6 +172,7 @@ struct inet_sock {
 	int			uc_index;
 	int			mc_index;
 	__be32			mc_addr;
+	int			rx_dst_ifindex;
 	struct ip_mc_socklist __rcu	*mc_list;
 	struct inet_cork_full	cork;
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 25428d0c50c9..176ecdba4a22 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -481,7 +481,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
 	struct rtable *rt;
 	const struct iphdr *iph = ip_hdr(skb);
 	struct flowi4 fl4 = {
-		.flowi4_oif = skb_rtable(skb)->rt_iif,
+		.flowi4_oif = inet_iif(skb),
 		.daddr = iph->saddr,
 		.saddr = iph->daddr,
 		.flowi4_tos = RT_CONN_FLAGS(sk),
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f2a06beffbd3..f2eccd531746 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		rcu_read_lock();
 		if (rt_is_input_route(rt) &&
 		    net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
-			dev = dev_get_by_index_rcu(net, rt->rt_iif);
+			dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
 
 		if (dev)
 			saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index de29f46f68b0..5eea4a811042 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1027,10 +1027,9 @@ e_inval:
 void ipv4_pktinfo_prepare(struct sk_buff *skb)
 {
 	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
-	const struct rtable *rt = skb_rtable(skb);
 
-	if (rt) {
-		pktinfo->ipi_ifindex = rt->rt_iif;
+	if (skb_rtable(skb)) {
+		pktinfo->ipi_ifindex = inet_iif(skb);
 		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
 	} else {
 		pktinfo->ipi_ifindex = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 34017be87c85..f6be78119396 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -848,7 +848,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		if (log_martians &&
 		    peer->rate_tokens == ip_rt_redirect_number)
 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
-					     &ip_hdr(skb)->saddr, rt->rt_iif,
+					     &ip_hdr(skb)->saddr, inet_iif(skb),
 					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
 #endif
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 21d7f8f3a7a5..3e07a64ca44e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5391,18 +5391,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (sk->sk_rx_dst) {
-		struct dst_entry *dst = sk->sk_rx_dst;
-		if (unlikely(dst->obsolete)) {
-			if (dst->ops->check(dst, 0) == NULL) {
-				dst_release(dst);
-				sk->sk_rx_dst = NULL;
-			}
-		}
-	}
-	if (unlikely(sk->sk_rx_dst == NULL))
-		sk->sk_rx_dst = dst_clone(skb_dst(skb));
-
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bc5432e3c778..3e30548ac32a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1618,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 		sock_rps_save_rxhash(sk, skb);
+		if (sk->sk_rx_dst) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+		if (unlikely(sk->sk_rx_dst == NULL)) {
+			struct inet_sock *icsk = inet_sk(sk);
+			struct rtable *rt = skb_rtable(skb);
+
+			sk->sk_rx_dst = dst_clone(&rt->dst);
+			icsk->rx_dst_ifindex = inet_iif(skb);
+		}
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
@@ -1700,14 +1714,12 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 		skb->destructor = sock_edemux;
 		if (sk->sk_state != TCP_TIME_WAIT) {
 			struct dst_entry *dst = sk->sk_rx_dst;
+			struct inet_sock *icsk = inet_sk(sk);
 			if (dst)
 				dst = dst_check(dst, 0);
-			if (dst) {
-				struct rtable *rt = (struct rtable *) dst;
-
-				if (rt->rt_iif == dev->ifindex)
-					skb_dst_set_noref(skb, dst);
-			}
+			if (dst &&
+			    icsk->rx_dst_ifindex == dev->ifindex)
+				skb_dst_set_noref(skb, dst);
 		}
 	}
 }
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 36fec4227401..44f405cb9aaf 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -143,7 +143,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	if (head == NULL)
 		goto old_method;
 
-	iif = ((struct rtable *)dst)->rt_iif;
+	iif = inet_iif(skb);
 
 	h = route4_fastmap_hash(id, iif);
 	if (id == head->fastmap[h].id &&
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 4790c696cbce..4ab6e3325573 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -264,7 +264,7 @@ META_COLLECTOR(int_rtiif)
 	if (unlikely(skb_rtable(skb) == NULL))
 		*err = -1;
 	else
-		dst->value = skb_rtable(skb)->rt_iif;
+		dst->value = inet_iif(skb);
 }
 
 /**************************************************************************
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9c90811d1134..1f89c4e69645 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -568,7 +568,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk,
 /* What interface did this skb arrive on? */
 static int sctp_v4_skb_iif(const struct sk_buff *skb)
 {
-	return skb_rtable(skb)->rt_iif;
+	return inet_iif(skb);
 }
 
 /* Was this packet marked by Explicit Congestion Notification? */
-- 
cgit v1.2.3


From 13378cad02afc2adc6c0e07fca03903c7ada0b37 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 Jul 2012 13:57:45 -0700
Subject: ipv4: Change rt->rt_iif encoding.

On input packet processing, rt->rt_iif will be zero if we should
use skb->dev->ifindex.

Since we access rt->rt_iif consistently via inet_iif(), that is
the only spot whose interpretation have to adjust.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 6 +++++-
 net/ipv4/route.c    | 8 ++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/route.h b/include/net/route.h
index 60d611dc5cee..c29ef2733f2d 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -277,7 +277,11 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 
 static inline int inet_iif(const struct sk_buff *skb)
 {
-	return skb_rtable(skb)->rt_iif;
+	int iif = skb_rtable(skb)->rt_iif;
+
+	if (iif)
+		return iif;
+	return skb->skb_iif;
 }
 
 extern int sysctl_ip_default_ttl;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6be78119396..6bcb8fc71cbc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1309,7 +1309,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	rth->rt_is_input= 1;
-	rth->rt_iif	= dev->ifindex;
+	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	if (our) {
@@ -1435,7 +1435,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_flags = flags;
 	rth->rt_type = res->type;
 	rth->rt_is_input = 1;
-	rth->rt_iif 	= in_dev->dev->ifindex;
+	rth->rt_iif 	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 
@@ -1608,7 +1608,7 @@ local_input:
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	rth->rt_type	= res.type;
 	rth->rt_is_input = 1;
-	rth->rt_iif	= dev->ifindex;
+	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
 	if (res.type == RTN_UNREACHABLE) {
@@ -1772,7 +1772,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_flags	= flags;
 	rth->rt_type	= type;
 	rth->rt_is_input = 0;
-	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
+	rth->rt_iif	= orig_oif ? : 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
 
-- 
cgit v1.2.3