From 12d50c46dc0f7fd2e625c4befaa5fa5740a7a594 Mon Sep 17 00:00:00 2001
From: Krishna Kumar
Date: Tue, 8 Dec 2009 22:26:13 +0000
Subject: tcp: Remove check in __tcp_push_pending_frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tcp_push checks tcp_send_head and calls __tcp_push_pending_frames,
which again checks tcp_send_head, and this unnecessary check is done
for every other caller of __tcp_push_pending_frames.

Remove the tcp_send_head check in __tcp_push_pending_frames and add
the check to tcp_push_pending_frames. Other functions call
__tcp_push_pending_frames only when tcp_send_head would evaluate to
true.

Signed-off-by: Krishna Kumar
Acked-by: Ilpo Järvinen
Signed-off-by: David S. Miller
---
 include/net/tcp.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 34f5cc24d903..185e22baecb1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -856,13 +856,6 @@ static inline void tcp_check_probe_timer(struct sock *sk)
 					  icsk->icsk_rto, TCP_RTO_MAX);
 }
 
-static inline void tcp_push_pending_frames(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	__tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
-}
-
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
 {
 	tp->snd_wl1 = seq;
@@ -1342,6 +1335,15 @@ static inline int tcp_write_queue_empty(struct sock *sk)
 	return skb_queue_empty(&sk->sk_write_queue);
 }
 
+static inline void tcp_push_pending_frames(struct sock *sk)
+{
+	if (tcp_send_head(sk)) {
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		__tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
+	}
+}
+
 /* Start sequence of the highest skb with SACKed bit, valid only if
  * sacked > 0 or when the caller has ensured validity by itself. */
--
cgit v1.2.3

From 31d12926e37291970dd4f6e9940df3897766a81d Mon Sep 17 00:00:00 2001
From: laurent chavey
Date: Tue, 15 Dec 2009 11:15:28 +0000
Subject: net: Add rtnetlink init_rcvwnd to set the TCP initial receive window

Add rtnetlink init_rcvwnd to set the TCP initial receive window size
advertised by passive and active TCP connections.

The current Linux TCP implementation limits the advertised TCP initial
receive window to the one prescribed by slow start. For short-lived
TCP connections used for transaction-type traffic (e.g. HTTP
requests), bounding the advertised initial receive window results in
increased latency to complete the transaction.

Setting the initial congestion window is already possible via
rtnetlink init_cwnd, but that feature is of little use without the
ability to set a larger initial receive window. The rtnetlink
init_rcvwnd allows increasing the TCP initial receive window, so a
connection can advertise a larger receive window than the one bounded
by slow start.

Signed-off-by: Laurent Chavey
Signed-off-by: David S. Miller
---
 include/linux/rtnetlink.h |  2 ++
 include/net/dst.h         |  2 --
 include/net/tcp.h         |  3 ++-
 net/ipv4/syncookies.c     |  3 ++-
 net/ipv4/tcp_output.c     | 17 +++++++++++++----
 net/ipv6/syncookies.c     |  3 ++-
 6 files changed, 21 insertions(+), 9 deletions(-)
(limited to 'include/net/tcp.h')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 05330fc5b436..9590364fe8b5 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -362,6 +362,8 @@ enum {
 #define RTAX_FEATURES RTAX_FEATURES
 	RTAX_RTO_MIN,
 #define RTAX_RTO_MIN RTAX_RTO_MIN
+	RTAX_INITRWND,
+#define RTAX_INITRWND RTAX_INITRWND
 	__RTAX_MAX
 };
 
diff --git a/include/net/dst.h b/include/net/dst.h
index 39c4a5963e12..ce078cda6b74 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -83,8 +83,6 @@ struct dst_entry {
 	 * (L1_CACHE_SIZE would be too much)
 	 */
 #ifdef CONFIG_64BIT
-	long			__pad_to_align_refcnt[2];
-#else
 	long			__pad_to_align_refcnt[1];
 #endif
 	/*
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 185e22baecb1..788c99f98597 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -965,7 +965,8 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
 /* Determine a window scaling and initial window to offer. */
 extern void tcp_select_initial_window(int __space, __u32 mss,
 				      __u32 *rcv_wnd, __u32 *window_clamp,
-				      int wscale_ok, __u8 *rcv_wscale);
+				      int wscale_ok, __u8 *rcv_wscale,
+				      __u32 init_rcv_wnd);
 
 static inline int tcp_win_from_space(int space)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 66fd80ef2473..5c24db4a3c91 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -358,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
-				  ireq->wscale_ok, &rcv_wscale);
+				  ireq->wscale_ok, &rcv_wscale,
+				  dst_metric(&rt->u.dst, RTAX_INITRWND));
 
 	ireq->rcv_wscale = rcv_wscale;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 12b2af36eab8..4a1605d3f909 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -183,7 +183,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
  */
 void tcp_select_initial_window(int __space, __u32 mss,
 			       __u32 *rcv_wnd, __u32 *window_clamp,
-			       int wscale_ok, __u8 *rcv_wscale)
+			       int wscale_ok, __u8 *rcv_wscale,
+			       __u32 init_rcv_wnd)
 {
 	unsigned int space = (__space < 0 ? 0 : __space);
@@ -232,7 +233,13 @@ void tcp_select_initial_window(int __space, __u32 mss,
 			init_cwnd = 2;
 		else if (mss > 1460)
 			init_cwnd = 3;
-		if (*rcv_wnd > init_cwnd * mss)
+		/* when initializing use the value from init_rcv_wnd
+		 * rather than the default from above
+		 */
+		if (init_rcv_wnd &&
+		    (*rcv_wnd > init_rcv_wnd * mss))
+			*rcv_wnd = init_rcv_wnd * mss;
+		else if (*rcv_wnd > init_cwnd * mss)
 			*rcv_wnd = init_cwnd * mss;
 	}
 
@@ -2417,7 +2424,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 					&req->rcv_wnd,
 					&req->window_clamp,
 					ireq->wscale_ok,
-					&rcv_wscale);
+					&rcv_wscale,
+					dst_metric(dst, RTAX_INITRWND));
 		ireq->rcv_wscale = rcv_wscale;
 	}
 
@@ -2544,7 +2552,8 @@ static void tcp_connect_init(struct sock *sk)
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
 				  sysctl_tcp_window_scaling,
-				  &rcv_wscale);
+				  &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
 	tp->rcv_ssthresh = tp->rcv_wnd;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7208a06576c6..34d1f0690d7e 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -269,7 +269,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
-				  ireq->wscale_ok, &rcv_wscale);
+				  ireq->wscale_ok, &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
 
 	ireq->rcv_wscale = rcv_wscale;
 
--
cgit v1.2.3

From 72659ecce68588b74f6c46862c2b4cec137d7a5a Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Sun, 17 Jan 2010 19:09:39 -0800
Subject: tcp: account SYN-ACK timeouts & retransmissions

Currently we don't increment SYN-ACK timeouts & retransmissions
although we do increment the same stats for SYN. We seem to have lost
the SYN-ACK accounting with the introduction of tcp_syn_recv_timer
(commit 2248761e in the netdev-vger-cvs tree).

This patch fixes this issue. In the process we also rename the v4/v6
syn/ack retransmit functions for clarity. We also add a new
request_sock operation (syn_ack_timeout) so we can keep the code in
inet_connection_sock.c protocol agnostic.

Signed-off-by: Octavian Purdila
Signed-off-by: David S. Miller
---
 include/net/request_sock.h      |  2 ++
 include/net/tcp.h               |  2 ++
 net/ipv4/inet_connection_sock.c |  2 ++
 net/ipv4/tcp_ipv4.c             | 18 ++++++++++--------
 net/ipv4/tcp_timer.c            |  6 ++++++
 net/ipv6/tcp_ipv6.c             | 12 ++++++++++--
 6 files changed, 32 insertions(+), 10 deletions(-)
(limited to 'include/net/tcp.h')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index c9b50ebd9ce9..99e6e19b57c2 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -45,6 +45,8 @@ struct request_sock_ops {
 	void		(*send_reset)(struct sock *sk,
 				      struct sk_buff *skb);
 	void		(*destructor)(struct request_sock *req);
+	void		(*syn_ack_timeout)(struct sock *sk,
+					   struct request_sock *req);
 };
 
 /* struct request_sock - mini sock to represent a connection request
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 788c99f98597..87d164b9bd8f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -400,6 +400,8 @@ extern int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
 					char __user *optval, unsigned int optlen);
 extern void tcp_set_keepalive(struct sock *sk, int val);
+extern void tcp_syn_ack_timeout(struct sock *sk,
+				struct request_sock *req);
 extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 			struct msghdr *msg,
 			size_t len, int nonblock,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ee16475f8fc3..8da6429269dd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -529,6 +529,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 			syn_ack_recalc(req, thresh, max_retries,
 				       queue->rskq_defer_accept,
 				       &expire, &resend);
+			if (req->rsk_ops->syn_ack_timeout)
+				req->rsk_ops->syn_ack_timeout(parent, req);
 			if (!expire &&
 			    (!resend ||
 			     !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 382f667238ec..356f544c4c10 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -742,9 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  *	This still operates on a request_sock only, not on a big
  *	socket.
  */
-static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
-				struct request_sock *req,
-				struct request_values *rvp)
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+			      struct request_sock *req,
+			      struct request_values *rvp)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	int err = -1;
@@ -775,10 +775,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	return err;
 }
 
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 			     struct request_values *rvp)
 {
-	return __tcp_v4_send_synack(sk, NULL, req, rvp);
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+	return tcp_v4_send_synack(sk, NULL, req, rvp);
 }
 
 /*
@@ -1192,10 +1193,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
-	.rtx_syn_ack	=	tcp_v4_send_synack,
+	.rtx_syn_ack	=	tcp_v4_rtx_synack,
 	.send_ack	=	tcp_v4_reqsk_send_ack,
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
+	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1373,8 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_rsk(req)->snt_isn = isn;
 
-	if (__tcp_v4_send_synack(sk, dst, req,
-				 (struct request_values *)&tmp_ext) ||
+	if (tcp_v4_send_synack(sk, dst, req,
+			       (struct request_values *)&tmp_ext) ||
 	    want_cookie)
 		goto drop_and_free;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8816a20c2597..de7d1bf9114f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -474,6 +474,12 @@ static void tcp_synack_timer(struct sock *sk)
 				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 }
 
+void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
 void tcp_set_keepalive(struct sock *sk, int val)
 {
 	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1c832bf198b3..82f2dea0e39e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -520,6 +520,13 @@ done:
 	return err;
 }
 
+static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
+			     struct request_values *rvp)
+{
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+	return tcp_v6_send_synack(sk, req, rvp);
+}
+
 static inline void syn_flood_warning(struct sk_buff *skb)
 {
 #ifdef CONFIG_SYN_COOKIES
@@ -890,10 +897,11 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
-	.rtx_syn_ack	=	tcp_v6_send_synack,
+	.rtx_syn_ack	=	tcp_v6_rtx_synack,
 	.send_ack	=	tcp_v6_reqsk_send_ack,
 	.destructor	=	tcp_v6_reqsk_destructor,
-	.send_reset	=	tcp_v6_send_reset
+	.send_reset	=	tcp_v6_send_reset,
+	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
--
cgit v1.2.3

From 7d720c3e4f0c4fc152a6bf17e24244a3c85412d2 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 16 Feb 2010 15:20:26 +0000
Subject: percpu: add __percpu sparse annotations to net

Add __percpu sparse annotations to net.

These annotations are to make sparse consider percpu variables to be
in a different address space and warn if accessed without going
through percpu accessors. This patch doesn't affect normal builds.
The macro and type tricks around snmp stats make things a bit
interesting. The DEFINE/DECLARE_SNMP_STAT() macros mark the target
field as __percpu, and the SNMP_UPD_PO_STATS() macro is updated
accordingly. All snmp_mib_*() users which used to cast the argument
to (void **) are updated to cast it to (void __percpu **).

Signed-off-by: Tejun Heo
Acked-by: David S. Miller
Cc: Patrick McHardy
Cc: Arnaldo Carvalho de Melo
Cc: Vlad Yasevich
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller
---
 include/net/ip.h              |  6 +++---
 include/net/ipcomp.h          |  2 +-
 include/net/neighbour.h       |  2 +-
 include/net/netns/conntrack.h |  2 +-
 include/net/netns/core.h      |  2 +-
 include/net/route.h           |  2 +-
 include/net/snmp.h            | 12 +++++++----
 include/net/tcp.h             |  2 +-
 net/8021q/vlan.h              |  2 +-
 net/dccp/proto.c              |  5 +++--
 net/ipv4/af_inet.c            | 46 +++++++++++++++++++++----------------------
 net/ipv4/proc.c               | 28 +++++++++++++-------------
 net/ipv4/route.c              |  2 +-
 net/ipv4/tcp.c                | 21 +++++++++++---------
 net/ipv6/addrconf.c           | 24 +++++++++++-----------
 net/ipv6/af_inet6.c           | 28 +++++++++++++-------------
 net/ipv6/proc.c               | 23 ++++++++++++----------
 net/sctp/proc.c               |  2 +-
 net/sctp/protocol.c           |  5 +++--
 net/xfrm/xfrm_ipcomp.c        | 16 +++++++--------
 net/xfrm/xfrm_policy.c        |  6 +++---
 net/xfrm/xfrm_proc.c          |  3 ++-
 22 files changed, 127 insertions(+), 114 deletions(-)
(limited to 'include/net/tcp.h')

diff --git a/include/net/ip.h b/include/net/ip.h
index 7bc47873e3fc..503994a38ed1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -174,9 +174,9 @@ extern struct ipv4_config ipv4_config;
 #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd)
 #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)
 
-extern unsigned long snmp_fold_field(void *mib[], int offt);
-extern int snmp_mib_init(void *ptr[2], size_t mibsize);
-extern void snmp_mib_free(void *ptr[2]);
+extern unsigned long snmp_fold_field(void __percpu *mib[], int offt);
+extern int snmp_mib_init(void __percpu *ptr[2], size_t mibsize);
+extern void snmp_mib_free(void __percpu *ptr[2]);
 
 extern struct local_ports {
 	seqlock_t	lock;
diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h
index 2a1092abaa07..cc4f30cd7315 100644
--- a/include/net/ipcomp.h
+++ b/include/net/ipcomp.h
@@ -9,7 +9,7 @@ struct crypto_comp;
 
 struct ipcomp_data {
 	u16 threshold;
-	struct crypto_comp **tfms;
+	struct crypto_comp * __percpu *tfms;
 };
 
 struct ip_comp_hdr;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7834f470d14a..da1d58be31b7 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -164,7 +164,7 @@ struct neigh_table {
 	rwlock_t		lock;
 	unsigned long		last_rand;
 	struct kmem_cache	*kmem_cachep;
-	struct neigh_statistics	*stats;
+	struct neigh_statistics	__percpu *stats;
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
 	__u32			hash_rnd;
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 63d449807d9b..d4958d4c6574 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -17,7 +17,7 @@ struct netns_ct {
 	struct hlist_head	*expect_hash;
 	struct hlist_nulls_head	unconfirmed;
 	struct hlist_nulls_head	dying;
-	struct ip_conntrack_stat *stat;
+	struct ip_conntrack_stat __percpu *stat;
 	int			sysctl_events;
 	unsigned int		sysctl_events_retry_timeout;
 	int			sysctl_acct;
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 24d4be76bbd1..78eb1ff75475 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -10,7 +10,7 @@
 struct netns_core {
 	int	sysctl_somaxconn;
 
-	struct prot_inuse	*inuse;
+	struct prot_inuse __percpu *inuse;
 };
 
 #endif
diff --git a/include/net/route.h b/include/net/route.h
index bce6dd68d27b..2c9fba7f7731 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -101,7 +101,7 @@ struct rt_cache_stat {
 	unsigned int out_hlist_search;
 };
 
-extern struct ip_rt_acct *ip_rt_acct;
+extern struct ip_rt_acct __percpu *ip_rt_acct;
 
 struct in_device;
 extern int ip_rt_init(void);
diff --git a/include/net/snmp.h b/include/net/snmp.h
index da02ee027d69..692ee0061dc4 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -129,9 +129,9 @@ struct linux_xfrm_mib {
  * nonlocked_atomic_inc() primitives -AK
  */
 #define DEFINE_SNMP_STAT(type, name)	\
-	__typeof__(type) *name[2]
+	__typeof__(type) __percpu *name[2]
 #define DECLARE_SNMP_STAT(type, name)	\
-	extern __typeof__(type) *name[2]
+	extern __typeof__(type) __percpu *name[2]
 
 #define SNMP_STAT_BHPTR(name)	(name[0])
 #define SNMP_STAT_USRPTR(name)	(name[1])
@@ -148,9 +148,13 @@ struct linux_xfrm_mib {
 			__this_cpu_add(mib[0]->mibs[field], addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend)	\
 			this_cpu_add(mib[1]->mibs[field], addend)
+/*
+ * Use "__typeof__(*mib[0]) *ptr" instead of "__typeof__(mib[0]) ptr"
+ * to make @ptr a non-percpu pointer.
+ */
 #define SNMP_UPD_PO_STATS(mib, basefield, addend)	\
 	do { \
-		__typeof__(mib[0]) ptr; \
+		__typeof__(*mib[0]) *ptr; \
 		preempt_disable(); \
 		ptr = this_cpu_ptr((mib)[!in_softirq()]); \
 		ptr->mibs[basefield##PKTS]++; \
@@ -159,7 +163,7 @@ struct linux_xfrm_mib {
 	} while (0)
 #define SNMP_UPD_PO_STATS_BH(mib, basefield, addend)	\
 	do { \
-		__typeof__(mib[0]) ptr = \
+		__typeof__(*mib[0]) *ptr = \
 			__this_cpu_ptr((mib)[!in_softirq()]); \
 		ptr->mibs[basefield##PKTS]++; \
 		ptr->mibs[basefield##OCTETS] += addend;\
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 87d164b9bd8f..75a00c80bdda 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1189,7 +1189,7 @@ extern int tcp_v4_md5_do_del(struct sock *sk,
 #define tcp_twsk_md5_key(twsk)	NULL
 #endif
 
-extern struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *);
+extern struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *);
 extern void tcp_free_md5sig_pool(void);
 
 extern struct tcp_md5sig_pool	*__tcp_get_md5sig_pool(int cpu);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 5685296017e9..6abdcac1b2e8 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -61,7 +61,7 @@ struct vlan_dev_info {
 	struct proc_dir_entry			*dent;
 	unsigned long				cnt_inc_headroom_on_tx;
 	unsigned long				cnt_encap_on_xmit;
-	struct vlan_rx_stats			*vlan_rx_stats;
+	struct vlan_rx_stats __percpu		*vlan_rx_stats;
 };
 
 static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 85ec1cb7fd41..0ef7061920c0 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1005,12 +1005,13 @@ EXPORT_SYMBOL_GPL(dccp_shutdown);
 
 static inline int dccp_mib_init(void)
 {
-	return snmp_mib_init((void**)dccp_statistics, sizeof(struct dccp_mib));
+	return snmp_mib_init((void __percpu **)dccp_statistics,
+			     sizeof(struct dccp_mib));
 }
 
 static inline void dccp_mib_exit(void)
 {
-	snmp_mib_free((void**)dccp_statistics);
+	snmp_mib_free((void __percpu **)dccp_statistics);
 }
 
 static int thash_entries;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7d12c6a9b19b..33b7dffa7732 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1385,7 +1385,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 {
 	unsigned long res = 0;
 	int i;
@@ -1398,7 +1398,7 @@ unsigned long snmp_fold_field(void *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
-int snmp_mib_init(void *ptr[2], size_t mibsize)
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
 {
 	BUG_ON(ptr == NULL);
 	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
@@ -1416,7 +1416,7 @@ err0:
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void *ptr[2])
+void snmp_mib_free(void __percpu *ptr[2])
 {
 	BUG_ON(ptr == NULL);
 	free_percpu(ptr[0]);
@@ -1460,25 +1460,25 @@ static const struct net_protocol icmp_protocol = {
 
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
-	if (snmp_mib_init((void **)net->mib.tcp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
 			  sizeof(struct tcp_mib)) < 0)
 		goto err_tcp_mib;
-	if (snmp_mib_init((void **)net->mib.ip_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
 			  sizeof(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp_mib_init((void **)net->mib.net_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
 			  sizeof(struct linux_mib)) < 0)
 		goto err_net_mib;
-	if (snmp_mib_init((void **)net->mib.udp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
 			  sizeof(struct udp_mib)) < 0)
 		goto err_udp_mib;
-	if (snmp_mib_init((void **)net->mib.udplite_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
 			  sizeof(struct udp_mib)) < 0)
 		goto err_udplite_mib;
-	if (snmp_mib_init((void **)net->mib.icmp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
 			  sizeof(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void **)net->mib.icmpmsg_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
 			  sizeof(struct icmpmsg_mib)) < 0)
 		goto err_icmpmsg_mib;
 
@@ -1486,30 +1486,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 	return 0;
 
 err_icmpmsg_mib:
-	snmp_mib_free((void **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
 err_icmp_mib:
-	snmp_mib_free((void **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
 err_udplite_mib:
-	snmp_mib_free((void **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
 err_udp_mib:
-	snmp_mib_free((void **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
 err_net_mib:
-	snmp_mib_free((void **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
 err_ip_mib:
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 err_tcp_mib:
 	return -ENOMEM;
 }
 
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
-	snmp_mib_free((void **)net->mib.icmpmsg_statistics);
-	snmp_mib_free((void **)net->mib.icmp_statistics);
-	snmp_mib_free((void **)net->mib.udplite_statistics);
-	snmp_mib_free((void **)net->mib.udp_statistics);
-	snmp_mib_free((void **)net->mib.net_statistics);
-	snmp_mib_free((void **)net->mib.ip_statistics);
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 }
 
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1b09a6dde7c0..242ed2307370 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -280,7 +280,7 @@ static void icmpmsg_put(struct seq_file *seq)
 
 	count = 0;
 	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
-		val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i);
+		val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
 		if (val) {
 			type[count] = i;
 			vals[count++] = val;
@@ -307,18 +307,18 @@ static void icmp_put(struct seq_file *seq)
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " Out%s", icmpmibmap[i].name);
 	seq_printf(seq, "\nIcmp: %lu %lu",
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
 				icmpmibmap[i].index));
 	seq_printf(seq, " %lu %lu",
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
 				icmpmibmap[i].index | 0x100));
 }
 
@@ -341,7 +341,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.ip_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
 					   snmp4_ipstats_list[i].entry));
 
 	icmp_put(seq);	/* RFC 2011 compatibility */
@@ -356,11 +356,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 		/* MaxConn field is signed, RFC 2012 */
 		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
 			seq_printf(seq, " %ld",
-				   snmp_fold_field((void **)net->mib.tcp_statistics,
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 		else
 			seq_printf(seq, " %lu",
-				   snmp_fold_field((void **)net->mib.tcp_statistics,
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 	}
 
@@ -371,7 +371,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nUdp:");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.udp_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.udp_statistics,
 					   snmp4_udp_list[i].entry));
 
 	/* the UDP and UDP-Lite MIBs are the same */
@@ -382,7 +382,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nUdpLite:");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.udplite_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
 					   snmp4_udp_list[i].entry));
 
 	seq_putc(seq, '\n');
@@ -419,7 +419,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nTcpExt:");
 	for (i = 0; snmp4_net_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.net_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.net_statistics,
 					   snmp4_net_list[i].entry));
 
 	seq_puts(seq, "\nIpExt:");
@@ -429,7 +429,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nIpExt:");
 	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.ip_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
 					   snmp4_ipextstats_list[i].entry));
 
 	seq_putc(seq, '\n');
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b16dfadbe6d6..04762d3bef71 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3334,7 +3334,7 @@ static __net_initdata struct pernet_operations rt_secret_timer_ops = {
 
 #ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct __read_mostly;
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_NET_CLS_ROUTE */
 
 static __initdata unsigned long rhash_entries;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5d69ea8f249..e471d037fcc9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2788,10 +2788,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
 
 #ifdef CONFIG_TCP_MD5SIG
 static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
 
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
 {
 	int cpu;
 	for_each_possible_cpu(cpu) {
@@ -2808,7 +2808,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 
 void tcp_free_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool **pool = NULL;
+	struct tcp_md5sig_pool * __percpu *pool = NULL;
 
 	spin_lock_bh(&tcp_md5sig_pool_lock);
 	if (--tcp_md5sig_users == 0) {
@@ -2822,10 +2822,11 @@ void tcp_free_md5sig_pool(void)
 
 EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
-static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
 {
 	int cpu;
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 
 	pool = alloc_percpu(struct tcp_md5sig_pool *);
 	if (!pool)
@@ -2852,9 +2853,9 @@ out_free:
 	return NULL;
 }
 
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
 {
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 	int alloc = 0;
 
 retry:
@@ -2873,7 +2874,9 @@ retry:
 
 	if (alloc) {
 		/* we cannot hold spinlock here because this may sleep. */
-		struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
+		struct tcp_md5sig_pool * __percpu *p;
+
+		p = __tcp_alloc_md5sig_pool(sk);
 		spin_lock_bh(&tcp_md5sig_pool_lock);
 		if (!p) {
 			tcp_md5sig_users--;
@@ -2897,7 +2900,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
 {
-	struct tcp_md5sig_pool **p;
+	struct tcp_md5sig_pool * __percpu *p;
 	spin_lock_bh(&tcp_md5sig_pool_lock);
 	p = tcp_md5sig_pool;
 	if (p)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c79cbff54370..b0d4a4b23db5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -278,31 +278,31 @@ static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
 
 static int snmp6_alloc_dev(struct inet6_dev *idev)
 {
-	if (snmp_mib_init((void **)idev->stats.ipv6,
+	if (snmp_mib_init((void __percpu **)idev->stats.ipv6,
 			  sizeof(struct ipstats_mib)) < 0)
 		goto err_ip;
-	if (snmp_mib_init((void **)idev->stats.icmpv6,
+	if (snmp_mib_init((void __percpu **)idev->stats.icmpv6,
 			  sizeof(struct icmpv6_mib)) < 0)
 		goto err_icmp;
-	if (snmp_mib_init((void **)idev->stats.icmpv6msg,
+	if (snmp_mib_init((void __percpu **)idev->stats.icmpv6msg,
 			  sizeof(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg;
 
 	return 0;
 
 err_icmpmsg:
-	snmp_mib_free((void **)idev->stats.icmpv6);
+	snmp_mib_free((void __percpu **)idev->stats.icmpv6);
 err_icmp:
-	snmp_mib_free((void **)idev->stats.ipv6);
+	snmp_mib_free((void __percpu **)idev->stats.ipv6);
 err_ip:
 	return -ENOMEM;
 }
 
 static void snmp6_free_dev(struct inet6_dev *idev)
 {
-	snmp_mib_free((void **)idev->stats.icmpv6msg);
-	snmp_mib_free((void **)idev->stats.icmpv6);
-	snmp_mib_free((void **)idev->stats.ipv6);
+	snmp_mib_free((void __percpu **)idev->stats.icmpv6msg);
+	snmp_mib_free((void __percpu **)idev->stats.icmpv6);
+	snmp_mib_free((void __percpu **)idev->stats.ipv6);
 }
 
 /* Nobody refers to this device, we may destroy it. */
@@ -3766,8 +3766,8 @@ static inline size_t inet6_if_nlmsg_size(void)
 	       );
 }
 
-static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items,
-				      int bytes)
+static inline void __snmp6_fill_stats(u64 *stats, void __percpu **mib,
+				      int items, int bytes)
 {
 	int i;
 	int pad = bytes - sizeof(u64) * items;
@@ -3786,10 +3786,10 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
 {
 	switch(attrtype) {
 	case IFLA_INET6_STATS:
-		__snmp6_fill_stats(stats, (void **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes);
+		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes);
 		break;
 	case IFLA_INET6_ICMP6STATS:
-		__snmp6_fill_stats(stats, (void **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes);
+		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes);
 		break;
 	}
 }
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e29160ff4a0f..37d14e735c27 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -971,41 +971,41 @@ static void ipv6_packet_cleanup(void)
 
 static int __net_init ipv6_init_mibs(struct net *net)
 {
-	if (snmp_mib_init((void **)net->mib.udp_stats_in6,
+	if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6,
 			  sizeof (struct udp_mib)) < 0)
 		return -ENOMEM;
-	if (snmp_mib_init((void **)net->mib.udplite_stats_in6,
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6,
 			  sizeof (struct udp_mib)) < 0)
 		goto err_udplite_mib;
-	if (snmp_mib_init((void **)net->mib.ipv6_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics,
 			  sizeof(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp_mib_init((void **)net->mib.icmpv6_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics,
 			  sizeof(struct icmpv6_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void **)net->mib.icmpv6msg_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmpv6msg_statistics,
 			  sizeof(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg_mib;
 	return 0;
 
 err_icmpmsg_mib:
-	snmp_mib_free((void **)net->mib.icmpv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics);
 err_icmp_mib:
-	snmp_mib_free((void **)net->mib.ipv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ipv6_statistics);
 err_ip_mib:
-	snmp_mib_free((void **)net->mib.udplite_stats_in6);
+	snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6);
 err_udplite_mib:
-	snmp_mib_free((void **)net->mib.udp_stats_in6);
+	snmp_mib_free((void __percpu **)net->mib.udp_stats_in6);
 	return -ENOMEM;
 }
 
 static void ipv6_cleanup_mibs(struct net *net)
 {
-	snmp_mib_free((void **)net->mib.udp_stats_in6);
-	snmp_mib_free((void **)net->mib.udplite_stats_in6);
-	snmp_mib_free((void **)net->mib.ipv6_statistics);
-	snmp_mib_free((void **)net->mib.icmpv6_statistics);
-	snmp_mib_free((void **)net->mib.icmpv6msg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_stats_in6);
+	snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6);
+	snmp_mib_free((void __percpu **)net->mib.ipv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpv6msg_statistics);
 }
 
 static int __net_init inet6_net_init(struct net *net)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index bfe2598dd563..58344c0fbd13 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -136,7 +136,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
 	SNMP_MIB_SENTINEL
 };
 
-static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void **mib)
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void __percpu **mib)
 {
 	char name[32];
 	int i;
@@ -170,7 +170,7 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void **mib)
 	return;
 }
 
-static void snmp6_seq_show_item(struct seq_file *seq, void **mib,
+static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **mib,
 				const struct snmp_mib *itemlist)
 {
 	int i;
@@ -183,14 +183,15 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = (struct net *)seq->private;
 
-	snmp6_seq_show_item(seq, (void **)net->mib.ipv6_statistics,
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.ipv6_statistics,
 			    snmp6_ipstats_list);
-	snmp6_seq_show_item(seq, (void **)net->mib.icmpv6_statistics,
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics,
 			    snmp6_icmp6_list);
-	snmp6_seq_show_icmpv6msg(seq, (void **)net->mib.icmpv6msg_statistics);
-	snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6,
+	snmp6_seq_show_icmpv6msg(seq,
+				 (void __percpu **)net->mib.icmpv6msg_statistics);
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6,
 			    snmp6_udp6_list);
-	snmp6_seq_show_item(seq, (void **)net->mib.udplite_stats_in6,
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6,
 			    snmp6_udplite6_list);
 	return 0;
 }
@@ -213,9 +214,11 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
 	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
 
 	seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
-	snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list);
-	snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
-	snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg);
+	snmp6_seq_show_item(seq, (void __percpu **)idev->stats.ipv6,
+			    snmp6_ipstats_list);
+	snmp6_seq_show_item(seq, (void __percpu **)idev->stats.icmpv6,
+			    snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, (void __percpu **)idev->stats.icmpv6msg);
 	return 0;
 }
 
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index a5ac6e0a8d9c..784bcc9a979d 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -83,7 +83,7 @@ static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
 
 	for (i = 0; sctp_snmp_list[i].name != NULL; i++)
 		seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
-			   snmp_fold_field((void **)sctp_statistics,
+			   snmp_fold_field((void __percpu **)sctp_statistics,
 					   sctp_snmp_list[i].entry));
 
 	return 0;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a3c8988758b1..9687177b026b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -996,12 +996,13 @@ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family)
 
 static inline int init_sctp_mibs(void)
 {
-	return snmp_mib_init((void**)sctp_statistics, sizeof(struct sctp_mib));
+	return snmp_mib_init((void __percpu **)sctp_statistics,
+			     sizeof(struct sctp_mib));
 }
 
 static inline void cleanup_sctp_mibs(void)
 {
-	snmp_mib_free((void**)sctp_statistics);
+	snmp_mib_free((void __percpu **)sctp_statistics);
 }
 
 static void sctp_v4_pf_init(void)
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 42cd18391f46..0fc5ff66d1fa 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -30,12 +30,12 @@
 
 struct ipcomp_tfms {
 	struct list_head list;
-	struct crypto_comp **tfms;
+	struct crypto_comp * __percpu *tfms;
 	int users;
 };
 
 static DEFINE_MUTEX(ipcomp_resource_mutex);
-static void **ipcomp_scratches;
+static void * __percpu *ipcomp_scratches;
 static int ipcomp_scratch_users;
 static LIST_HEAD(ipcomp_tfms_list);
 
@@ -200,7 +200,7 @@ EXPORT_SYMBOL_GPL(ipcomp_output);
 static void ipcomp_free_scratches(void)
 {
 	int i;
-	void **scratches;
+	void * __percpu *scratches;
 
 	if (--ipcomp_scratch_users)
 		return;
@@ -215,10 +215,10 @@ static void ipcomp_free_scratches(void)
 	free_percpu(scratches);
 }
 
-static void **ipcomp_alloc_scratches(void)
+static void * __percpu *ipcomp_alloc_scratches(void)
 {
 	int i;
-	void **scratches;
+	void * __percpu *scratches;
 
 	if (ipcomp_scratch_users++)
 		return ipcomp_scratches;
@@ -239,7 +239,7 @@ static void **ipcomp_alloc_scratches(void)
 	return scratches;
 }
 
-static void ipcomp_free_tfms(struct crypto_comp **tfms)
+static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms)
 {
 	struct ipcomp_tfms *pos;
 	int cpu;
@@ -267,10 +267,10 @@ static void ipcomp_free_tfms(struct crypto_comp **tfms)
 	free_percpu(tfms);
 }
 
-static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name)
+static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name)
 {
 	struct ipcomp_tfms *pos;
-	struct crypto_comp **tfms;
+	struct crypto_comp * __percpu *tfms;
 	int cpu;
 
 	/* This can be any valid CPU ID so we don't need locking. */
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index eb870fcc29cc..cfceb6616ec1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2428,19 +2428,19 @@ static int __net_init xfrm_statistics_init(struct net *net)
 {
 	int rv;
 
-	if (snmp_mib_init((void **)net->mib.xfrm_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics,
 			  sizeof(struct linux_xfrm_mib)) < 0)
 		return -ENOMEM;
 	rv = xfrm_proc_init(net);
 	if (rv < 0)
-		snmp_mib_free((void **)net->mib.xfrm_statistics);
+		snmp_mib_free((void __percpu **)net->mib.xfrm_statistics);
 	return rv;
 }
 
 static void xfrm_statistics_fini(struct net *net)
 {
 	xfrm_proc_fini(net);
-	snmp_mib_free((void **)net->mib.xfrm_statistics);
+	snmp_mib_free((void __percpu **)net->mib.xfrm_statistics);
 }
 #else
 static int __net_init xfrm_statistics_init(struct net *net)
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index c083a4e4e796..003f2c437ac3 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -50,7 +50,8 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
 	int i;
 	for (i=0; xfrm_mib_list[i].name; i++)
 		seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
-			   snmp_fold_field((void **)net->mib.xfrm_statistics,
+			   snmp_fold_field((void __percpu **)
+					   net->mib.xfrm_statistics,
 					   xfrm_mib_list[i].entry));
 	return 0;
 }
--
cgit v1.2.3

From 5aa4b32fc86408705337e941ed716880c63d1590 Mon Sep 17 00:00:00 2001
From: Andreas Petlund
Date: Thu, 18 Feb 2010 02:45:45 +0000
Subject: net: TCP thin-stream detection

Inline function to dynamically detect thin streams based on the
number of packets in flight. Used to dynamically trigger thin-stream
mechanisms if enabled by socket option or sysctl.

Signed-off-by: Andreas Petlund
Signed-off-by: David S. Miller
---
 Documentation/networking/tcp-thin.txt | 47 +++++++++++++++++++++++++++++++++++
 include/net/tcp.h                     |  8 ++++++
 2 files changed, 55 insertions(+)
 create mode 100644 Documentation/networking/tcp-thin.txt
(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/tcp-thin.txt b/Documentation/networking/tcp-thin.txt
new file mode 100644
index 000000000000..151e229980f1
--- /dev/null
+++ b/Documentation/networking/tcp-thin.txt
@@ -0,0 +1,47 @@
+Thin-streams and TCP
+====================
+A wide range of Internet-based services that use reliable transport
+protocols display what we call thin-stream properties. This means
+that the application sends data with such a low rate that the
+retransmission mechanisms of the transport protocol are not fully
+effective. In time-dependent scenarios (like online games, control
+systems, stock trading etc.) where the user experience depends
+on the data delivery latency, packet loss can be devastating for
+the service quality. Extreme latencies are caused by TCP's
+dependency on the arrival of new data from the application to trigger
+retransmissions effectively through fast retransmit instead of
+waiting for long timeouts.
+
+After analysing a large number of time-dependent interactive
+applications, we have seen that they often produce thin streams
+and also stay with this traffic pattern throughout its entire
+lifespan. The combination of time-dependency and the fact that the
+streams provoke high latencies when using TCP is unfortunate.
+
+In order to reduce application-layer latency when packets are lost,
+a set of mechanisms has been made, which address these latency issues
+for thin streams. In short, if the kernel detects a thin stream,
+the retransmission mechanisms are modified in the following manner:
+
+1) If the stream is thin, fast retransmit on the first dupACK.
+2) If the stream is thin, do not apply exponential backoff.
+
+These enhancements are applied only if the stream is detected as
+thin. This is accomplished by defining a threshold for the number
+of packets in flight. If there are less than 4 packets in flight,
+fast retransmissions can not be triggered, and the stream is prone
+to experience high retransmission latencies.
+
+Since these mechanisms are targeted at time-dependent applications,
+they must be specifically activated by the application using the
+TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK socket options or the
+tcp_thin_linear_timeouts and tcp_thin_dupack sysctls. Both
+modifications are turned off by default.
+
+References
+==========
+More information on the modifications, as well as a wide range of
+experimental data can be found here:
+"Improving latency for interactive, thin-stream applications over
+reliable transport"
+http://simula.no/research/nd/publications/Simula.nd.477/simula_pdf_file
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 75a00c80bdda..0bdc3f640247 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1386,6 +1386,14 @@ static inline void tcp_highest_sack_combine(struct sock *sk,
 		tcp_sk(sk)->highest_sack = new;
 }
 
+/* Determines whether this is a thin stream (which may suffer from
+ * increased latency). Used to trigger latency-reducing mechanisms.
+ */
+static inline unsigned int tcp_stream_is_thin(struct tcp_sock *tp)
+{
+	return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
+}
+
 /* /proc */
 enum tcp_seq_states {
 	TCP_SEQ_STATE_LISTENING,
--
cgit v1.2.3

From 36e31b0af58728071e8023cf8e20c5166b700717 Mon Sep 17 00:00:00 2001
From: Andreas Petlund
Date: Thu, 18 Feb 2010 02:47:01 +0000
Subject: net: TCP thin linear timeouts

This patch will make TCP use only linear timeouts if the stream is
thin. This will help to avoid the very high latencies that thin
streams suffer because of exponential backoff. This mechanism is only
active if enabled by socket option or sysctl and the stream is
identified as thin. A maximum of 6 linear timeouts is tried before
exponential backoff is resumed.

Signed-off-by: Andreas Petlund
Signed-off-by: David S. Miller
---
 Documentation/networking/ip-sysctl.txt | 12 ++++++++++++
 include/linux/tcp.h                    |  5 ++++-
 include/net/tcp.h                      |  4 ++++
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp.c                         |  7 +++++++
 net/ipv4/tcp_timer.c                   | 21 ++++++++++++++++++++-
 6 files changed, 54 insertions(+), 2 deletions(-)
(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d97686..f147310d9af4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -487,6 +487,18 @@ tcp_dma_copybreak - INTEGER
 	and CONFIG_NET_DMA is enabled.
 	Default: 4096
 
+tcp_thin_linear_timeouts - BOOLEAN
+	Enable dynamic triggering of linear timeouts for thin streams.
+	If set, a check is performed upon retransmission by timeout to
+	determine if the stream is thin (less than 4 packets in flight).
+	As long as the stream is found to be thin, up to 6 linear
+	timeouts may be performed before exponential backoff mode is
+	initiated. This improves retransmission latency for
+	non-aggressive thin streams, often found to be time-dependent.
+	For more information on thin streams, see
+	Documentation/networking/tcp-thin.txt
+	Default: 0
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7fee8a4df931..3ba8b074612f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -103,6 +103,7 @@ enum {
 #define TCP_CONGESTION		13	/* Congestion control algorithm */
 #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
+#define TCP_THIN_LINEAR_TIMEOUTS 16	/* Use linear timeouts for thin streams*/
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
@@ -340,7 +341,9 @@ struct tcp_sock {
 	u32	frto_highmark;	/* snd_nxt when RTO occurred */
 	u16	advmss;		/* Advertised MSS */
 	u8	frto_counter;	/* Number of new acks after RTO */
-	u8	nonagle;	/* Disable Nagle algorithm?             */
+	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
+		thin_lto    : 1,/* Use linear timeouts for thin streams */
+		unused      : 3;
 
 	/* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0bdc3f640247..6278fc734abd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCP_NAGLE_CORK		2	/* Socket is corked	    */
 #define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */
 
+/* TCP thin-stream limits */
+#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
@@ -241,6 +244,7 @@ extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
+extern int sysctl_tcp_thin_linear_timeouts;
 
 extern atomic_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712ce3994..e6a2460587d4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -575,6 +575,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_thin_linear_timeouts",
+		.data		= &sysctl_tcp_thin_linear_timeouts,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e471d037fcc9..21bae9afefea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2229,6 +2229,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lto = val;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index de7d1bf9114f..a17629b8912e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -29,6 +29,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
 int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
 int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
 static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
@@ -415,7 +416,25 @@ void tcp_retransmit_timer(struct sock *sk)
 	icsk->icsk_retransmits++;
 
 out_reset_timer:
-	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+	 * might be increased if the stream oscillates between thin and thick,
+	 * thus the old value might already be too high compared to the value
+	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+	 * exponential backoff behaviour to avoid continue hammering
+	 * linear-timeout retransmissions into a black hole
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED &&
+	    (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+	    tcp_stream_is_thin(tp) &&
+	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+		icsk->icsk_backoff = 0;
+		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+	} else {
+		/* Use normal (exponential) backoff */
+		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
 	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
 		__sk_dst_reset(sk);
--
cgit v1.2.3

From 7e38017557bc0b87434d184f8804cadb102bb903 Mon Sep 17 00:00:00 2001
From: Andreas Petlund
Date: Thu, 18 Feb 2010 04:48:19 +0000
Subject: net: TCP thin dupack

This patch enables fast retransmissions after one dupACK for TCP if
the stream is identified as thin. This will reduce latencies for thin
streams that are not able to trigger fast retransmissions due to high
packet interarrival time.
This mechanism is only active if enabled by socket option or sysctl
and the stream is identified as thin.

Signed-off-by: Andreas Petlund
Signed-off-by: David S. Miller
---
 Documentation/networking/ip-sysctl.txt | 12 ++++++++++++
 include/linux/tcp.h                    |  4 +++-
 include/net/tcp.h                      |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp.c                         |  7 +++++++
 net/ipv4/tcp_input.c                   | 12 ++++++++++++
 6 files changed, 42 insertions(+), 1 deletion(-)
(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f147310d9af4..2571a62d923e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -499,6 +499,18 @@ tcp_thin_linear_timeouts - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_thin_dupack - BOOLEAN
+	Enable dynamic triggering of retransmissions after one dupACK
+	for thin streams. If set, a check is performed upon reception
+	of a dupACK to determine if the stream is thin (less than 4
+	packets in flight). As long as the stream is found to be thin,
+	data is retransmitted on the first received dupACK. This
+	improves retransmission latency for non-aggressive thin
+	streams, often found to be time-dependent.
+	For more information on thin streams, see
+	Documentation/networking/tcp-thin.txt
+	Default: 0
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3ba8b074612f..a778ee024590 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -104,6 +104,7 @@ enum {
 #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
 #define TCP_THIN_LINEAR_TIMEOUTS 16	/* Use linear timeouts for thin streams*/
+#define TCP_THIN_DUPACK		17	/* Fast retrans. after 1 dupack */
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
@@ -343,7 +344,8 @@ struct tcp_sock {
 	u8	frto_counter;	/* Number of new acks after RTO */
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		unused      : 3;
+		thin_dupack : 1,/* Fast retransmit on first dupack      */
+		unused      : 2;
 
 	/* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6278fc734abd..56f0aec40ed6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -245,6 +245,7 @@ extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_max_ssthresh;
 extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
+extern int sysctl_tcp_thin_dupack;
 
 extern atomic_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e6a2460587d4..c1bc074f61b7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -582,6 +582,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_thin_dupack",
+		.data		= &sysctl_tcp_thin_dupack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 21bae9afefea..5901010fad55 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2236,6 +2236,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			tp->thin_lto = val;
 		break;
 
+	case TCP_THIN_DUPACK:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_dupack = val;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3fddc69ccccc..788851ca8c5d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -89,6 +89,8 @@ int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_nometrics_save __read_mostly;
 
+int sysctl_tcp_thin_dupack __read_mostly;
+
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
@@ -2447,6 +2449,16 @@ static int tcp_time_to_recover(struct sock *sk)
 		return 1;
 	}
 
+	/* If a thin stream is detected, retransmit after first
+	 * received dupack. Employ only if SACK is supported in order
+	 * to avoid possible corner-case series of spurious retransmissions.
+	 * Use only if there are no unsent data.
+	 */
+	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+	    tcp_is_sack(tp) && !tcp_send_head(sk))
+		return 1;
+
 	return 0;
 }
--
cgit v1.2.3

From c839d30a41dd92eb32d7fcfa2b4e99042fc64bf2 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Wed, 3 Mar 2010 04:46:50 +0000
Subject: net: add scheduler sync hint to tcp_prequeue().

Decreases the odds that the wakee will suffer from frequent cache
misses.

Signed-off-by: Mike Galbraith
Signed-off-by: David S. Miller
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 56f0aec40ed6..75be5a28815d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -939,7 +939,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
-			wake_up_interruptible_poll(sk->sk_sleep,
+			wake_up_interruptible_sync_poll(sk->sk_sleep,
 						   POLLIN | POLLRDNORM | POLLRDBAND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
--
cgit v1.2.3
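
Usage example (not part of the patch series above): a minimal userspace
sketch of how an application running on a kernel that carries the three
thin-stream patches would opt in to the new mechanisms per socket. The
option values 16 and 17 come straight from the include/linux/tcp.h hunks
above; the fallback #defines are only needed where the libc headers
predate these patches.

	/*
	 * Minimal sketch: enable the TCP thin-stream options on a socket.
	 * Assumes a kernel with the thin-stream patches applied; on older
	 * kernels setsockopt() simply fails with ENOPROTOOPT.
	 */
	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	#ifndef TCP_THIN_LINEAR_TIMEOUTS
	#define TCP_THIN_LINEAR_TIMEOUTS 16	/* value from the patched include/linux/tcp.h */
	#endif
	#ifndef TCP_THIN_DUPACK
	#define TCP_THIN_DUPACK 17		/* value from the patched include/linux/tcp.h */
	#endif

	int make_thin_stream_socket(void)
	{
		int one = 1;
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;

		/* Linear RTOs instead of exponential backoff while thin. */
		if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
			       &one, sizeof(one)) < 0)
			perror("TCP_THIN_LINEAR_TIMEOUTS");

		/* Fast retransmit on the first dupACK while thin. */
		if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK,
			       &one, sizeof(one)) < 0)
			perror("TCP_THIN_DUPACK");

		return fd;
	}

The tcp_thin_linear_timeouts and tcp_thin_dupack sysctls enable the same
behaviour globally instead of per socket. The RTAX_INITRWND metric from
the init_rcvwnd patch, by contrast, is configured per route rather than
per socket (later iproute2 releases expose it as the 'initrwnd' route
option).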