From 5f852eb536ad651b8734559dcf4353514cb0bea3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Feb 2015 14:10:18 -0800
Subject: tcp: tso: remove tp->tso_deferred

TSO relies on ability to defer sending a small amount of packets.
Heuristic is to wait for future ACKS in hope to send more packets at once.
Current algorithm uses a per socket tso_deferred field as a pseudo timer.

This pseudo timer relies on future ACK, but there is no guarantee
we receive them in time.

Fix would be to use a real timer, but cost of such timer is probably too
expensive for typical cases.

This patch changes the logic to test the time of last transmit,
because we should not add bursts of more than 1ms for any given flow.

We've used this patch for about two years at Google, before FQ/pacing
as it would reduce a fair amount of bursts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1a7adb411647..97dbf16f7d9d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -236,7 +236,6 @@ struct tcp_sock {
 	u32	lost_out;	/* Lost packets			*/
 	u32	sacked_out;	/* SACK'd packets			*/
 	u32	fackets_out;	/* FACK'd packets			*/
-	u32	tso_deferred;
 
 	/* from STCP, retrans queue hinting */
 	struct sk_buff* lost_skb_hint;
-- 
cgit v1.2.3


From 9439ce00f208d95703a6725e4ea986dd90e37ffd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Mar 2015 18:32:29 -0700
Subject: tcp: rename struct tcp_request_sock listener

The listener field in struct tcp_request_sock is a pointer
back to the listener. We now have req->rsk_listener, so TCP
only needs one boolean and not a full pointer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h             |  2 +-
 net/core/request_sock.c         | 18 +++++++-----------
 net/ipv4/inet_connection_sock.c |  7 +++----
 net/ipv4/syncookies.c           |  2 +-
 net/ipv4/tcp_fastopen.c         |  7 +------
 net/ipv4/tcp_input.c            |  2 +-
 net/ipv6/syncookies.c           |  2 +-
 7 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97dbf16f7d9d..f869ae8afbaf 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -111,7 +111,7 @@ struct tcp_request_sock_ops;
 struct tcp_request_sock {
 	struct inet_request_sock 	req;
 	const struct tcp_request_sock_ops *af_specific;
-	struct sock			*listener; /* needed for TFO */
+	bool				tfo_listener;
 	u32				rcv_isn;
 	u32				snt_isn;
 	u32				snt_synack; /* synack sent time */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index e910317ef6d9..cc39a2aa663a 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -153,24 +153,22 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
  * case might also exist in tcp_v4_hnd_req() that will trigger this locking
  * order.
  *
- * When a TFO req is created, it needs to sock_hold its listener to prevent
- * the latter data structure from going away.
- *
- * This function also sets "treq->listener" to NULL and unreference listener
- * socket. treq->listener is used by the listener so it is protected by the
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
  * fastopenq->lock in this function.
  */
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 			   bool reset)
 {
-	struct sock *lsk = tcp_rsk(req)->listener;
-	struct fastopen_queue *fastopenq =
-	    inet_csk(lsk)->icsk_accept_queue.fastopenq;
+	struct sock *lsk = req->rsk_listener;
+	struct fastopen_queue *fastopenq;
+
+	fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
 
 	tcp_sk(sk)->fastopen_rsk = NULL;
 	spin_lock_bh(&fastopenq->lock);
 	fastopenq->qlen--;
-	tcp_rsk(req)->listener = NULL;
+	tcp_rsk(req)->tfo_listener = false;
 	if (req->sk)	/* the child socket hasn't been accepted yet */
 		goto out;
 
@@ -179,7 +177,6 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 		 * special RST handling below.
 		 */
 		spin_unlock_bh(&fastopenq->lock);
-		sock_put(lsk);
 		reqsk_put(req);
 		return;
 	}
@@ -201,5 +198,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 	fastopenq->qlen++;
 out:
 	spin_unlock_bh(&fastopenq->lock);
-	sock_put(lsk);
 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3390ba6f96b2..741f0d96a7f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -325,7 +325,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 	sk_acceptq_removed(sk);
 	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
 		spin_lock_bh(&queue->fastopenq->lock);
-		if (tcp_rsk(req)->listener) {
+		if (tcp_rsk(req)->tfo_listener) {
 			/* We are still waiting for the final ACK from 3WHS
 			 * so can't free req now. Instead, we set req->sk to
 			 * NULL to signify that the child socket is taken
@@ -817,9 +817,9 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
-		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
-			BUG_ON(sk != tcp_rsk(req)->listener);
+			BUG_ON(sk != req->rsk_listener);
 
 			/* Paranoid, to prevent race condition if
 			 * an inbound pkt destined for child is
@@ -828,7 +828,6 @@ void inet_csk_listen_stop(struct sock *sk)
 			 * tcp_v4_destroy_sock().
 			 */
 			tcp_sk(child)->fastopen_rsk = NULL;
-			sock_put(sk);
 		}
 		inet_csk_destroy_sock(child);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eb940750bb1b..574b67765a06 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,7 +345,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
-	treq->listener		= NULL;
+	treq->tfo_listener	= false;
 	ireq->ireq_family = AF_INET;
 
 	ireq->ir_iif = sk->sk_bound_dev_if;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 84381319e1bc..186fd394ec0a 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 	tp = tcp_sk(child);
 
 	tp->fastopen_rsk = req;
-	/* Do a hold on the listner sk so that if the listener is being
-	 * closed, the child that has been accepted can live on and still
-	 * access listen_lock.
-	 */
-	sock_hold(sk);
-	tcp_rsk(req)->listener = sk;
+	tcp_rsk(req)->tfo_listener = true;
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never
 	 * scaled. So correct it appropriately.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fbe518981d36..a94ddb96fc85 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6120,7 +6120,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		if (err || want_cookie)
 			goto drop_and_free;
 
-		tcp_rsk(req)->listener = NULL;
+		tcp_rsk(req)->tfo_listener = false;
 		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 039e74dd29fe..1ef0c926ce9d 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -195,7 +195,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	ireq = inet_rsk(req);
 	treq = tcp_rsk(req);
-	treq->listener = NULL;
+	treq->tfo_listener = false;
 	ireq->ireq_family = AF_INET6;
 
 	if (security_inet_conn_request(sk, skb, req))
-- 
cgit v1.2.3


From 7f9b838b71eb78a27de27a12ca5de8542fac3115 Mon Sep 17 00:00:00 2001
From: Daniel Lee <Longinus00@gmail.com>
Date: Mon, 6 Apr 2015 14:37:26 -0700
Subject: tcp: RFC7413 option support for Fast Open server

Fast Open has been using the experimental option with a magic number
(RFC6994) to request and grant Fast Open cookies. This patch enables
the server to support the official IANA option 34 in RFC7413 in
addition.

The change has passed all existing Fast Open tests with both
old and new options at Google.

Signed-off-by: Daniel Lee <Longinus00@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  1 +
 include/net/tcp.h       |  2 ++
 net/ipv4/tcp_fastopen.c |  1 +
 net/ipv4/tcp_input.c    | 42 ++++++++++++++++++++++++++++++------------
 net/ipv4/tcp_output.c   | 34 +++++++++++++++++++++++-----------
 5 files changed, 57 insertions(+), 23 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f869ae8afbaf..a48d00318683 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -58,6 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 struct tcp_fastopen_cookie {
 	s8	len;
 	u8	val[TCP_FASTOPEN_COOKIE_MAX];
+	bool	exp;	/* In RFC6994 experimental option format */
 };
 
 /* This defines a selective acknowledgement block. */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 963303fb96ae..7292c3c575bc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -179,6 +179,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_SACK             5       /* SACK Block */
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
+#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
 #define TCPOPT_EXP		254	/* Experimental */
 /* Magic number to be after the option value for sharing TCP
  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -194,6 +195,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERM      2
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
+#define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
 
 /* But this is what stacks really send out. */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 5da55e2b5cd2..e3d87aca6be8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -303,6 +303,7 @@ fastopen:
 	} else if (foc->len > 0) /* Client presents an invalid cookie */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 
+	valid_foc.exp = foc->exp;
 	*foc = valid_foc;
 	return false;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c1ce304ba8d2..24f1630b2afb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3603,6 +3603,23 @@ old_ack:
 	return 0;
 }
 
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+				      bool syn, struct tcp_fastopen_cookie *foc,
+				      bool exp_opt)
+{
+	/* Valid only in SYN or SYN-ACK with an even length.  */
+	if (!foc || !syn || len < 0 || (len & 1))
+		return;
+
+	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+	    len <= TCP_FASTOPEN_COOKIE_MAX)
+		memcpy(foc->val, cookie, len);
+	else if (len != 0)
+		len = -1;
+	foc->len = len;
+	foc->exp = exp_opt;
+}
+
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
@@ -3692,21 +3709,22 @@ void tcp_parse_options(const struct sk_buff *skb,
 				 */
 				break;
 #endif
+			case TCPOPT_FASTOPEN:
+				tcp_parse_fastopen_option(
+					opsize - TCPOLEN_FASTOPEN_BASE,
+					ptr, th->syn, foc, false);
+				break;
+
 			case TCPOPT_EXP:
 				/* Fast Open option shares code 254 using a
-				 * 16 bits magic number. It's valid only in
-				 * SYN or SYN-ACK with an even size.
+				 * 16 bits magic number.
 				 */
-				if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
-				    get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
-				    !foc || !th->syn || (opsize & 1))
-					break;
-				foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
-				if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
-				    foc->len <= TCP_FASTOPEN_COOKIE_MAX)
-					memcpy(foc->val, ptr + 2, foc->len);
-				else if (foc->len != 0)
-					foc->len = -1;
+				if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+				    get_unaligned_be16(ptr) ==
+				    TCPOPT_FASTOPEN_MAGIC)
+					tcp_parse_fastopen_option(opsize -
+						TCPOLEN_EXP_FASTOPEN_BASE,
+						ptr + 2, th->syn, foc, true);
 				break;
 
 			}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7404e5238e00..464bd8c5de69 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
 		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+		u8 *p = (u8 *)ptr;
+		u32 len; /* Fast Open option length */
+
+		if (foc->exp) {
+			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+				     TCPOPT_FASTOPEN_MAGIC);
+			p += TCPOLEN_EXP_FASTOPEN_BASE;
+		} else {
+			len = TCPOLEN_FASTOPEN_BASE + foc->len;
+			*p++ = TCPOPT_FASTOPEN;
+			*p++ = len;
+		}
 
-		*ptr++ = htonl((TCPOPT_EXP << 24) |
-			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
-			       TCPOPT_FASTOPEN_MAGIC);
-
-		memcpy(ptr, foc->val, foc->len);
-		if ((foc->len & 3) == 2) {
-			u8 *align = ((u8 *)ptr) + foc->len;
-			align[0] = align[1] = TCPOPT_NOP;
+		memcpy(p, foc->val, foc->len);
+		if ((len & 3) == 2) {
+			p[foc->len] = TCPOPT_NOP;
+			p[foc->len + 1] = TCPOPT_NOP;
 		}
-		ptr += (foc->len + 3) >> 2;
+		ptr += (len + 3) >> 2;
 	}
 }
 
@@ -641,8 +650,11 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-	if (foc && foc->len >= 0) {
-		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+	if (foc != NULL && foc->len >= 0) {
+		u32 need = foc->len;
+
+		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+				   TCPOLEN_FASTOPEN_BASE;
 		need = (need + 3) & ~3U;  /* Align to 32 bits */
 		if (remaining >= need) {
 			opts->options |= OPTION_FAST_OPEN_COOKIE;
-- 
cgit v1.2.3


From 2646c831c00c5d22aa72b79d24069c1b412cda7c Mon Sep 17 00:00:00 2001
From: Daniel Lee <Longinus00@gmail.com>
Date: Mon, 6 Apr 2015 14:37:27 -0700
Subject: tcp: RFC7413 option support for Fast Open client

Fast Open has been using an experimental option with a magic number
(RFC6994). This patch makes the client by default use the RFC7413
option (34) to get and send Fast Open cookies.  This patch makes
the client solicit cookies from a given server first with the
RFC7413 option. If that fails to elicit a cookie, then it tries
the RFC6994 experimental option. If that also fails, it uses the
RFC7413 option on all subsequent connect attempts.  If the server
returns a Fast Open cookie then the client caches the form of the
option that successfully elicited a cookie, and uses that form on
later connects when it presents that cookie.

The idea is to gradually obsolete the use of experimental options as
the servers and clients upgrade, while keeping the interoperability
meanwhile.

Signed-off-by: Daniel Lee <Longinus00@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h    |  1 +
 include/net/tcp.h      |  3 ++-
 net/ipv4/tcp_input.c   | 29 +++++++++++++++++++----------
 net/ipv4/tcp_metrics.c | 13 +++++++++++--
 net/ipv4/tcp_output.c  |  6 +++++-
 net/ipv4/tcp_timer.c   |  2 +-
 6 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a48d00318683..0caa3a2d4106 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -189,6 +189,7 @@ struct tcp_sock {
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
+		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
 		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7292c3c575bc..9598871485ce 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1339,7 +1339,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			    struct tcp_fastopen_cookie *cookie, int *syn_loss,
 			    unsigned long *last_syn_loss);
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-			    struct tcp_fastopen_cookie *cookie, bool syn_lost);
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
+			    u16 try_exp);
 struct tcp_fastopen_request {
 	/* Fast Open cookie. Size 0 means a cookie request */
 	struct tcp_fastopen_cookie	cookie;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 24f1630b2afb..031cf72cd05c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5378,8 +5378,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
-	u16 mss = tp->rx_opt.mss_clamp;
-	bool syn_drop;
+	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+	bool syn_drop = false;
 
 	if (mss == tp->rx_opt.user_mss) {
 		struct tcp_options_received opt;
@@ -5391,16 +5391,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
-	if (!tp->syn_fastopen)  /* Ignore an unsolicited cookie */
+	if (!tp->syn_fastopen) {
+		/* Ignore an unsolicited cookie */
 		cookie->len = -1;
+	} else if (tp->total_retrans) {
+		/* SYN timed out and the SYN-ACK neither has a cookie nor
+		 * acknowledges data. Presumably the remote received only
+		 * the retransmitted (regular) SYNs: either the original
+		 * SYN-data or the corresponding SYN-ACK was dropped.
+		 */
+		syn_drop = (cookie->len < 0 && data);
+	} else if (cookie->len < 0 && !tp->syn_data) {
+		/* We requested a cookie but didn't get it. If we did not use
+		 * the (old) exp opt format then try so next time (try_exp=1).
+		 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+		 */
+		try_exp = tp->syn_fastopen_exp ? 2 : 1;
+	}
 
-	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
-	 * the remote receives only the retransmitted (regular) SYNs: either
-	 * the original SYN-data or the corresponding SYN-ACK is lost.
-	 */
-	syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
-
-	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_for_write_queue_from(data, sk) {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 78ecc4a01712..a51d63a43e33 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -28,7 +28,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
 
 struct tcp_fastopen_metrics {
 	u16	mss;
-	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
+	u16	syn_loss:10,		/* Recurring Fast Open SYN losses */
+		try_exp:2;		/* Request w/ exp. option (once) */
 	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
 	struct	tcp_fastopen_cookie	cookie;
 };
@@ -131,6 +132,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
 	if (fastopen_clear) {
 		tm->tcpm_fastopen.mss = 0;
 		tm->tcpm_fastopen.syn_loss = 0;
+		tm->tcpm_fastopen.try_exp = 0;
+		tm->tcpm_fastopen.cookie.exp = false;
 		tm->tcpm_fastopen.cookie.len = 0;
 	}
 }
@@ -713,6 +716,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			if (tfom->mss)
 				*mss = tfom->mss;
 			*cookie = tfom->cookie;
+			if (cookie->len <= 0 && tfom->try_exp == 1)
+				cookie->exp = true;
 			*syn_loss = tfom->syn_loss;
 			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 		} while (read_seqretry(&fastopen_seqlock, seq));
@@ -721,7 +726,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 }
 
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
+			    u16 try_exp)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_metrics_block *tm;
@@ -738,6 +744,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 			tfom->mss = mss;
 		if (cookie && cookie->len > 0)
 			tfom->cookie = *cookie;
+		else if (try_exp > tfom->try_exp &&
+			 tfom->cookie.len <= 0 && !tfom->cookie.exp)
+			tfom->try_exp = try_exp;
 		if (syn_lost) {
 			++tfom->syn_loss;
 			tfom->last_syn_loss = jiffies;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 464bd8c5de69..e662d85d1635 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -592,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	}
 
 	if (fastopen && fastopen->cookie.len >= 0) {
-		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+		u32 need = fastopen->cookie.len;
+
+		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+					       TCPOLEN_FASTOPEN_BASE;
 		need = (need + 3) & ~3U;  /* Align to 32 bits */
 		if (remaining >= need) {
 			opts->options |= OPTION_FAST_OPEN_COOKIE;
 			opts->fastopen_cookie = &fastopen->cookie;
 			remaining -= need;
 			tp->syn_fastopen = 1;
+			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
 		}
 	}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2568fd282873..8c65dc147d8b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -167,7 +167,7 @@ static int tcp_write_timeout(struct sock *sk)
 		if (icsk->icsk_retransmits) {
 			dst_negative_advice(sk);
 			if (tp->syn_fastopen || tp->syn_data)
-				tcp_fastopen_cache_set(sk, 0, NULL, true);
+				tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
 			if (tp->syn_data)
 				NET_INC_STATS_BH(sock_net(sk),
 						 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
-- 
cgit v1.2.3


From 0df48c26d8418c5c9fba63fac15b660d70ca2f1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Apr 2015 15:28:17 -0700
Subject: tcp: add tcpi_bytes_acked to tcp_info

This patch tracks total number of bytes acked for a TCP socket.
This is the sum of all changes done to tp->snd_una, and allows
for precise tracking of delivered data.

RFC4898 named this : tcpEStatsAppHCThruOctetsAcked

This is a 64bit field, and can be fetched both from TCP_INFO
getsockopt() if one has a handle on a TCP socket, or from inet_diag
netlink facility (iproute2/ss patch will follow)

Note that tp->bytes_acked was placed near tp->snd_una for
best data locality and minimal performance impact.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Eric Salo <salo@google.com>
Cc: Martin Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  4 ++++
 include/net/tcp.h        |  2 +-
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/tcp.c           |  6 +++++-
 net/ipv4/tcp_input.c     | 13 +++++++++++--
 5 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0caa3a2d4106..0f73b43171da 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -150,6 +150,10 @@ struct tcp_sock {
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
  	u32	snd_nxt;	/* Next sequence we send		*/
 
+	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
+				 * sum(delta(snd_una)), or how many bytes
+				 * were acked.
+				 */
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 051dc5c2802d..dd7b4ea6a10c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -576,7 +576,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
 }
 
 /* tcp.c */
-void tcp_get_info(const struct sock *, struct tcp_info *);
+void tcp_get_info(struct sock *, struct tcp_info *);
 
 /* Read 'sendfile()'-style from a TCP socket */
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b9718328d8b..6666e98a0af9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -189,6 +189,7 @@ struct tcp_info {
 
 	__u64	tcpi_pacing_rate;
 	__u64	tcpi_max_pacing_rate;
+	__u64	tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c5cd9efebbc..4bf0e8ca7b5b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2592,7 +2592,7 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
 /* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(const struct sock *sk, struct tcp_info *info)
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2663,6 +2663,10 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	rate = READ_ONCE(sk->sk_max_pacing_rate);
 	info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	info->tcpi_bytes_acked = tp->bytes_acked;
+	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a4d9b34bed4..378d3f4d4dc3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3280,6 +3280,15 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
 		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+	u32 delta = ack - tp->snd_una;
+
+	tp->bytes_acked += delta;
+	tp->snd_una = ack;
+}
+
 /* Update our send window.
  *
  * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3315,7 +3324,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 		}
 	}
 
-	tp->snd_una = ack;
+	tcp_snd_una_update(tp, ack);
 
 	return flag;
 }
@@ -3497,7 +3506,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		 * Note, we use the fact that SND.UNA>=SND.WL2.
 		 */
 		tcp_update_wl(tp, ack_seq);
-		tp->snd_una = ack;
+		tcp_snd_una_update(tp, ack);
 		flag |= FLAG_WIN_UPDATE;
 
 		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-- 
cgit v1.2.3


From bdd1f9edacb5f5835d1e6276571bbbe5b88ded48 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Apr 2015 15:28:18 -0700
Subject: tcp: add tcpi_bytes_received to tcp_info

This patch tracks total number of payload bytes received on a TCP socket.
This is the sum of all changes done to tp->rcv_nxt

RFC4898 named this : tcpEStatsAppHCThruOctetsReceived

This is a 64bit field, and can be fetched both from TCP_INFO
getsockopt() if one has a handle on a TCP socket, or from inet_diag
netlink facility (iproute2/ss patch will follow)

Note that tp->bytes_received was placed near tp->rcv_nxt for
best data locality and minimal performance impact.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Eric Salo <salo@google.com>
Cc: Martin Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  4 ++++
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/tcp.c           |  1 +
 net/ipv4/tcp_fastopen.c  |  1 +
 net/ipv4/tcp_input.c     | 17 +++++++++++++----
 5 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0f73b43171da..3b2911502a8c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -145,6 +145,10 @@ struct tcp_sock {
  *	read the code and the spec side by side (and laugh ...)
  *	See RFC793 and RFC1122. The RFC writes these in capitals.
  */
+	u64	bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
+				 * sum(delta(rcv_nxt)), or how many bytes
+				 * were acked.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 6666e98a0af9..a48f93f3207b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -190,6 +190,7 @@ struct tcp_info {
 	__u64	tcpi_pacing_rate;
 	__u64	tcpi_max_pacing_rate;
 	__u64	tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4bf0e8ca7b5b..99fcc0b22c92 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2666,6 +2666,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	spin_lock_bh(&sk->sk_lock.slock);
 	info->tcpi_bytes_acked = tp->bytes_acked;
+	info->tcpi_bytes_received = tp->bytes_received;
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index e3d87aca6be8..3c673d5e6cff 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -206,6 +206,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 			skb_set_owner_r(skb2, child);
 			__skb_queue_tail(&child->sk_receive_queue, skb2);
 			tp->syn_data_acked = 1;
+			tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
 		} else {
 			end_seq = TCP_SKB_CB(skb)->seq + 1;
 		}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 378d3f4d4dc3..7e6962bcfc30 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3289,6 +3289,15 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	tp->snd_una = ack;
 }
 
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+	u32 delta = seq - tp->rcv_nxt;
+
+	tp->bytes_received += delta;
+	tp->rcv_nxt = seq;
+}
+
 /* Update our send window.
  *
  * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -4245,7 +4254,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
-		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 		if (!eaten)
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4413,7 +4422,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 	__skb_pull(skb, hdrlen);
 	eaten = (tail &&
 		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
-	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
 	if (!eaten) {
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
 		skb_set_owner_r(skb, sk);
@@ -4506,7 +4515,7 @@ queue_and_out:
 
 			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 		}
-		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 		if (skb->len)
 			tcp_event_data_recv(sk, skb);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5254,7 +5263,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					tcp_rcv_rtt_measure_ts(sk, skb);
 
 					__skb_pull(skb, tcp_header_len);
-					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+					tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 					NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
 					eaten = 1;
 				}
-- 
cgit v1.2.3