From f672258391b42a5c7cc2732c9c063e56a85c8dbe Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 16 Oct 2015 21:57:42 -0700
Subject: tcp: track min RTT using windowed min-filter

Kathleen Nichols' algorithm for tracking the minimum RTT of a
data stream over some measurement window. It uses constant space
and constant time per update. Yet it almost always delivers
the same minimum as an implementation that has to keep all
the data in the window. The measurement window is tunable via
sysctl.net.ipv4.tcp_min_rtt_wlen with a default value of 5 minutes.

The algorithm keeps track of the best, 2nd best & 3rd best min
values, maintaining an invariant that the measurement time of
the n'th best >= n-1'th best. It also makes sure that the three
values are widely separated in the time window since that bounds
the worse case error when that data is monotonically increasing
over the window.

Upon getting a new min, we can forget everything earlier because
it has no value - the new min is less than everything else in the
window by definition and it's the most recent. So we restart fresh
on every new min and overwrites the 2nd & 3rd choices. The same
property holds for the 2nd & 3rd best.

Therefore we have to maintain two invariants to maximize the
information in the samples, one on values (1st.v <= 2nd.v <=
3rd.v) and the other on times (now-win <=1st.t <= 2nd.t <= 3rd.t <=
now). These invariants determine the structure of the code

The RTT input to the windowed filter is the minimum RTT measured
from ACK or SACK, or as the last resort from TCP timestamps.

The accessor tcp_min_rtt() returns the minimum RTT seen in the
window. ~0U indicates it is not available. The minimum is 1usec
even if the true RTT is below that.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 86a7edaa6797..90edef5508f9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -217,6 +217,9 @@ struct tcp_sock {
 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
 	u32	rttvar_us;	/* smoothed mdev_max			*/
 	u32	rtt_seq;	/* sequence number to update rttvar	*/
+	struct rtt_meas {
+		u32 rtt, ts;	/* RTT in usec and sampling time in jiffies. */
+	} rtt_min[3];
 
 	u32	packets_out;	/* Packets which are "in flight"	*/
 	u32	retrans_out;	/* Retransmitted packets out		*/
-- 
cgit v1.2.3


From af82f4e84866ecd360a53f770d6217637116e6c1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 16 Oct 2015 21:57:43 -0700
Subject: tcp: remove tcp_mark_lost_retrans()

Remove the existing lost retransmit detection because RACK subsumes
it completely. This also stops the overloading the ack_seq field of
the skb control block.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  2 --
 net/ipv4/tcp_input.c  | 65 ---------------------------------------------------
 net/ipv4/tcp_output.c |  6 -----
 3 files changed, 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 90edef5508f9..8c54863dfc38 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -283,8 +283,6 @@ struct tcp_sock {
 	int     lost_cnt_hint;
 	u32     retransmit_high;	/* L-bits may be on up to this seqno */
 
-	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */
-
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	u32	high_seq;	/* snd_nxt at onset of congestion	*/
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eedb25db3947..5a776897a8c7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1048,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
 	return !before(start_seq, end_seq - tp->max_window);
 }
 
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	int cnt = 0;
-	u32 new_low_seq = tp->snd_nxt;
-	u32 received_upto = tcp_highest_sack_seq(tp);
-
-	if (!tcp_is_fack(tp) || !tp->retrans_out ||
-	    !after(received_upto, tp->lost_retrans_low) ||
-	    icsk->icsk_ca_state != TCP_CA_Recovery)
-		return;
-
-	tcp_for_write_queue(skb, sk) {
-		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
-		if (skb == tcp_send_head(sk))
-			break;
-		if (cnt == tp->retrans_out)
-			break;
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			continue;
-
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
-			continue;
-
-		/* TODO: We would like to get rid of tcp_is_fack(tp) only
-		 * constraint here (see above) but figuring out that at
-		 * least tp->reordering SACK blocks reside between ack_seq
-		 * and received_upto is not easy task to do cheaply with
-		 * the available datastructures.
-		 *
-		 * Whether FACK should check here for tp->reordering segs
-		 * in-between one could argue for either way (it would be
-		 * rather simple to implement as we could count fack_count
-		 * during the walk and do tp->fackets_out - fack_count).
-		 */
-		if (after(received_upto, ack_seq)) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-			tp->retrans_out -= tcp_skb_pcount(skb);
-			*flag |= FLAG_LOST_RETRANS;
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
-		} else {
-			if (before(ack_seq, new_low_seq))
-				new_low_seq = ack_seq;
-			cnt += tcp_skb_pcount(skb);
-		}
-	}
-
-	if (tp->retrans_out)
-		tp->lost_retrans_low = new_low_seq;
-}
-
 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 			    struct tcp_sack_block_wire *sp, int num_sacks,
 			    u32 prior_snd_una)
@@ -1838,7 +1774,6 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 out:
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19adedb8c5cc..f6f7f9b4901b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			net_dbg_ratelimited("retrans_out leaked\n");
 		}
 #endif
-		if (!tp->retrans_out)
-			tp->lost_retrans_low = tp->snd_nxt;
 		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
 		tp->retrans_out += tcp_skb_pcount(skb);
 
@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
-		/* snd_nxt is stored to detect loss of retransmitted segment,
-		 * see tcp_input.c tcp_sacktag_write_queue().
-		 */
-		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
 	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
-- 
cgit v1.2.3


From 625a5e109a3ed6f36a1008a43069a3462b44a424 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 16 Oct 2015 21:57:45 -0700
Subject: tcp: skb_mstamp_after helper

a helper to prepare the first main RACK patch.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4398411236f1..24f4dfd94c51 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
 	return delta_us;
 }
 
+static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
+				    const struct skb_mstamp *t0)
+{
+	s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
+
+	if (!diff)
+		diff = t1->stamp_us - t0->stamp_us;
+	return diff > 0;
+}
 
 /** 
  *	struct sk_buff - socket buffer
-- 
cgit v1.2.3


From 659a8ad56f490279f0efee43a62ffa1ac914a4e0 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 16 Oct 2015 21:57:46 -0700
Subject: tcp: track the packet timings in RACK

This patch is the first half of the RACK loss recovery.

RACK loss recovery uses the notion of time instead
of packet sequence (FACK) or counts (dupthresh). It's inspired by the
previous FACK heuristic in tcp_mark_lost_retrans(): when a limited
transmit (new data packet) is sacked, then current retransmitted
sequence below the newly sacked sequence must been lost,
since at least one round trip time has elapsed.

But it has several limitations:
1) can't detect tail drops since it depends on limited transmit
2) is disabled upon reordering (assumes no reordering)
3) only enabled in fast recovery ut not timeout recovery

RACK (Recently ACK) addresses these limitations with the notion
of time instead: a packet P1 is lost if a later packet P2 is s/acked,
as at least one round trip has passed.

Since RACK cares about the time sequence instead of the data sequence
of packets, it can detect tail drops when later retransmission is
s/acked while FACK or dupthresh can't. For reordering RACK uses a
dynamically adjusted reordering window ("reo_wnd") to reduce false
positives on ever (small) degree of reordering.

This patch implements tcp_advanced_rack() which tracks the
most recent transmission time among the packets that have been
delivered (ACKed or SACKed) in tp->rack.mstamp. This timestamp
is the key to determine which packet has been lost.

Consider an example that the sender sends six packets:
T1: P1 (lost)
T2: P2
T3: P3
T4: P4
T100: sack of P2. rack.mstamp = T2
T101: retransmit P1
T102: sack of P2,P3,P4. rack.mstamp = T4
T205: ACK of P4 since the hole is repaired. rack.mstamp = T101

We need to be careful about spurious retransmission because it may
falsely advance tp->rack.mstamp by an RTT or an RTO, causing RACK
to falsely mark all packets lost, just like a spurious timeout.

We identify spurious retransmission by the ACK's TS echo value.
If TS option is not applicable but the retransmission is acknowledged
less than min-RTT ago, it is likely to be spurious. We refrain from
using the transmission time of these spurious retransmissions.

The second half is implemented in the next patch that marks packet
lost using RACK timestamp.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  6 ++++++
 include/net/tcp.h        |  5 +++++
 net/ipv4/Makefile        |  1 +
 net/ipv4/tcp_input.c     | 14 ++++++++++++++
 net/ipv4/tcp_minisocks.c |  2 ++
 net/ipv4/tcp_recovery.c  | 32 ++++++++++++++++++++++++++++++++
 6 files changed, 60 insertions(+)
 create mode 100644 net/ipv4/tcp_recovery.c

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8c54863dfc38..5dce9705fe84 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -194,6 +194,12 @@ struct tcp_sock {
 	u32	window_clamp;	/* Maximal window to advertise		*/
 	u32	rcv_ssthresh;	/* Current window clamp			*/
 
+	/* Information of the most recently (s)acked skb */
+	struct tcp_rack {
+		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+		u8 advanced; /* mstamp advanced since last lost marking */
+		u8 reord;    /* reordering detected */
+	} rack;
 	u16	advmss;		/* Advertised MSS			*/
 	u8	unused;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4a43152229ea..3c3a9fe057d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1750,6 +1750,11 @@ int tcpv4_offload_init(void);
 void tcp_v4_init(void);
 void tcp_init(void);
 
+/* tcp_recovery.c */
+
+extern void tcp_rack_advance(struct tcp_sock *tp,
+			     const struct skb_mstamp *xmit_time, u8 sacked);
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89aacb630a53..c29809f765dc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+	     tcp_recovery.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1e97e73e5ecf..ce8370525832 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1173,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
+		tcp_rack_advance(tp, xmit_time, sacked);
+
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
 			 * we do not clear RETRANS, believing
@@ -2256,6 +2258,16 @@ static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
 	       before(tp->rx_opt.rcv_tsecr, when);
 }
 
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+				     const struct sk_buff *skb)
+{
+	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
@@ -3135,6 +3147,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		if (sacked & TCPCB_SACKED_ACKED)
 			tp->sacked_out -= acked_pcount;
+		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+			tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b875c288daaa..1fd5d413a664 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -548,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		tcp_ecn_openreq_child(newtp, req);
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
+		newtp->rack.mstamp.v64 = 0;
+		newtp->rack.advanced = 0;
 
 		newtp->saved_syn = req->saved_syn;
 		req->saved_syn = NULL;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..8f66a6584845
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,32 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+		      const struct skb_mstamp *xmit_time, u8 sacked)
+{
+	if (tp->rack.mstamp.v64 &&
+	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+		return;
+
+	if (sacked & TCPCB_RETRANS) {
+		struct skb_mstamp now;
+
+		/* If the sacked packet was retransmitted, it's ambiguous
+		 * whether the retransmission or the original (or the prior
+		 * retransmission) was sacked.
+		 *
+		 * If the original is lost, there is no ambiguity. Otherwise
+		 * we assume the original can be delayed up to aRTT + min_rtt.
+		 * the aRTT term is bounded by the fast recovery or timeout,
+		 * so it's at least one RTT (i.e., retransmission is at least
+		 * an RTT later).
+		 */
+		skb_mstamp_get(&now);
+		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+			return;
+	}
+
+	tp->rack.mstamp = *xmit_time;
+	tp->rack.advanced = 1;
+}
-- 
cgit v1.2.3