5 files changed, 109 insertions, 2 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 502d6a572b4f..85752c81c5ec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -433,6 +433,15 @@ tcp_orphan_retries - INTEGER
 	you should think about lowering this value, such sockets
 	may consume significant resources. Cf. tcp_max_orphans.
 
+tcp_recovery - INTEGER
+	This value is a bitmap to enable various experimental loss recovery
+	features.
+
+	RACK: 0x1 enables the RACK loss detection for fast detection of lost
+	      retransmissions and tail drops.
+
+	Default: 0x1
+
 tcp_reordering - INTEGER
 	Initial reordering level of packets in a TCP stream.
 	TCP stack can then dynamically adjust flow reordering level
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c3a9fe057d3..11e320412216 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -567,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
@@ -1752,6 +1753,14 @@ void tcp_init(void);
 
 /* tcp_recovery.c */
 
+/* Flags to enable various loss recovery features. See below */
+extern int sysctl_tcp_recovery;
+
+/* Use TCP RACK to detect (some) tail and retransmit losses */
+#define TCP_RACK_LOST_RETRANS  0x1
+
+extern int tcp_rack_mark_lost(struct sock *sk);
+
 extern void tcp_rack_advance(struct tcp_sock *tp,
 			     const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 13ab434c2909..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_recovery",
+		.data		= &sysctl_tcp_recovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_reordering",
 		.data		= &sysctl_tcp_reordering,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ce8370525832..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -881,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
 	if (metric > 0)
 		tcp_disable_early_retrans(tp);
+	tp->rack.reord = 1;
 }
 
 /* This must be called before lost_out is incremented */
@@ -906,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
-					    struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -2806,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+	    tcp_rack_mark_lost(sk))
+		flag |= FLAG_LOST_RETRANS;
+
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 8f66a6584845..5353085fd0b2 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,6 +1,83 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+		return 0;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay
+	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+	 * RTT because reordering is often a path property and less related
+	 * to queuing or delayed ACKs.
+	 *
+	 * TODO: measure and adapt to the observed reordering delay, and
+	 * use a timer to retransmit like the delayed early retransmit.
+	 */
+	reo_wnd = 1000;
+	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+	tcp_for_write_queue(skb, sk) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Skip ones already (s)acked */
+		if (!after(scb->end_seq, tp->snd_una) ||
+		    scb->sacked & TCPCB_SACKED_ACKED)
+			continue;
+
+		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+			if (skb_mstamp_us_delta(&tp->rack.mstamp,
+						&skb->skb_mstamp) <= reo_wnd)
+				continue;
+
+			/* skb is lost if packet sent later is sacked */
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+			if (scb->sacked & TCPCB_SACKED_RETRANS) {
+				scb->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+				NET_INC_STATS_BH(sock_net(sk),
+						 LINUX_MIB_TCPLOSTRETRANSMIT);
+			}
+		} else if (!(scb->sacked & TCPCB_RETRANS)) {
+			/* Original data are sent sequentially so stop early
+			 * b/c the rest are all sent after rack_sent
+			 */
+			break;
+		}
+	}
+	return prior_retrans - tp->retrans_out;
+}
+
 /* Record the most recently (re)sent time among the (s)acked packets */
 void tcp_rack_advance(struct tcp_sock *tp,
 		      const struct skb_mstamp *xmit_time, u8 sacked)