diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 9 | ||||
-rw-r--r-- | include/net/tcp.h | 9 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp_recovery.c | 77 |
5 files changed, 109 insertions, 2 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 502d6a572b4f..85752c81c5ec 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -433,6 +433,15 @@ tcp_orphan_retries - INTEGER you should think about lowering this value, such sockets may consume significant resources. Cf. tcp_max_orphans. +tcp_recovery - INTEGER + This value is a bitmap to enable various experimental loss recovery + features. + + RACK: 0x1 enables the RACK loss detection for fast detection of lost + retransmissions and tail drops. + + Default: 0x1 + tcp_reordering - INTEGER Initial reordering level of packets in a TCP stream. TCP stack can then dynamically adjust flow reordering level diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c3a9fe057d3..11e320412216 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -567,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); @@ -1752,6 +1753,14 @@ void tcp_init(void); /* tcp_recovery.c */ +/* Flags to enable various loss recovery features. See below */ +extern int sysctl_tcp_recovery; + +/* Use TCP RACK to detect (some) tail and retransmit losses */ +#define TCP_RACK_LOST_RETRANS 0x1 + +extern int tcp_rack_mark_lost(struct sock *sk); + extern void tcp_rack_advance(struct tcp_sock *tp, const struct skb_mstamp *xmit_time, u8 sacked); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 13ab434c2909..25300c5e283b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_recovery", + .data = &sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ce8370525832..fdd88c3803a6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -881,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > 0) tcp_disable_early_retrans(tp); + tp->rack.reord = 1; } /* This must be called before lost_out is incremented */ @@ -906,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, - struct sk_buff *skb) +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -2806,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && + tcp_rack_mark_lost(sk)) + flag |= FLAG_LOST_RETRANS; + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 8f66a6584845..5353085fd0b2 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,6 +1,83 @@ #include <linux/tcp.h> #include <net/tcp.h> +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; + +/* Marks a packet lost, if some packet sent later has been (s)acked. + * The underlying idea is similar to the traditional dupthresh and FACK + * but they look at different metrics: + * + * dupthresh: 3 OOO packets delivered (packet count) + * FACK: sequence delta to highest sacked sequence (sequence space) + * RACK: sent time delta to the latest delivered packet (time domain) + * + * The advantage of RACK is it applies to both original and retransmitted + * packet and therefore is robust against tail losses. Another advantage + * is being more resilient to reordering by simply allowing some + * "settling delay", instead of tweaking the dupthresh. + * + * The current version is only used after recovery starts but can be + * easily extended to detect the first loss. + */ +int tcp_rack_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + u32 reo_wnd, prior_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + return 0; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + + /* To be more reordering resilient, allow min_rtt/4 settling delay + * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed + * RTT because reordering is often a path property and less related + * to queuing or delayed ACKs. + * + * TODO: measure and adapt to the observed reordering delay, and + * use a timer to retransmit like the delayed early retransmit. + */ + reo_wnd = 1000; + if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + + tcp_for_write_queue(skb, sk) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (skb == tcp_send_head(sk)) + break; + + /* Skip ones already (s)acked */ + if (!after(scb->end_seq, tp->snd_una) || + scb->sacked & TCPCB_SACKED_ACKED) + continue; + + if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + + if (skb_mstamp_us_delta(&tp->rack.mstamp, + &skb->skb_mstamp) <= reo_wnd) + continue; + + /* skb is lost if packet sent later is sacked */ + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (scb->sacked & TCPCB_SACKED_RETRANS) { + scb->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); + } + } else if (!(scb->sacked & TCPCB_RETRANS)) { + /* Original data are sent sequentially so stop early + * b/c the rest are all sent after rack_sent + */ + break; + } + } + return prior_retrans - tp->retrans_out; +} + /* Record the most recently (re)sent time among the (s)acked packets */ void tcp_rack_advance(struct tcp_sock *tp, const struct skb_mstamp *xmit_time, u8 sacked) |