diff options
author | David S. Miller <davem@davemloft.net> | 2017-01-14 06:37:18 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-01-14 06:37:18 +0300 |
commit | 718e14bb292a2e16b506133d191886110417df51 (patch) | |
tree | a889a29b8bbad8c1c64d339273bcdeea67d63299 /net/ipv4/tcp_recovery.c | |
parent | 7410191afcaca3a49bb29bfb5e15f81d7b336984 (diff) | |
parent | 94bdc9785a1136cef6a982b042719783978e8a26 (diff) | |
download | linux-718e14bb292a2e16b506133d191886110417df51.tar.xz |
Merge branch 'tcp-RACK-fast-recovery'
Yuchung Cheng says:
====================
tcp: RACK fast recovery
The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
to trigger fast recovery with a reordering timer.
Previously RACK has been running in auxiliary mode where it is
used to detect packet losses once the recovery has triggered by
other algorithms (e.g., FACK). By inspecting packet timestamps,
RACK can start ACK-driven repairs timely. A few similar heuristics
are no longer needed and are either removed or disabled to reduce
the complexity of the Linux TCP loss recovery engine:
1. FACK (Forward Acknowledgement)
2. Early Retransmit (RFC5827)
3. thin_dupack (fast recovery on single DUPACK for thin-streams)
4. NCR (Non-Congestion Robustness RFC4653) (RFC4653)
5. Forward Retransmit
After this change, Linux's loss recovery algorithms consist of
1. Conventional DUPACK threshold approach (RFC6675)
2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
3. RTO plus F-RTO extension (RFC5682)
The patch set has been tested on Google servers extensively and
presented in several IETF meetings. The data suggests that RACK
successfully improves recovery performance:
https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_recovery.c')
-rw-r--r-- | net/ipv4/tcp_recovery.c | 148 |
1 files changed, 111 insertions, 37 deletions
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index e36df4fcfeba..4ecb38ae8504 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,9 +1,32 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; -/* Marks a packet lost, if some packet sent later has been (s)acked. +static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); + } +} + +static bool tcp_rack_sent_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t2, + u32 seq1, u32 seq2) +{ + return skb_mstamp_after(t1, t2) || + (t1->v64 == t2->v64 && after(seq1, seq2)); +} + +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. + * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ -int tcp_rack_mark_lost(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, + u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - u32 reo_wnd, prior_retrans = tp->retrans_out; - - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) - return 0; - - /* Reset the advanced flag to avoid unnecessary queue scanning */ - tp->rack.advanced = 0; + u32 reo_wnd; + *reo_timeout = 0; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * RTT because reordering is often a path property and less related * to queuing or delayed ACKs. - * - * TODO: measure and adapt to the observed reordering delay, and - * use a timer to retransmit like the delayed early retransmit. */ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. + */ + u32 elapsed = skb_mstamp_us_delta(now, + &skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - if (skb_mstamp_us_delta(&tp->rack.mstamp, - &skb->skb_mstamp) <= reo_wnd) + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); continue; - - /* skb is lost if packet sent later is sacked */ - tcp_skb_mark_lost_uncond_verify(tp, skb); - if (scb->sacked & TCPCB_SACKED_RETRANS) { - scb->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); } + + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* Record maximum wait time (+1 to avoid 0) */ + *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk) break; } } - return prior_retrans - tp->retrans_out; } -/* Record the most recently (re)sent time among the (s)acked packets */ -void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked) +void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout; + + if (!tp->rack.advanced) + return; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk, now, &timeout); + if (timeout) { + timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } +} + +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time) { + u32 rtt_us; + if (tp->rack.mstamp.v64 && - !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + end_seq, tp->rack.end_seq)) return; + rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); if (sacked & TCPCB_RETRANS) { - struct skb_mstamp now; - /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - skb_mstamp_get(&now); - if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + if (rtt_us < tcp_min_rtt(tp)) return; } - + tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; + tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } + +/* We have waited long enough to accommodate reordering. Mark the expired + * packets lost and retransmit them. + */ +void tcp_rack_reo_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + u32 timeout, prior_inflight; + + skb_mstamp_get(&now); + prior_inflight = tcp_packets_in_flight(tp); + tcp_rack_detect_loss(sk, &now, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { + tcp_enter_recovery(sk, false); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_cwnd_reduction(sk, 1, 0); + } + tcp_xmit_retransmit_queue(sk); + } + if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) + tcp_rearm_rto(sk); +} |