From f672258391b42a5c7cc2732c9c063e56a85c8dbe Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 16 Oct 2015 21:57:42 -0700 Subject: tcp: track min RTT using windowed min-filter Kathleen Nichols' algorithm for tracking the minimum RTT of a data stream over some measurement window. It uses constant space and constant time per update. Yet it almost always delivers the same minimum as an implementation that has to keep all the data in the window. The measurement window is tunable via sysctl.net.ipv4.tcp_min_rtt_wlen with a default value of 5 minutes. The algorithm keeps track of the best, 2nd best & 3rd best min values, maintaining an invariant that the measurement time of the n'th best >= n-1'th best. It also makes sure that the three values are widely separated in the time window since that bounds the worse case error when that data is monotonically increasing over the window. Upon getting a new min, we can forget everything earlier because it has no value - the new min is less than everything else in the window by definition and it's the most recent. So we restart fresh on every new min and overwrites the 2nd & 3rd choices. The same property holds for the 2nd & 3rd best. Therefore we have to maintain two invariants to maximize the information in the samples, one on values (1st.v <= 2nd.v <= 3rd.v) and the other on times (now-win <=1st.t <= 2nd.t <= 3rd.t <= now). These invariants determine the structure of the code The RTT input to the windowed filter is the minimum RTT measured from ACK or SACK, or as the last resort from TCP timestamps. The accessor tcp_min_rtt() returns the minimum RTT seen in the window. ~0U indicates it is not available. The minimum is 1usec even if the true RTT is below that. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 86a7edaa6797..90edef5508f9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -217,6 +217,9 @@ struct tcp_sock { u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ + struct rtt_meas { + u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ + } rtt_min[3]; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ -- cgit v1.2.3 From af82f4e84866ecd360a53f770d6217637116e6c1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 16 Oct 2015 21:57:43 -0700 Subject: tcp: remove tcp_mark_lost_retrans() Remove the existing lost retransmit detection because RACK subsumes it completely. This also stops the overloading the ack_seq field of the skb control block. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 -- net/ipv4/tcp_input.c | 65 --------------------------------------------------- net/ipv4/tcp_output.c | 6 ----- 3 files changed, 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90edef5508f9..8c54863dfc38 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -283,8 +283,6 @@ struct tcp_sock { int lost_cnt_hint; u32 retransmit_high; /* L-bits may be on up to this seqno */ - u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ - u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eedb25db3947..5a776897a8c7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1048,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, return !before(start_seq, end_seq - tp->max_window); } -/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "B". Later note: FACK people cheated me again 8), we have to account - * for reordering! Ugly, but should help. - * - * Search retransmitted skbs from write_queue that were sent when snd_nxt was - * less than what is now known to be received by the other end (derived from - * highest SACK block). Also calculate the lowest snd_nxt among the remaining - * retransmitted skbs to avoid some costly processing per ACKs. - */ -static void tcp_mark_lost_retrans(struct sock *sk, int *flag) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt = 0; - u32 new_low_seq = tp->snd_nxt; - u32 received_upto = tcp_highest_sack_seq(tp); - - if (!tcp_is_fack(tp) || !tp->retrans_out || - !after(received_upto, tp->lost_retrans_low) || - icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - tcp_for_write_queue(skb, sk) { - u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; - - if (skb == tcp_send_head(sk)) - break; - if (cnt == tp->retrans_out) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) - continue; - - /* TODO: We would like to get rid of tcp_is_fack(tp) only - * constraint here (see above) but figuring out that at - * least tp->reordering SACK blocks reside between ack_seq - * and received_upto is not easy task to do cheaply with - * the available datastructures. - * - * Whether FACK should check here for tp->reordering segs - * in-between one could argue for either way (it would be - * rather simple to implement as we could count fack_count - * during the walk and do tp->fackets_out - fack_count). - */ - if (after(received_upto, ack_seq)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - *flag |= FLAG_LOST_RETRANS; - tcp_skb_mark_lost_uncond_verify(tp, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); - } else { - if (before(ack_seq, new_low_seq)) - new_low_seq = ack_seq; - cnt += tcp_skb_pcount(skb); - } - } - - if (tp->retrans_out) - tp->lost_retrans_low = new_low_seq; -} - static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1838,7 +1774,6 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 19adedb8c5cc..f6f7f9b4901b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("retrans_out leaked\n"); } #endif - if (!tp->retrans_out) - tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, - * see tcp_input.c tcp_sacktag_write_queue(). - */ - TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } -- cgit v1.2.3 From 625a5e109a3ed6f36a1008a43069a3462b44a424 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 16 Oct 2015 21:57:45 -0700 Subject: tcp: skb_mstamp_after helper a helper to prepare the first main RACK patch. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4398411236f1..24f4dfd94c51 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, return delta_us; } +static inline bool skb_mstamp_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t0) +{ + s32 diff = t1->stamp_jiffies - t0->stamp_jiffies; + + if (!diff) + diff = t1->stamp_us - t0->stamp_us; + return diff > 0; +} /** * struct sk_buff - socket buffer -- cgit v1.2.3 From 659a8ad56f490279f0efee43a62ffa1ac914a4e0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 16 Oct 2015 21:57:46 -0700 Subject: tcp: track the packet timings in RACK This patch is the first half of the RACK loss recovery. RACK loss recovery uses the notion of time instead of packet sequence (FACK) or counts (dupthresh). It's inspired by the previous FACK heuristic in tcp_mark_lost_retrans(): when a limited transmit (new data packet) is sacked, then current retransmitted sequence below the newly sacked sequence must been lost, since at least one round trip time has elapsed. But it has several limitations: 1) can't detect tail drops since it depends on limited transmit 2) is disabled upon reordering (assumes no reordering) 3) only enabled in fast recovery ut not timeout recovery RACK (Recently ACK) addresses these limitations with the notion of time instead: a packet P1 is lost if a later packet P2 is s/acked, as at least one round trip has passed. Since RACK cares about the time sequence instead of the data sequence of packets, it can detect tail drops when later retransmission is s/acked while FACK or dupthresh can't. For reordering RACK uses a dynamically adjusted reordering window ("reo_wnd") to reduce false positives on ever (small) degree of reordering. This patch implements tcp_advanced_rack() which tracks the most recent transmission time among the packets that have been delivered (ACKed or SACKed) in tp->rack.mstamp. This timestamp is the key to determine which packet has been lost. Consider an example that the sender sends six packets: T1: P1 (lost) T2: P2 T3: P3 T4: P4 T100: sack of P2. rack.mstamp = T2 T101: retransmit P1 T102: sack of P2,P3,P4. rack.mstamp = T4 T205: ACK of P4 since the hole is repaired. rack.mstamp = T101 We need to be careful about spurious retransmission because it may falsely advance tp->rack.mstamp by an RTT or an RTO, causing RACK to falsely mark all packets lost, just like a spurious timeout. We identify spurious retransmission by the ACK's TS echo value. If TS option is not applicable but the retransmission is acknowledged less than min-RTT ago, it is likely to be spurious. We refrain from using the transmission time of these spurious retransmissions. The second half is implemented in the next patch that marks packet lost using RACK timestamp. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++++ include/net/tcp.h | 5 +++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_input.c | 14 ++++++++++++++ net/ipv4/tcp_minisocks.c | 2 ++ net/ipv4/tcp_recovery.c | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 60 insertions(+) create mode 100644 net/ipv4/tcp_recovery.c (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8c54863dfc38..5dce9705fe84 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -194,6 +194,12 @@ struct tcp_sock { u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ + /* Information of the most recently (s)acked skb */ + struct tcp_rack { + struct skb_mstamp mstamp; /* (Re)sent time of the skb */ + u8 advanced; /* mstamp advanced since last lost marking */ + u8 reord; /* reordering detected */ + } rack; u16 advmss; /* Advertised MSS */ u8 unused; u8 nonagle : 4,/* Disable Nagle algorithm? */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a43152229ea..3c3a9fe057d3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1750,6 +1750,11 @@ int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); +/* tcp_recovery.c */ + +extern void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked); + /* * Save and compile IPv4 options, return a pointer to it */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89aacb630a53..c29809f765dc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ + tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1e97e73e5ecf..ce8370525832 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1173,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { + tcp_rack_advance(tp, xmit_time, sacked); + if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing @@ -2256,6 +2258,16 @@ static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) before(tp->rx_opt.rcv_tsecr, when); } +/* skb is spurious retransmitted if the returned timestamp echo + * reply is prior to the skb transmission time + */ +static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, + const struct sk_buff *skb) +{ + return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); +} + /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ @@ -3135,6 +3147,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= acked_pcount; + else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b875c288daaa..1fd5d413a664 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->rack.mstamp.v64 = 0; + newtp->rack.advanced = 0; newtp->saved_syn = req->saved_syn; req->saved_syn = NULL; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c new file mode 100644 index 000000000000..8f66a6584845 --- /dev/null +++ b/net/ipv4/tcp_recovery.c @@ -0,0 +1,32 @@ +#include +#include + +/* Record the most recently (re)sent time among the (s)acked packets */ +void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked) +{ + if (tp->rack.mstamp.v64 && + !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + return; + + if (sacked & TCPCB_RETRANS) { + struct skb_mstamp now; + + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + skb_mstamp_get(&now); + if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + return; + } + + tp->rack.mstamp = *xmit_time; + tp->rack.advanced = 1; +} -- cgit v1.2.3