From 1b7fdd2ab5852717a4fc7d79847759c67065d7e9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 30 Aug 2013 08:35:53 -0700 Subject: tcp: do not use cached RTT for RTT estimation RTT cached in the TCP metrics are valuable for the initial timeout because SYN RTT usually does not account for serialization delays on low BW path. However using it to seed the RTT estimator maybe disruptive because other components (e.g., pacing) require the smooth RTT to be obtained from actual connection. The solution is to use the higher cached RTT to set the first RTO conservatively like tcp_rtt_estimator(), but avoid seeding the other RTT estimator variables such as srtt. It is also a good idea to keep RTO conservative to obtain the first RTT sample, and the performance is insured by TCP loss probe if SYN RTT is available. To keep the seeding formula consistent across SYN RTT and cached RTT, the rttvar is twice the cached RTT instead of cached RTTVAR value. The reason is because cached variation may be too small (near min RTO) which defeats the purpose of being conservative on first RTO. However the metrics still keep the RTT variations as they might be useful for user applications (through ip). Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f6a005c485a9..273ed735cca2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk) struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_metrics_block *tm; - u32 val; + u32 val, crtt = 0; /* cached RTT scaled by 8 */ if (dst == NULL) goto reset; @@ -478,40 +478,18 @@ void tcp_init_metrics(struct sock *sk) tp->reordering = val; } - val = tcp_metric_get(tm, TCP_METRIC_RTT); - if (val == 0 || tp->srtt == 0) { - rcu_read_unlock(); - goto reset; - } - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. - * - * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal circumstances sending small - * packets force peer to delay ACKs and calculation is correct too. - * The algorithm is adaptive and, provided we follow specs, it - * NEVER underestimate RTT. BUT! If peer tries to make some clever - * tricks sort of "quick acks" for time long enough to decrease RTT - * to low value, and then abruptly stops to do it and starts to delay - * ACKs, wait for troubles. - */ - val = msecs_to_jiffies(val); - if (val > tp->srtt) { - tp->srtt = val; - tp->rtt_seq = tp->snd_nxt; - } - val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); - if (val > tp->mdev) { - tp->mdev = val; - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } + crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); rcu_read_unlock(); - - tcp_set_rto(sk); reset: - if (tp->srtt == 0) { + if (crtt > tp->srtt) { + /* Initial RTT (tp->srtt) from SYN usually don't measure + * serialization delay on low BW links well so RTO may be + * under-estimated. Stay conservative and seed RTO with + * the RTTs from past data exchanges, using the same seeding + * formula in tcp_rtt_estimator(). + */ + inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk)); + } else if (tp->srtt == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs -- cgit v1.2.3 From 52f20e655d9f6f7f937a1cdacf219d9df3ab6166 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 3 Sep 2013 14:14:35 -0700 Subject: tcp: better comments for RTO initiallization Commit 1b7fdd2ab585("tcp: do not use cached RTT for RTT estimation") removes important comments on how RTO is initialized and updated. Hopefully this patch puts those information back. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 273ed735cca2..4a22f3e715df 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -481,13 +481,27 @@ void tcp_init_metrics(struct sock *sk) crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: + /* The initial RTT measurement from the SYN/SYN-ACK is not ideal + * to seed the RTO for later data packets because SYN packets are + * small. Use the per-dst cached values to seed the RTO but keep + * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). + * Later the RTO will be updated immediately upon obtaining the first + * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only + * influences the first RTO but not later RTT estimation. + * + * But if RTT is not available from the SYN (due to retransmits or + * syn cookies) or the cache, force a conservative 3secs timeout. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal circumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ if (crtt > tp->srtt) { - /* Initial RTT (tp->srtt) from SYN usually don't measure - * serialization delay on low BW links well so RTO may be - * under-estimated. Stay conservative and seed RTO with - * the RTTs from past data exchanges, using the same seeding - * formula in tcp_rtt_estimator(). - */ inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk)); } else if (tp->srtt == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from -- cgit v1.2.3 From 269aa759b474570fa642452742741525cfc226a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 16 Sep 2013 21:44:20 -0400 Subject: tcp: fix RTO calculated from cached RTT Commit 1b7fdd2ab5852 ("tcp: do not use cached RTT for RTT estimation") did not correctly account for the fact that crtt is the RTT shifted left 3 bits. Fix the calculation to consistently reflect this fact. Signed-off-by: Neal Cardwell Cc: Eric Dumazet Cc: Yuchung Cheng Acked-by: Eric Dumazet Acked-By: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4a22f3e715df..52f3c6b971d2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -502,7 +502,9 @@ reset: * ACKs, wait for troubles. */ if (crtt > tp->srtt) { - inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk)); + /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ + crtt >>= 3; + inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, -- cgit v1.2.3