From 5ffc02a158997b1eb91ade8d02bcf521ff79a218 Mon Sep 17 00:00:00 2001 From: Satoru SATOH Date: Sun, 4 May 2008 22:14:42 -0700 Subject: ip: Use inline function dst_metric() instead of direct access to dst->metric[] There are functions to refer to the value of dst->metric[THE_METRIC-1] directly without use of a inline function "dst_metric" defined in net/dst.h. The following patch changes them to use the inline function consistently. Signed-off-by: Satoru SATOH Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eda4f4a233f3..8ac15a604e08 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -605,7 +606,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst->metrics[RTAX_RTO_MIN - 1]; + rto_min = dst_metric(dst, RTAX_RTO_MIN); return rto_min; } @@ -769,7 +770,7 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= - (dst->metrics[RTAX_RTTVAR-1] - m)>>2; + (dst_metric(dst, RTAX_RTTVAR) - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { @@ -788,21 +789,21 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_SSTHRESH-1] = max(tp->snd_cwnd >> 1, tp->snd_ssthresh); if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; + dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; - if (dst->metrics[RTAX_SSTHRESH-1] && + dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; + if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && - tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) + tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } if (!dst_metric_locked(dst, RTAX_REORDERING)) { - if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && + if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && tp->reordering != sysctl_tcp_reordering) dst->metrics[RTAX_REORDERING-1] = tp->reordering; } -- cgit v1.2.3 From 62ab22278308a40bcb7f4079e9719ab8b7fe11b5 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 8 May 2008 01:09:11 -0700 Subject: tcp FRTO: SACK variant is errorneously used with NewReno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: there's actually another bug in FRTO's SACK variant, which is the causing failure in NewReno case because of the error that's fixed here. I'll fix the SACK case separately (it's a separate bug really, though related, but in order to fix that I need to audit tp->snd_nxt usage a bit). There were two places where SACK variant of FRTO is getting incorrectly used even if SACK wasn't negotiated by the TCP flow. This leads to incorrect setting of frto_highmark with NewReno if a previous recovery was interrupted by another RTO. An eventual fallback to conventional recovery then incorrectly considers one or couple of segments as forward transmissions though they weren't, which then are not LOST marked during fallback making them "non-retransmittable" until the next RTO. In a bad case, those segments are really lost and are the only one left in the window. Thus TCP needs another RTO to continue. The next FRTO, however, could again repeat the same events making the progress of the TCP flow extremely slow. In order for these events to occur at all, FRTO must occur again in FRTOs step 3 while the key segments must be lost as well, which is not too likely in practice. It seems to most frequently with some small devices such as network printers that *seem* to accept TCP segments only in-order. In cases were key segments weren't lost, things get automatically resolved because those wrongly marked segments don't need to be retransmitted in order to continue. I found a reproducer after digging up relevant reports (few reports in total, none at netdev or lkml I know of), some cases seemed to indicate middlebox issues which seems now to be a false assumption some people had made. Bugzilla #10063 _might_ be related. Damon L. Chesser had a reproducable case and was kind enough to tcpdump it for me. With the tcpdump log it was quite trivial to figure out. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ac15a604e08..26c936930e92 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -114,8 +114,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) -#define IsSackFrto() (sysctl_tcp_frto == 0x2) - #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -1686,6 +1684,11 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } +static int tcp_is_sackfrto(const struct tcp_sock *tp) +{ + return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); +} + /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ @@ -1702,7 +1705,7 @@ int tcp_use_frto(struct sock *sk) if (icsk->icsk_mtup.probe_size) return 0; - if (IsSackFrto()) + if (tcp_is_sackfrto(tp)) return 1; /* Avoid expensive walking of rexmit queue if possible */ @@ -1792,7 +1795,7 @@ void tcp_enter_frto(struct sock *sk) /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. */ - if (IsSackFrto() && (tp->frto_counter || + if (tcp_is_sackfrto(tp) && (tp->frto_counter || ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && after(tp->high_seq, tp->snd_una)) { tp->frto_highmark = tp->high_seq; @@ -3124,7 +3127,7 @@ static int tcp_process_frto(struct sock *sk, int flag) return 1; } - if (!IsSackFrto() || tcp_is_reno(tp)) { + if (!tcp_is_sackfrto(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate -- cgit v1.2.3 From a1c1f281b84a751fdb5ff919da3b09df7297619f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 13 May 2008 02:53:26 -0700 Subject: tcp FRTO: Fix fallback to conventional recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that commit 009a2e3e4ec ("[TCP] FRTO: Improve interoperability with other undo_marker users") run into another land-mine which caused fallback to conventional recovery to break: 1. Cumulative ACK arrives after FRTO retransmission 2. tcp_try_to_open sees zero retrans_out, clears retrans_stamp which should be kept like in CA_Loss state it would be 3. undo_marker change allowed tcp_packet_delayed to return true because of the cleared retrans_stamp once FRTO is terminated causing LossUndo to occur, which means all loss markings FRTO made are reverted. This means that the conventional recovery basically recovered one loss per RTT, which is not that efficient. It was quite unobvious that the undo_marker change broken something like this, I had a quite long session to track it down because of the non-intuitiviness of the bug (luckily I had a trivial reproducer at hand and I was also able to learn to use kprobes in the process as well :-)). This together with the NewReno+FRTO fix and FRTO in-order workaround this fixes Damon's problems, this and the first mentioned are enough to fix Bugzilla #10063. Signed-off-by: Ilpo Järvinen Reported-by: Damon L. Chesser Tested-by: Damon L. Chesser Tested-by: Sebastian Hyrwall Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 26c936930e92..d6edb98fd526 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2482,7 +2482,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_verify_left_out(tp); - if (tp->retrans_out == 0) + if (!tp->frto_counter && tp->retrans_out == 0) tp->retrans_stamp = 0; if (flag & FLAG_ECE) -- cgit v1.2.3 From 79d44516b4b178ffb6e2159c75584cfcfc097914 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 13 May 2008 02:54:19 -0700 Subject: tcp FRTO: work-around inorder receivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If receiver consumes segments successfully only in-order, FRTO fallback to conventional recovery produces RTO loop because FRTO's forward transmissions will always get dropped and need to be resent, yet by default they're not marked as lost (which are the only segments we will retransmit in CA_Loss). Price to pay about this is occassionally unnecessarily retransmitting the forward transmission(s). SACK blocks help a bit to avoid this, so it's mainly a concern for NewReno case though SACK is not fully immune either. This change has a side-effect of fixing SACKFRTO problem where it didn't have snd_nxt of the RTO time available anymore when fallback become necessary (this problem would have only occured when RTO would occur for two or more segments and ECE arrives in step 3; no need to figure out how to fix that unless the TODO item of selective behavior is considered in future). Signed-off-by: Ilpo Järvinen Reported-by: Damon L. Chesser Tested-by: Damon L. Chesser Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6edb98fd526..b54d9d37b636 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1842,9 +1842,16 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } - /* Don't lost mark skbs that were fwd transmitted after RTO */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && - !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { + /* Marking forward transmissions that were made after RTO lost + * can cause unnecessary retransmissions in some scenarios, + * SACK blocks will mitigate that in some but not in all cases. + * We used to not mark them but it was causing break-ups with + * receivers that do only in-order receival. + * + * TODO: we could detect presence of such receiver and select + * different behavior per flow. + */ + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); } @@ -1860,7 +1867,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); - tp->high_seq = tp->frto_highmark; + tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); tcp_clear_retrans_hints_partial(tp); -- cgit v1.2.3