From 85f16525a2eb66e6092cbd8dcf42371df8334ed0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 11 Jun 2013 15:35:32 -0700 Subject: tcp: properly send new data in fast recovery in first RTT Linux sends new unset data during disorder and recovery state if all (suspected) lost packets have been retransmitted ( RFC5681, section 3.2 step 1 & 2, RFC3517 section 4, NexSeg() Rule 2). One requirement is to keep the receive window about twice the estimated sender's congestion window (tcp_rcv_space_adjust()), assuming the fast retransmits repair the losses in the next round trip. But currently it's not the case on the first round trip in either normal or Fast Open connection, beucase the initial receive window is identical to (expected) sender's initial congestion window. The fix is to double it. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ec335fabd5cc..3dd46eab3b05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -181,6 +181,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } + +u32 tcp_default_init_rwnd(u32 mss) +{ + /* Initial receive window should be twice of TCP_INIT_CWND to + * enable proper sending of new unset data during fast recovery + * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a + * limit when mss is larger than 1460. + */ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > 1460) + init_rwnd = max((1460 * init_rwnd) / mss, 2U); + return init_rwnd; +} + /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -230,22 +245,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to a value enough for senders starting with - * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place - * a limit on the initial window when mss is larger than 1460. - */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = TCP_DEFAULT_INIT_RCVWND; - if (mss > 1460) - init_cwnd = - max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - /* when initializing use the value from init_rcv_wnd - * rather than the default from above - */ - if (init_rcv_wnd) - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - else - *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); } /* Set the clamp no higher than max representable value */ -- cgit v1.2.3 From 9ef71e0c820987c899e454e2e7ef94bc2d4c8d04 Mon Sep 17 00:00:00 2001 From: Weiping Pan Date: Tue, 18 Jun 2013 21:00:31 +0800 Subject: tcp:typo unset should be unsent Signed-off-by: Weiping Pan Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3dd46eab3b05..e2c1333ee318 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -185,7 +185,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) u32 tcp_default_init_rwnd(u32 mss) { /* Initial receive window should be twice of TCP_INIT_CWND to - * enable proper sending of new unset data during fast recovery + * enable proper sending of new unsent data during fast recovery * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a * limit when mss is larger than 1460. */ -- cgit v1.2.3 From bcefe17cffd06efdda3e7ad679ea743236e6271a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 15 Jun 2013 09:39:18 +0800 Subject: tcp: introduce a per-route knob for quick ack In previous discussions, I tried to find some reasonable heuristics for delayed ACK, however this seems not possible, according to Eric: "ACKS might also be delayed because of bidirectional traffic, and is more controlled by the application response time. TCP stack can not easily estimate it." "ACK can be incredibly useful to recover from losses in a short time. The vast majority of TCP sessions are small lived, and we send one ACK per received segment anyway at beginning or retransmits to let the sender smoothly increase its cwnd, so an auto-tuning facility wont help them that much." and according to David: "ACKs are the only information we have to detect loss. And, for the same reasons that TCP VEGAS is fundamentally broken, we cannot measure the pipe or some other receiver-side-visible piece of information to determine when it's "safe" to stretch ACK. And even if it's "safe", we should not do it so that losses are accurately detected and we don't spuriously retransmit. The only way to know when the bandwidth increases is to "test" it, by sending more and more packets until drops happen. That's why all successful congestion control algorithms must operate on explicited tested pieces of information. Similarly, it's not really possible to universally know if it's safe to stretch ACK or not." It still makes sense to enable or disable quick ack mode like what TCP_QUICK_ACK does. Similar to TCP_QUICK_ACK option, but for people who can't modify the source code and still wants to control TCP delayed ACK behavior. As David suggested, this should belong to per-path scope, since different pathes may want different behaviors. Cc: Eric Dumazet Cc: Rick Jones Cc: Stephen Hemminger Cc: "David S. Miller" Cc: Thomas Graf CC: David Laight Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_output.c | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7a2144e1afae..eb0f1a554d7b 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -386,6 +386,8 @@ enum { #define RTAX_RTO_MIN RTAX_RTO_MIN RTAX_INITRWND, #define RTAX_INITRWND RTAX_INITRWND + RTAX_QUICKACK, +#define RTAX_QUICKACK RTAX_QUICKACK __RTAX_MAX }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 46271cdcf088..28af45abe062 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3717,6 +3717,7 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3728,7 +3729,9 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + dst = __sk_dst_get(sk); + if (!dst || !dst_metric(dst, RTAX_QUICKACK)) + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2c1333ee318..3d609490f118 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,6 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -170,8 +171,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ -- cgit v1.2.3 From 24ab6bec80861d0c55263047e8bf97e460a32e7b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 12 Jul 2013 11:33:04 -0700 Subject: tcp: account all retransmit failures Change snmp RETRANSFAILS stat to include timeout retransmit failures in addition to other loss recoveries. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3d609490f118..92fde8d1aa82 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2407,6 +2407,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * see tcp_input.c tcp_sacktag_write_queue(). */ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } else { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; } @@ -2528,10 +2530,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + if (tcp_retransmit_skb(sk, skb)) return; - } + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (tcp_in_cwnd_reduction(sk)) -- cgit v1.2.3