summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-05-17 22:41:30 +0300
committerDavid S. Miller <davem@davemloft.net>2018-05-17 22:41:30 +0300
commit10e361e1006eec2b4b369481ee1509c58004bda8 (patch)
treec543d69724d87f1128c00ce80843727902515fdf /net/ipv4/tcp_input.c
parent9611d6d6e24cd40ff887bdbb4dfe36a2ee88d488 (diff)
parent56f8c5d78f5746f48abf6d1f0bc2945dc59277ee (diff)
downloadlinux-10e361e1006eec2b4b369481ee1509c58004bda8.tar.xz
Merge branch 'tcp-default-RACK-loss-recovery'
Yuchung Cheng says: ==================== tcp: default RACK loss recovery This patch set implements the features correspond to the draft-ietf-tcpm-rack-03 version of the RACK draft. https://datatracker.ietf.org/meeting/101/materials/slides-101-tcpm-update-on-tcp-rack-00 1. SACK: implement equivalent DUPACK threshold heuristic in RACK to replace existing RFC6675 recovery (tcp_mark_head_lost). 2. Non-SACK: simplify RFC6582 NewReno implementation 3. RTO: apply RACK's time-based approach to avoid spuriouly marking very recently sent packets lost. 4. with (1)(2)(3), make RACK the exclusive fast recovery mechanism to mark losses based on time on S/ACK. Tail loss probe and F-RTO remain enabled by default as complementary mechanisms to send probes in CA_Open and CA_Loss states. The probes would solicit S/ACKs to trigger RACK time-based loss detection. All Google web and internal servers have been running RACK-only mode (4) for a while now. a/b experiments indicate RACK/TLP on average reduces recovery latency by 10% compared to RFC6675. RFC6675 is default-off now but can be enabled by disabling RACK (sysctl net.ipv4.tcp_recovery=0) for unseen issues. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c99
1 files changed, 53 insertions, 46 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b188e0d75edd..0bf032839548 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1917,19 +1917,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
tp->undo_retrans = tp->retrans_out ? : -1;
}
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+ return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb, *head;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ head = tcp_rtx_queue_head(sk);
+ is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ /* Mark SACK reneging until we recover from this loss event. */
+ tp->is_sack_reneg = 1;
+ } else if (tcp_is_reno(tp)) {
+ tcp_reset_reno_sack(tp);
+ }
+
+ skb = head;
+ skb_rbtree_walk_from(skb) {
+ if (is_reneg)
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ else if (tcp_is_rack(sk) && skb != head &&
+ tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ continue; /* Don't mark recently sent ones lost yet */
+ tcp_mark_skb_lost(sk, skb);
+ }
+ tcp_verify_left_out(tp);
+ tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
- bool is_reneg; /* is receiver reneging on SACKs? */
- bool mark_lost;
+
+ tcp_timeout_mark_lost(sk);
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1941,40 +1976,10 @@ void tcp_enter_loss(struct sock *sk)
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
- tp->snd_cwnd = 1;
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
- tp->retrans_out = 0;
- tp->lost_out = 0;
-
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- skb = tcp_rtx_queue_head(sk);
- is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
- if (is_reneg) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
- tp->sacked_out = 0;
- /* Mark SACK reneging until we recover from this loss event. */
- tp->is_sack_reneg = 1;
- }
- tcp_clear_all_retrans_hints(tp);
-
- skb_rbtree_walk_from(skb) {
- mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
- is_reneg);
- if (mark_lost)
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (mark_lost) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- }
- tcp_verify_left_out(tp);
-
/* Timeout in disordered state after receiving substantial DUPACKs
* suggests that the degree of reordering is over-estimated.
*/
@@ -2141,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
return true;
/* Not-A-Trick#2 : Classic rule... */
- if (tcp_dupack_heuristics(tp) > tp->reordering)
+ if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
return true;
return false;
@@ -2218,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1, 1);
- } else {
+ if (tcp_is_sack(tp)) {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2718,12 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
return false;
}
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- /* Use RACK to detect loss */
- if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ if (tcp_rtx_queue_empty(sk))
+ return;
+
+ if (unlikely(tcp_is_reno(tp))) {
+ tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+ } else if (tcp_is_rack(sk)) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk);
@@ -2819,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_try_keep_open(sk);
return;
}
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
@@ -2840,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- tcp_rack_identify_loss(sk, ack_flag);
+ tcp_identify_packet_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -2862,7 +2869,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
fast_rexmit = 1;
}
- if (do_lost)
+ if (!tcp_is_rack(sk) && do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}