author		David S. Miller <davem@davemloft.net>	2017-10-06 07:24:48 +0300
committer	David S. Miller <davem@davemloft.net>	2017-10-06 07:24:48 +0300
commit		cec451ce60e50dba6d4136b7d1e62a5900cd264f
tree		cb70c1552a2c58cc5b8f63bd8e85b7fdc3b33497	/net/ipv4/tcp_recovery.c
parent		b1fb67fa501c4787035317f84db6caf013385581
parent		bef06223083b81d2064824afe2bc85be416ab73a
download	linux-cec451ce60e50dba6d4136b7d1e62a5900cd264f.tar.xz
Merge branch 'tcp-improving-RACK-cpu-performance'
Yuchung Cheng says:
====================
tcp: improving RACK cpu performance
This patch set reduces the CPU consumption of the RACK TCP loss
recovery algorithm, in particular on high-speed networks. Currently,
for every ACK in recovery, RACK can potentially iterate over all sent
packets in the write queue. On large-BDP networks with non-trivial
losses, the CPU usage of this write queue walk becomes unreasonably
high. This patch set introduces a new queue in TCP that keeps only the
skbs sent but not yet (s)acked or marked lost, in time order instead
of sequence order. With that, RACK can examine this time-sorted list
and, per ACK, check only the packets sent recently, within the
reordering window, without any write queue walk. The number of skbs
examined per ACK is reduced by orders of magnitude.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
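To make the time-sorted idea concrete, here is a minimal standalone sketch, not kernel code: all names (struct pkt, detect_losses, the sample numbers) are illustrative assumptions. Packets are kept oldest-first by send time, so the loss check can stop at the first packet still inside the rtt + reo_wnd window instead of walking every outstanding packet.

/*
 * Userspace model of a send-time-ordered loss scan (illustrative only).
 */
#include <stdint.h>
#include <stdio.h>

struct pkt {
	uint64_t sent_us;	/* send timestamp, microseconds */
	int	 lost;		/* set once the detector declares it lost */
};

static int detect_losses(struct pkt *pkts, int n, uint64_t now_us,
			 uint64_t rtt_us, uint64_t reo_wnd_us)
{
	int marked = 0;

	for (int i = 0; i < n; i++) {
		uint64_t elapsed = now_us - pkts[i].sent_us;

		/* Time order lets us stop early: every later packet is newer. */
		if (elapsed <= rtt_us + reo_wnd_us)
			break;
		pkts[i].lost = 1;
		marked++;
	}
	return marked;
}

int main(void)
{
	struct pkt pkts[] = {
		{ .sent_us = 1000 }, { .sent_us = 7000 }, { .sent_us = 9000 },
	};

	/* now = 10000us, rtt = 3000us, reo_wnd = 750us: only the first
	 * packet (9000us old) is beyond rtt + reo_wnd and gets marked.
	 */
	printf("%d packet(s) marked lost\n",
	       detect_losses(pkts, 3, 10000, 3000, 750));
	return 0;
}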
Diffstat (limited to 'net/ipv4/tcp_recovery.c')
-rw-r--r--	net/ipv4/tcp_recovery.c	52
1 file changed, 19 insertions(+), 33 deletions(-)
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 449cd914d58e..cda6074a429a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
@@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
-	tcp_for_write_queue(skb, sk) {
+	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		s32 remaining;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
-		/* Skip ones already (s)acked */
-		if (!after(scb->end_seq, tp->snd_una) ||
-		    scb->sacked & TCPCB_SACKED_ACKED)
+		/* Skip ones marked lost but not yet retransmitted */
+		if ((scb->sacked & TCPCB_LOST) &&
+		    !(scb->sacked & TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
-					tp->rack.end_seq, scb->end_seq)) {
-			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
-			 * A packet is lost if its elapsed time is beyond
-			 * the recent RTT plus the reordering window.
-			 */
-			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
-							 skb->skb_mstamp);
-			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
-			if (remaining < 0) {
-				tcp_rack_mark_skb_lost(sk, skb);
-				continue;
-			}
-
-			/* Skip ones marked lost but not yet retransmitted */
-			if ((scb->sacked & TCPCB_LOST) &&
-			    !(scb->sacked & TCPCB_SACKED_RETRANS))
-				continue;
+		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+					 tp->rack.end_seq, scb->end_seq))
+			break;
 
+		/* A packet is lost if it has not been s/acked beyond
+		 * the recent RTT plus the reordering window.
+		 */
+		remaining = tp->rack.rtt_us + reo_wnd -
+			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		if (remaining < 0) {
+			tcp_rack_mark_skb_lost(sk, skb);
+			list_del_init(&skb->tcp_tsorted_anchor);
+		} else {
 			/* Record maximum wait time (+1 to avoid 0) */
 			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
-		} else if (!(scb->sacked & TCPCB_RETRANS)) {
-			/* Original data are sent sequentially so stop early
-			 * b/c the rest are all sent after rack_sent
-			 */
-			break;
 		}
 	}
 }
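One detail worth noting in the new loop: it unlinks skbs from tp->tsorted_sent_queue with list_del_init() while iterating, which is why the patch switches to list_for_each_entry_safe() and adds the extra cursor 'n' that holds the next entry before the current one can be removed. The sketch below is a userspace analogue of that remove-during-walk pattern, using a hand-rolled sentinel list with illustrative names rather than the kernel's list.h.

#include <stdio.h>

struct node {
	int value;
	struct node *prev, *next;
};

/* Circular list with a sentinel head, similar in spirit to struct list_head. */
static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *head, struct node *n)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_del(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;	/* leave the node self-linked, like list_del_init() */
}

int main(void)
{
	struct node head, nodes[5];

	list_init(&head);
	for (int i = 0; i < 5; i++) {
		nodes[i].value = i;
		list_add_tail(&head, &nodes[i]);
	}

	/* "Safe" walk: remember the next node before possibly unlinking the
	 * current one; this is exactly the role of the extra cursor in
	 * list_for_each_entry_safe().
	 */
	for (struct node *cur = head.next, *nxt = cur->next;
	     cur != &head; cur = nxt, nxt = cur->next) {
		if (cur->value % 2 == 0)
			list_del(cur);		/* drop even values mid-walk */
	}

	for (struct node *cur = head.next; cur != &head; cur = cur->next)
		printf("%d ", cur->value);	/* prints: 1 3 */
	printf("\n");
	return 0;
}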