Diffstat (limited to 'net/ipv4/tcp_bbr.c')
-rw-r--r-- | net/ipv4/tcp_bbr.c | 92
1 file changed, 60 insertions(+), 32 deletions(-)
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index b89bce4c721e..69ee877574d0 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -52,10 +52,9 @@
  * There is a public e-mail list for discussing BBR development and testing:
  *	https://groups.google.com/forum/#!forum/bbr-dev
  *
- * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
- * since pacing is integral to the BBR design and implementation.
- * BBR without pacing would not function properly, and may incur unnecessary
- * high packet loss rates.
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
  */
 #include <linux/module.h>
 #include <net/tcp.h>
@@ -92,7 +91,7 @@ struct bbr {
 	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
 	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
 	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
-	struct skb_mstamp cycle_mstamp;  /* time of this cycle phase start */
+	u64	cycle_mstamp;	     /* time of this cycle phase start */
 	u32     mode:3,		     /* current bbr_mode in state machine */
 		prev_ca_state:3,     /* CA state on previous ACK */
 		packet_conservation:1,  /* use packet conservation? */
@@ -113,7 +112,8 @@ struct bbr {
 		cwnd_gain:10,	/* current gain for setting cwnd */
 		full_bw_cnt:3,	/* number of rounds without large bw gains */
 		cycle_idx:3,	/* current index in pacing_gain cycle array */
-		unused_b:6;
+		has_seen_rtt:1, /* have we seen an RTT sample yet? */
+		unused_b:5;
 	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
 	u32	full_bw;	/* recent bw, to estimate if pipe is full */
 };
@@ -212,6 +212,35 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 	return rate >> BW_SCALE;
 }
 
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	return rate;
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+	u32 rtt_us;
+
+	if (tp->srtt_us) {		/* any RTT sample yet? */
+		rtt_us = max(tp->srtt_us >> 3, 1U);
+		bbr->has_seen_rtt = 1;
+	} else {			 /* no RTT sample yet */
+		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
+	}
+	bw = (u64)tp->snd_cwnd * BW_UNIT;
+	do_div(bw, rtt_us);
+	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+}
+
 /* Pace using current bw estimate and a gain factor. In order to help drive the
  * network toward lower queues while maintaining high utilization and low
  * latency, the average pacing rate aims to be slightly (~1%) lower than the
@@ -221,12 +250,13 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
  */
 static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u64 rate = bw;
+	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
-	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
-	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
-	if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+		bbr_init_pacing_rate_from_rtt(sk);
+	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
 		sk->sk_pacing_rate = rate;
 }
 
@@ -412,7 +442,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 	bool is_full_length =
-		skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
 		bbr->min_rtt_us;
 	u32 inflight, bw;
 
@@ -498,7 +528,7 @@ static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
-	bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
 	bbr->lt_last_delivered = tp->delivered;
 	bbr->lt_last_lost = tp->lost;
 	bbr->lt_rtt_cnt = 0;
@@ -552,7 +582,7 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
 	struct bbr *bbr = inet_csk_ca(sk);
 	u32 lost, delivered;
 	u64 bw;
-	s32 t;
+	u32 t;
 
 	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
 		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
@@ -604,15 +634,15 @@ static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
 		return;
 
 	/* Find average delivery rate in this sampling interval. */
-	t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
-	if (t < 1)
-		return;		/* interval is less than one jiffy, so wait */
-	t = jiffies_to_usecs(t);
-	/* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
-	if (t < 1) {
+	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
+	if ((s32)t < 1)
+		return;		/* interval is less than one ms, so wait */
+	/* Check if can multiply without overflow */
+	if (t >= ~0U / USEC_PER_MSEC) {
 		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
 		return;
 	}
+	t *= USEC_PER_MSEC;
 	bw = (u64)delivered * BW_UNIT;
 	do_div(bw, t);
 	bbr_lt_bw_interval_done(sk, bw);
@@ -731,12 +761,12 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 	bool filter_expired;
 
 	/* Track min RTT seen in the min_rtt_win_sec filter window: */
-	filter_expired = after(tcp_time_stamp,
+	filter_expired = after(tcp_jiffies32,
 			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
 	if (rs->rtt_us >= 0 &&
 	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
 		bbr->min_rtt_us = rs->rtt_us;
-		bbr->min_rtt_stamp = tcp_time_stamp;
+		bbr->min_rtt_stamp = tcp_jiffies32;
 	}
 
 	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
@@ -755,7 +785,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 		/* Maintain min packets in flight for max(200 ms, 1 round). */
 		if (!bbr->probe_rtt_done_stamp &&
 		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
-			bbr->probe_rtt_done_stamp = tcp_time_stamp +
+			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
 				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
 			bbr->probe_rtt_round_done = 0;
 			bbr->next_rtt_delivered = tp->delivered;
@@ -763,8 +793,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 			if (bbr->round_start)
 				bbr->probe_rtt_round_done = 1;
 			if (bbr->probe_rtt_round_done &&
-			    after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
-				bbr->min_rtt_stamp = tcp_time_stamp;
+			    after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) {
+				bbr->min_rtt_stamp = tcp_jiffies32;
 				bbr->restore_cwnd = 1;  /* snap to prior_cwnd */
 				bbr_reset_mode(sk);
 			}
@@ -799,7 +829,6 @@ static void bbr_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u64 bw;
 
 	bbr->prior_cwnd = 0;
 	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
@@ -811,25 +840,24 @@ static void bbr_init(struct sock *sk)
 	bbr->probe_rtt_done_stamp = 0;
 	bbr->probe_rtt_round_done = 0;
 	bbr->min_rtt_us = tcp_min_rtt(tp);
-	bbr->min_rtt_stamp = tcp_time_stamp;
+	bbr->min_rtt_stamp = tcp_jiffies32;
 
 	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
 
-	/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
-	bw = (u64)tp->snd_cwnd * BW_UNIT;
-	do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
-	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
-	bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+	bbr->has_seen_rtt = 0;
+	bbr_init_pacing_rate_from_rtt(sk);
 
 	bbr->restore_cwnd = 0;
 	bbr->round_start = 0;
 	bbr->idle_restart = 0;
 	bbr->full_bw = 0;
 	bbr->full_bw_cnt = 0;
-	bbr->cycle_mstamp.v64 = 0;
+	bbr->cycle_mstamp = 0;
 	bbr->cycle_idx = 0;
 	bbr_reset_lt_bw_sampling(sk);
 	bbr_reset_startup_mode(sk);
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
 
 static u32 bbr_sndbuf_expand(struct sock *sk)
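Note on the new initialization path: bbr_init_pacing_rate_from_rtt() sets the initial pacing rate to high_gain * init_cwnd / RTT, falling back to a nominal 1 ms RTT when no srtt_us sample exists yet. The standalone C sketch below reproduces that arithmetic in userspace so the fixed-point gain handling is easier to follow; the BBR_SCALE/BBR_UNIT constants and the ~2.89 high gain mirror the BBR sources, while the function name and the cwnd, MSS, and RTT inputs are invented example values, not part of the patch.

/* Sketch of the initial pacing-rate math, not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define BBR_SCALE 8			/* gains are Q8 fixed point */
#define BBR_UNIT (1 << BBR_SCALE)

/* ~2/ln(2), the gain BBR uses to fill the pipe during STARTUP */
static const uint32_t high_gain = BBR_UNIT * 2885 / 1000 + 1;

/* rate = high_gain * cwnd_pkts * mss_bytes / rtt, in bytes per second */
static uint64_t init_pacing_rate(uint32_t cwnd_pkts, uint32_t mss_bytes,
				 uint32_t rtt_us)
{
	uint64_t rate;

	if (!rtt_us)			/* no RTT sample yet? */
		rtt_us = 1000;		/* nominal 1 ms default, as in the patch */
	rate = (uint64_t)cwnd_pkts * mss_bytes * 1000000;	/* bytes * (usec/sec) */
	rate = rate * high_gain >> BBR_SCALE;			/* apply Q8 gain */
	return rate / rtt_us;					/* divide by RTT in usec */
}

int main(void)
{
	/* e.g. 10-packet initial cwnd, 1448-byte MSS, 20 ms smoothed RTT */
	printf("%llu bytes/sec\n",
	       (unsigned long long)init_pacing_rate(10, 1448, 20 * 1000));
	return 0;
}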
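The bbr_lt_bw_sampling() hunk replaces the jiffies-based interval with a millisecond one and multiplies back to microseconds only after checking t against ~0U / USEC_PER_MSEC, so the u32 multiply cannot wrap. A small self-contained sketch of that guard (the helper name and the example values are made up for illustration):

/* Sketch of the ms-to-us overflow guard; only USEC_PER_MSEC comes from the patch. */
#include <stdint.h>
#include <stdio.h>

#define USEC_PER_MSEC 1000U

/* Convert a millisecond interval to microseconds in 32 bits, or report
 * failure (where the patch instead resets long-term bw sampling).
 */
static int interval_ms_to_us(uint32_t t_ms, uint32_t *t_us)
{
	if ((int32_t)t_ms < 1)
		return -1;	/* less than one ms elapsed: wait */
	if (t_ms >= ~0U / USEC_PER_MSEC)
		return -1;	/* t_ms * 1000 would overflow u32 */
	*t_us = t_ms * USEC_PER_MSEC;
	return 0;
}

int main(void)
{
	uint32_t us;

	if (interval_ms_to_us(5000, &us) == 0)		/* 5 s interval: fits */
		printf("5000 ms -> %u us\n", us);
	if (interval_ms_to_us(5000000, &us) != 0)	/* ~83 min: rejected */
		printf("5000000 ms rejected by overflow guard\n");
	return 0;
}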