diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 08:47:44 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 08:47:44 +0300 |
commit | 50b825d7e87f4cff7070df6eb26390152bb29537 (patch) | |
tree | ec82aba49ab0c4743266ff37e18c8304a0367d06 /net/ipv4/tcp_bbr.c | |
parent | a97a2d4d56ea596871b739d63d41b084733bd9fb (diff) | |
parent | 3f80e08f40cdb308589a49077c87632fa4508b21 (diff) | |
download | linux-50b825d7e87f4cff7070df6eb26390152bb29537.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Add VF IPSEC offload support in ixgbe, from Shannon Nelson.
2) Add zero-copy AF_XDP support to i40e, from Björn Töpel.
3) All in-tree drivers are converted to {g,s}et_link_ksettings() so we
can get rid of the {g,s}et_settings ethtool callbacks, from Michal
Kubecek.
4) Add software timestamping to veth driver, from Michael Walle.
5) More work to make packet classifiers and actions lockless, from Vlad
Buslov.
6) Support sticky FDB entries in bridge, from Nikolay Aleksandrov.
7) Add ipv6 version of IP_MULTICAST_ALL sockopt, from Andre Naujoks.
8) Support batching of XDP buffers in vhost_net, from Jason Wang.
9) Add flow dissector BPF hook, from Petar Penkov.
10) i40e vf --> generic iavf conversion, from Jesse Brandeburg.
11) Add NLA_REJECT netlink attribute policy type, to signal when users
provide attributes in situations which don't make sense. From
Johannes Berg.
12) Switch TCP and fair-queue scheduler over to earliest departure time
model. From Eric Dumazet.
13) Improve guest receive performance by doing rx busy polling in tx
path of vhost networking driver, from Tonghao Zhang.
14) Add per-cgroup local storage to bpf
15) Add reference tracking to BPF, from Joe Stringer. The verifier can
now make sure that references taken to objects are properly released
by the program.
16) Support in-place encryption in TLS, from Vakul Garg.
17) Add new taprio packet scheduler, from Vinicius Costa Gomes.
18) Lots of selftests additions, too numerous to mention one by one here
but all of which are very much appreciated.
19) Support offloading of eBPF programs containing BPF to BPF calls in
nfp driver, frm Quentin Monnet.
20) Move dpaa2_ptp driver out of staging, from Yangbo Lu.
21) Lots of u32 classifier cleanups and simplifications, from Al Viro.
22) Add new strict versions of netlink message parsers, and enable them
for some situations. From David Ahern.
23) Evict neighbour entries on carrier down, also from David Ahern.
24) Support BPF sk_msg verdict programs with kTLS, from Daniel Borkmann
and John Fastabend.
25) Add support for filtering route dumps, from David Ahern.
26) New igc Intel driver for 2.5G parts, from Sasha Neftin et al.
27) Allow vxlan enslavement to bridges in mlxsw driver, from Ido
Schimmel.
28) Add queue and stack map types to eBPF, from Mauricio Vasquez B.
29) Add back byte-queue-limit support to r8169, with all the bug fixes
in other areas of the driver it works now! From Florian Westphal and
Heiner Kallweit.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2147 commits)
tcp: add tcp_reset_xmit_timer() helper
qed: Fix static checker warning
Revert "be2net: remove desc field from be_eq_obj"
Revert "net: simplify sock_poll_wait"
net: socionext: Reset tx queue in ndo_stop
net: socionext: Add dummy PHY register read in phy_write()
net: socionext: Stop PHY before resetting netsec
net: stmmac: Set OWN bit for jumbo frames
arm64: dts: stratix10: Support Ethernet Jumbo frame
tls: Add maintainers
net: ethernet: ti: cpsw: unsync mcast entries while switch promisc mode
octeontx2-af: Support for NIXLF's UCAST/PROMISC/ALLMULTI modes
octeontx2-af: Support for setting MAC address
octeontx2-af: Support for changing RSS algorithm
octeontx2-af: NIX Rx flowkey configuration for RSS
octeontx2-af: Install ucast and bcast pkt forwarding rules
octeontx2-af: Add LMAC channel info to NIXLF_ALLOC response
octeontx2-af: NPC MCAM and LDATA extract minimal configuration
octeontx2-af: Enable packet length and csum validation
octeontx2-af: Support for VTAG strip and capture
...
Diffstat (limited to 'net/ipv4/tcp_bbr.c')
-rw-r--r-- | net/ipv4/tcp_bbr.c | 90 |
1 files changed, 72 insertions, 18 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 02ff2dde9609..9277abdd822a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +static const int bbr_pacing_margin_percent = 1; + /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting @@ -208,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { unsigned int mss = tcp_sk(sk)->mss_cache; - if (!tcp_needs_internal_pacing(sk)) - mss = tcp_mss_to_mtu(sk, mss); rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC; + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); return rate >> BW_SCALE; } /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) { u64 rate = bw; @@ -257,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); @@ -279,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); @@ -368,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -459,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain == BBR_UNIT) return is_full_length; /* just use wall clock time */ - inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); bw = bbr_max_bw(sk); /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at @@ -487,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk) bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); bbr->cycle_mstamp = tp->delivered_mstamp; - bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : - bbr_pacing_gain[bbr->cycle_idx]; } /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ @@ -506,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_STARTUP; - bbr->pacing_gain = bbr_high_gain; - bbr->cwnd_gain = bbr_high_gain; } static void bbr_reset_probe_bw_mode(struct sock *sk) @@ -515,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW; - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = bbr_cwnd_gain; bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ } @@ -734,13 +762,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ - bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ - bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ tcp_sk(sk)->snd_ssthresh = bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && - tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -797,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr_probe_rtt_mode_ms > 0 && filter_expired && !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = BBR_UNIT; bbr_save_cwnd(sk); /* note cwnd so we can restore it */ bbr->probe_rtt_done_stamp = 0; } @@ -826,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bbr->idle_restart = 0; } +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = (bbr->lt_use_bw ? + BBR_UNIT : + bbr_pacing_gain[bbr->cycle_idx]); + bbr->cwnd_gain = bbr_cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); @@ -833,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); bbr_update_min_rtt(sk, rs); + bbr_update_gains(sk); } static void bbr_main(struct sock *sk, const struct rate_sample *rs) |