Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking updates from David Miller: 1) Add VF IPSEC offload support in ixgbe, from Shannon Nelson. 2) Add zero-copy AF_XDP support to i40e, from Björn Töpel. 3) All in-tree drivers are converted to {g,s}et_link_ksettings() so we can get rid of the {g,s}et_settings ethtool callbacks, from Michal Kubecek. 4) Add software timestamping to veth driver, from Michael Walle. 5) More work to make packet classifiers and actions lockless, from Vlad Buslov. 6) Support sticky FDB entries in bridge, from Nikolay Aleksandrov. 7) Add ipv6 version of IP_MULTICAST_ALL sockopt, from Andre Naujoks. 8) Support batching of XDP buffers in vhost_net, from Jason Wang. 9) Add flow dissector BPF hook, from Petar Penkov. 10) i40e vf --> generic iavf conversion, from Jesse Brandeburg. 11) Add NLA_REJECT netlink attribute policy type, to signal when users provide attributes in situations which don't make sense. From Johannes Berg. 12) Switch TCP and fair-queue scheduler over to earliest departure time model. From Eric Dumazet. 13) Improve guest receive performance by doing rx busy polling in tx path of vhost networking driver, from Tonghao Zhang. 14) Add per-cgroup local storage to bpf 15) Add reference tracking to BPF, from Joe Stringer. The verifier can now make sure that references taken to objects are properly released by the program. 16) Support in-place encryption in TLS, from Vakul Garg. 17) Add new taprio packet scheduler, from Vinicius Costa Gomes. 18) Lots of selftests additions, too numerous to mention one by one here but all of which are very much appreciated. 19) Support offloading of eBPF programs containing BPF to BPF calls in nfp driver, frm Quentin Monnet. 20) Move dpaa2_ptp driver out of staging, from Yangbo Lu. 21) Lots of u32 classifier cleanups and simplifications, from Al Viro. 22) Add new strict versions of netlink message parsers, and enable them for some situations. From David Ahern. 23) Evict neighbour entries on carrier down, also from David Ahern. 24) Support BPF sk_msg verdict programs with kTLS, from Daniel Borkmann and John Fastabend. 25) Add support for filtering route dumps, from David Ahern. 26) New igc Intel driver for 2.5G parts, from Sasha Neftin et al. 27) Allow vxlan enslavement to bridges in mlxsw driver, from Ido Schimmel. 28) Add queue and stack map types to eBPF, from Mauricio Vasquez B. 29) Add back byte-queue-limit support to r8169, with all the bug fixes in other areas of the driver it works now! From Florian Westphal and Heiner Kallweit. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2147 commits) tcp: add tcp_reset_xmit_timer() helper qed: Fix static checker warning Revert "be2net: remove desc field from be_eq_obj" Revert "net: simplify sock_poll_wait" net: socionext: Reset tx queue in ndo_stop net: socionext: Add dummy PHY register read in phy_write() net: socionext: Stop PHY before resetting netsec net: stmmac: Set OWN bit for jumbo frames arm64: dts: stratix10: Support Ethernet Jumbo frame tls: Add maintainers net: ethernet: ti: cpsw: unsync mcast entries while switch promisc mode octeontx2-af: Support for NIXLF's UCAST/PROMISC/ALLMULTI modes octeontx2-af: Support for setting MAC address octeontx2-af: Support for changing RSS algorithm octeontx2-af: NIX Rx flowkey configuration for RSS octeontx2-af: Install ucast and bcast pkt forwarding rules octeontx2-af: Add LMAC channel info to NIXLF_ALLOC response octeontx2-af: NPC MCAM and LDATA extract minimal configuration octeontx2-af: Enable packet length and csum validation octeontx2-af: Support for VTAG strip and capture ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-10-24 08:47:44 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-10-24 08:47:44 +0300
commit: 50b825d7e87f4cff7070df6eb26390152bb29537 (patch)
tree: ec82aba49ab0c4743266ff37e18c8304a0367d06 /net/ipv4/tcp_bbr.c
parent: a97a2d4d56ea596871b739d63d41b084733bd9fb (diff)
parent: 3f80e08f40cdb308589a49077c87632fa4508b21 (diff)
download: linux-50b825d7e87f4cff7070df6eb26390152bb29537.tar.xz
1 files changed, 72 insertions, 18 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 02ff2dde9609..9277abdd822a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
 /* Skip TSO below the following bandwidth (bits/sec): */
 static const int bbr_min_tso_rate = 1200000;
 
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+static const int bbr_pacing_margin_percent = 1;
+
 /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
@@ -208,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
 {
 	unsigned int mss = tcp_sk(sk)->mss_cache;
 
-	if (!tcp_needs_internal_pacing(sk))
-		mss = tcp_mss_to_mtu(sk, mss);
 	rate *= mss;
 	rate *= gain;
 	rate >>= BBR_SCALE;
-	rate *= USEC_PER_SEC;
+	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
 	return rate >> BW_SCALE;
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	u64 rate = bw;
 
@@ -257,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
 	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
 		bbr_init_pacing_rate_from_rtt(sk);
@@ -279,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
 	/* Sort of tcp_tso_autosize() but ignoring
 	 * driver provided sk_gso_max_size.
 	 */
-	bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+	bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
 		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
 	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
@@ -368,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	return cwnd;
 }
 
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 now_ns, edt_ns, interval_us;
+	u32 interval_delivered, inflight_at_edt;
+
+	now_ns = tp->tcp_clock_cache;
+	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+	inflight_at_edt = inflight_now;
+	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
+	if (interval_delivered >= inflight_at_edt)
+		return 0;
+	return inflight_at_edt - interval_delivered;
+}
+
 /* An optimization in BBR to reduce losses: On the first round of recovery, we
  * follow the packet conservation principle: send P packets per P packets acked.
  * After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -459,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
 	if (bbr->pacing_gain == BBR_UNIT)
 		return is_full_length;		/* just use wall clock time */
 
-	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
 	bw = bbr_max_bw(sk);
 
 	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -487,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
 
 	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
 	bbr->cycle_mstamp = tp->delivered_mstamp;
-	bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
-					    bbr_pacing_gain[bbr->cycle_idx];
 }
 
 /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -506,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->mode = BBR_STARTUP;
-	bbr->pacing_gain = bbr_high_gain;
-	bbr->cwnd_gain	 = bbr_high_gain;
 }
 
 static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -515,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->mode = BBR_PROBE_BW;
-	bbr->pacing_gain = BBR_UNIT;
-	bbr->cwnd_gain = bbr_cwnd_gain;
 	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
 	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
 }
@@ -734,13 +762,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 
 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
-		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
 		tcp_sk(sk)->snd_ssthresh =
 				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
-	    tcp_packets_in_flight(tcp_sk(sk)) <=
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
 	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
 		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
 }
@@ -797,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
 	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
 		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
-		bbr->pacing_gain = BBR_UNIT;
-		bbr->cwnd_gain = BBR_UNIT;
 		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
 		bbr->probe_rtt_done_stamp = 0;
 	}
@@ -826,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 		bbr->idle_restart = 0;
 }
 
+static void bbr_update_gains(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		bbr->pacing_gain = bbr_high_gain;
+		bbr->cwnd_gain	 = bbr_high_gain;
+		break;
+	case BBR_DRAIN:
+		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
+		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
+		break;
+	case BBR_PROBE_BW:
+		bbr->pacing_gain = (bbr->lt_use_bw ?
+				    BBR_UNIT :
+				    bbr_pacing_gain[bbr->cycle_idx]);
+		bbr->cwnd_gain	 = bbr_cwnd_gain;
+		break;
+	case BBR_PROBE_RTT:
+		bbr->pacing_gain = BBR_UNIT;
+		bbr->cwnd_gain	 = BBR_UNIT;
+		break;
+	default:
+		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+		break;
+	}
+}
+
 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 {
 	bbr_update_bw(sk, rs);
@@ -833,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 	bbr_check_full_bw_reached(sk, rs);
 	bbr_check_drain(sk, rs);
 	bbr_update_min_rtt(sk, rs);
+	bbr_update_gains(sk);
 }
 
 static void bbr_main(struct sock *sk, const struct rate_sample *rs)
author	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-24 08:47:44 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-24 08:47:44 +0300
commit	50b825d7e87f4cff7070df6eb26390152bb29537 (patch)
tree	ec82aba49ab0c4743266ff37e18c8304a0367d06 /net/ipv4/tcp_bbr.c
parent	a97a2d4d56ea596871b739d63d41b084733bd9fb (diff)
parent	3f80e08f40cdb308589a49077c87632fa4508b21 (diff)
download	linux-50b825d7e87f4cff7070df6eb26390152bb29537.tar.xz