mptcp: enforce HoL-blocking estimation

The MPTCP packet scheduler has sub-optimal behavior with asymmetric subflows: if the faster subflow-level cwin is closed, the packet scheduler can enqueue "too much" data on a slower subflow. When all the data on the faster subflow is acked, if the mptcp-level cwin is closed, and link utilization becomes suboptimal. The solution is implementing blest-like[1] HoL-blocking estimation, transmitting only on the subflow with the shorter estimated time to flush the queued memory. If such subflows cwin is closed, we wait even if other subflows are available. This is quite simpler than the original blest implementation, as we leverage the pacing rate provided by the TCP socket. To get a more accurate estimation for the subflow linger-time, we maintain a per-subflow weighted average of such info. Additionally drop magic numbers usage in favor of newly defined macros and use more meaningful names for status variable. [1] http://dl.ifip.org/db/conf/networking/networking2016/1570234725.pdf Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/137 Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
author: Paolo Abeni <pabeni@redhat.com> 2021-12-18 02:37:00 +0300
committer: Jakub Kicinski <kuba@kernel.org> 2021-12-18 06:27:04 +0300
commit: 3ce0852c86b926aed7bb8c69b09c5ad4ba0a9dfb (patch)
tree: eba6743dcbe8ca41c57282f32ffb8d332745f81c /net/mptcp
parent: ab9d0e2171be56bb3bcd0fe4bd8ae5d2f24e5a80 (diff)
download: linux-3ce0852c86b926aed7bb8c69b09c5ad4ba0a9dfb.tar.xz
2 files changed, 48 insertions, 25 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3e549f6190c0..df5a0cf431c1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1372,7 +1372,7 @@ out:
 
 struct subflow_send_info {
 	struct sock *ssk;
-	u64 ratio;
+	u64 linger_time;
 };
 
 void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
 	return __mptcp_subflow_active(subflow);
 }
 
+#define SSK_MODE_ACTIVE	0
+#define SSK_MODE_BACKUP	1
+#define SSK_MODE_MAX	2
+
 /* implement the mptcp packet scheduler;
  * returns the subflow that will transmit the next DSS
  * additionally updates the rtx timeout
  */
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
-	struct subflow_send_info send_info[2];
+	struct subflow_send_info send_info[SSK_MODE_MAX];
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
+	u32 pace, burst, wmem;
 	int i, nr_active = 0;
 	struct sock *ssk;
+	u64 linger_time;
 	long tout = 0;
-	u64 ratio;
-	u32 pace;
 
 	sock_owned_by_me(sk);
 
@@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	}
 
 	/* pick the subflow with the lower wmem/wspace ratio */
-	for (i = 0; i < 2; ++i) {
+	for (i = 0; i < SSK_MODE_MAX; ++i) {
 		send_info[i].ssk = NULL;
-		send_info[i].ratio = -1;
+		send_info[i].linger_time = -1;
 	}
+
 	mptcp_for_each_subflow(msk, subflow) {
 		trace_mptcp_subflow_get_send(subflow);
 		ssk =  mptcp_subflow_tcp_sock(subflow);
@@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 
 		tout = max(tout, mptcp_timeout_from_subflow(subflow));
 		nr_active += !subflow->backup;
-		if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
-			continue;
-
-		pace = READ_ONCE(ssk->sk_pacing_rate);
-		if (!pace)
-			continue;
+		pace = subflow->avg_pacing_rate;
+		if (unlikely(!pace)) {
+			/* init pacing rate from socket */
+			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+			pace = subflow->avg_pacing_rate;
+			if (!pace)
+				continue;
+		}
 
-		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
-				pace);
-		if (ratio < send_info[subflow->backup].ratio) {
+		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+		if (linger_time < send_info[subflow->backup].linger_time) {
 			send_info[subflow->backup].ssk = ssk;
-			send_info[subflow->backup].ratio = ratio;
+			send_info[subflow->backup].linger_time = linger_time;
 		}
 	}
 	__mptcp_set_timeout(sk, tout);
 
 	/* pick the best backup if no other subflow is active */
 	if (!nr_active)
-		send_info[0].ssk = send_info[1].ssk;
-
-	if (send_info[0].ssk) {
-		msk->last_snd = send_info[0].ssk;
-		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
-				       tcp_sk(msk->last_snd)->snd_wnd);
-		return msk->last_snd;
-	}
+		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+	/* According to the blest algorithm, to avoid HoL blocking for the
+	 * faster flow, we need to:
+	 * - estimate the faster flow linger time
+	 * - use the above to estimate the amount of byte transferred
+	 *   by the faster flow
+	 * - check that the amount of queued data is greter than the above,
+	 *   otherwise do not use the picked, slower, subflow
+	 * We select the subflow with the shorter estimated time to flush
+	 * the queued mem, which basically ensure the above. We just need
+	 * to check that subflow has a non empty cwin.
+	 */
+	ssk = send_info[SSK_MODE_ACTIVE].ssk;
+	if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+		return NULL;
 
-	return NULL;
+	burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+	wmem = READ_ONCE(ssk->sk_wmem_queued);
+	subflow = mptcp_subflow_ctx(ssk);
+	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+					   READ_ONCE(ssk->sk_pacing_rate) * burst,
+					   burst + wmem);
+	msk->last_snd = ssk;
+	msk->snd_burst = burst;
+	return ssk;
 }
 
 static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e1469155fb15..0486c9f5b38b 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
 /* MPTCP subflow context */
 struct mptcp_subflow_context {
 	struct	list_head node;/* conn_list of subflows */
+	unsigned long avg_pacing_rate; /* protected by msk socket lock */
 	u64	local_key;
 	u64	remote_key;
 	u64	idsn;
author	Paolo Abeni <pabeni@redhat.com>	2021-12-18 02:37:00 +0300
committer	Jakub Kicinski <kuba@kernel.org>	2021-12-18 06:27:04 +0300
commit	3ce0852c86b926aed7bb8c69b09c5ad4ba0a9dfb (patch)
tree	eba6743dcbe8ca41c57282f32ffb8d332745f81c /net/mptcp
parent	ab9d0e2171be56bb3bcd0fe4bd8ae5d2f24e5a80 (diff)
download	linux-3ce0852c86b926aed7bb8c69b09c5ad4ba0a9dfb.tar.xz