author     Paolo Abeni <pabeni@redhat.com>       2020-11-27 13:10:23 +0300
committer  Jakub Kicinski <kuba@kernel.org>      2020-12-01 04:55:23 +0300
commit     e93da92896bc0ddc26e88bbc09e7e39b84366a38 (patch)
tree       e6620b22e75c17f5510dda10982e2a234ebe88bc /net/mptcp
parent     ad80b0fc6e7f56bb1b09af86749ff3014477cfe6 (diff)
download   linux-e93da92896bc0ddc26e88bbc09e7e39b84366a38.tar.xz
mptcp: implement wmem reservation
This leverages the previous commit to reserve the wmem
required for the sendmsg() operation when the msk socket
lock is first acquired.
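
For context, a minimal sketch of the lock-with-callback pattern this relies
on (illustrative only, not runnable outside a kernel tree;
mptcp_lock_sock_model is a hypothetical name, and the real mptcp_lock_sock
macro from the parent commit also handles lock contention and lockdep
annotations):

	/* run a reservation callback while the socket spinlock is held,
	 * before lock ownership is taken, so it can safely adjust
	 * sk_forward_alloc without racing the softirq paths
	 */
	#define mptcp_lock_sock_model(sk, reserve_cb) do {		\
		spin_lock_bh(&(sk)->sk_lock.slock);			\
		reserve_cb;	/* e.g. __mptcp_wmem_reserve(sk, len) */\
		(sk)->sk_lock.owned = 1;				\
		spin_unlock_bh(&(sk)->sk_lock.slock);			\
	} while (0)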
Some heuristics are used to get a reasonable [over] estimate of
the whole memory required. If we can't forward alloc that amount,
fall back to a reasonably small chunk; if even that fails, enter
the wait-for-memory path.
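
To make the over-estimate concrete, a standalone arithmetic sketch (assumed
values: 4 KiB pages and a 48-byte struct mptcp_data_frag; the real sizes
depend on kernel version and architecture):

	#include <stdio.h>

	#define MODEL_PAGE_SHIFT 12	/* assumed: 4 KiB pages */

	int main(void)
	{
		int size = 64 * 1024;	/* hypothetical sendmsg() length */
		int dfrag_sz = 48;	/* assumed dfrag descriptor size */
		/* one dfrag descriptor may be needed per page of payload */
		int amount = size + ((dfrag_sz * size) >> MODEL_PAGE_SHIFT);

		printf("payload %d + metadata %d -> reserve %d bytes\n",
		       size, amount - size, amount);
		return 0;
	}

With these assumptions the reservation for a 64 KiB send comes out to
65536 + 768 = 66304 bytes.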
When sendmsg() needs more memory it looks at wmem_reserved
first and, if that is exhausted, moves more space from
sk_forward_alloc.
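
A minimal userspace model of that allocation order (hypothetical names;
sk_wmem_schedule() is reduced to a plain budget check):

	#include <stdbool.h>

	struct wmem_model {
		int wmem_reserved;	/* reserved at lock time; -1 on error */
		int forward_alloc;	/* stand-in for sk_forward_alloc */
	};

	static bool wmem_model_alloc(struct wmem_model *w, int size)
	{
		if (w->wmem_reserved < 0)	/* earlier reservation failed */
			return false;
		if (w->wmem_reserved < size) {
			if (w->forward_alloc < size)	/* ~sk_wmem_schedule() */
				return false;
			/* top up the reservation from the forward allocation */
			w->forward_alloc -= size;
			w->wmem_reserved += size;
		}
		w->wmem_reserved -= size;	/* charge the request */
		return true;
	}

Starting from, say, {.wmem_reserved = 66304, .forward_alloc = 0}, repeated
allocations succeed until the reservation is drained, after which the call
falls through to the budget check.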
The reserved memory is not persistent and is released by
release_cb() at the next socket unlock.
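
The companion unlock-side sketch, reusing the model struct above (again
hypothetical names): any leftover reservation, or the negative error marker,
is folded back exactly once, mirroring what __mptcp_update_wmem() does from
mptcp_release_cb():

	static void wmem_model_release(struct wmem_model *w)
	{
		if (w->wmem_reserved > 0)	/* return the unused part */
			w->forward_alloc += w->wmem_reserved;
		w->wmem_reserved = 0;		/* also clears the -1 marker */
	}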
Overall this will simplify the next patch.
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/mptcp')
-rw-r--r--  net/mptcp/protocol.c | 92
-rw-r--r--  net/mptcp/protocol.h |  1
2 files changed, 86 insertions(+), 7 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 16e9cb1c79cc..07fe484eefd1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -873,6 +873,81 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
 		df->data_seq + df->data_len == msk->write_seq;
 }
 
+static int mptcp_wmem_with_overhead(int size)
+{
+	return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
+}
+
+static void __mptcp_wmem_reserve(struct sock *sk, int size)
+{
+	int amount = mptcp_wmem_with_overhead(size);
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	WARN_ON_ONCE(msk->wmem_reserved);
+	if (amount <= sk->sk_forward_alloc)
+		goto reserve;
+
+	/* under memory pressure try to reserve at most a single page
+	 * otherwise try to reserve the full estimate and fallback
+	 * to a single page before entering the error path
+	 */
+	if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
+	    !sk_wmem_schedule(sk, amount)) {
+		if (amount <= PAGE_SIZE)
+			goto nomem;
+
+		amount = PAGE_SIZE;
+		if (!sk_wmem_schedule(sk, amount))
+			goto nomem;
+	}
+
+reserve:
+	msk->wmem_reserved = amount;
+	sk->sk_forward_alloc -= amount;
+	return;
+
+nomem:
+	/* we will wait for memory on next allocation */
+	msk->wmem_reserved = -1;
+}
+
+static void __mptcp_update_wmem(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (!msk->wmem_reserved)
+		return;
+
+	if (msk->wmem_reserved < 0)
+		msk->wmem_reserved = 0;
+	if (msk->wmem_reserved > 0) {
+		sk->sk_forward_alloc += msk->wmem_reserved;
+		msk->wmem_reserved = 0;
+	}
+}
+
+static bool mptcp_wmem_alloc(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	/* check for pre-existing error condition */
+	if (msk->wmem_reserved < 0)
+		return false;
+
+	if (msk->wmem_reserved >= size)
+		goto account;
+
+	if (!sk_wmem_schedule(sk, size))
+		return false;
+
+	sk->sk_forward_alloc -= size;
+	msk->wmem_reserved += size;
+
+account:
+	msk->wmem_reserved -= size;
+	return true;
+}
+
 static void dfrag_uncharge(struct sock *sk, int len)
 {
 	sk_mem_uncharge(sk, len);
@@ -930,7 +1005,7 @@ static void mptcp_clean_una(struct sock *sk)
 	}
 
 out:
-	if (cleaned)
+	if (cleaned && tcp_under_memory_pressure(sk))
 		sk_mem_reclaim_partial(sk);
 }
 
@@ -1307,7 +1382,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
 		return -EOPNOTSUPP;
 
-	lock_sock(sk);
+	mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len));
 
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
@@ -1356,11 +1431,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		offset = dfrag->offset + dfrag->data_len;
 		psize = pfrag->size - offset;
 		psize = min_t(size_t, psize, msg_data_left(msg));
-		if (!sk_wmem_schedule(sk, psize + frag_truesize))
+		if (!mptcp_wmem_alloc(sk, psize + frag_truesize))
 			goto wait_for_memory;
 
 		if (copy_page_from_iter(dfrag->page, offset, psize,
					&msg->msg_iter) != psize) {
+			msk->wmem_reserved += psize + frag_truesize;
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1376,7 +1452,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		 * Note: we charge such data both to sk and ssk
 		 */
 		sk_wmem_queued_add(sk, frag_truesize);
-		sk->sk_forward_alloc -= frag_truesize;
 		if (!dfrag_collapsed) {
 			get_page(dfrag->page);
 			list_add_tail(&dfrag->list, &msk->rtx_queue);
@@ -2003,6 +2078,7 @@ static int __mptcp_init_sock(struct sock *sk)
 	INIT_WORK(&msk->work, mptcp_worker);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
+	msk->wmem_reserved = 0;
 	msk->ack_hint = NULL;
 	msk->first = NULL;
@@ -2197,6 +2273,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	sk->sk_prot->destroy(sk);
 
+	WARN_ON_ONCE(msk->wmem_reserved);
 	sk_stream_kill_queues(sk);
 	xfrm_sk_free_policy(sk);
 	sk_refcnt_debug_release(sk);
@@ -2542,13 +2619,14 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
 
 #define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
 
-/* this is very alike tcp_release_cb() but we must handle differently a
- * different set of events
- */
+/* processes deferred events and flush wmem */
 static void mptcp_release_cb(struct sock *sk)
 {
 	unsigned long flags, nflags;
 
+	/* clear any wmem reservation and errors */
+	__mptcp_update_wmem(sk);
+
 	do {
 		flags = sk->sk_tsq_flags;
 		if (!(flags & MPTCP_DEFERRED_ALL))
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 6abac8238de3..4cf355076e35 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -218,6 +218,7 @@ struct mptcp_sock {
 	u64	ack_seq;
 	u64	rcv_wnd_sent;
 	u64	rcv_data_fin_seq;
+	int	wmem_reserved;
 	struct sock	*last_snd;
 	int		snd_burst;
 	int		old_wspace;
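
One subtlety in the mptcp_sendmsg() hunk above: on a copy_page_from_iter()
failure, memory already charged via mptcp_wmem_alloc() is pushed back into
the reservation rather than directly into sk_forward_alloc, so the unlock
path returns it in one place. In terms of the userspace model sketched
earlier (hypothetical name):

	/* model of the -EFAULT path: return the charge to the reservation,
	 * to be folded back into forward_alloc by wmem_model_release()
	 */
	static void wmem_model_uncharge(struct wmem_model *w, int size)
	{
		w->wmem_reserved += size;
	}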