Merge tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next

Pull networking updates from Jakub Kicinski: "Core: - support "prefer busy polling" NAPI operation mode, where we defer softirq for some time expecting applications to periodically busy poll - AF_XDP: improve efficiency by more batching and hindering the adjacency cache prefetcher - af_packet: make packet_fanout.arr size configurable up to 64K - tcp: optimize TCP zero copy receive in presence of partial or unaligned reads making zero copy a performance win for much smaller messages - XDP: add bulk APIs for returning / freeing frames - sched: support fragmenting IP packets as they come out of conntrack - net: allow virtual netdevs to forward UDP L4 and fraglist GSO skbs BPF: - BPF switch from crude rlimit-based to memcg-based memory accounting - BPF type format information for kernel modules and related tracing enhancements - BPF implement task local storage for BPF LSM - allow the FENTRY/FEXIT/RAW_TP tracing programs to use bpf_sk_storage Protocols: - mptcp: improve multiple xmit streams support, memory accounting and many smaller improvements - TLS: support CHACHA20-POLY1305 cipher - seg6: add support for SRv6 End.DT4/DT6 behavior - sctp: Implement RFC 6951: UDP Encapsulation of SCTP - ppp_generic: add ability to bridge channels directly - bridge: Connectivity Fault Management (CFM) support as is defined in IEEE 802.1Q section 12.14. 
Drivers: - mlx5: make use of the new auxiliary bus to organize the driver internals - mlx5: more accurate port TX timestamping support - mlxsw: - improve the efficiency of offloaded next hop updates by using the new nexthop object API - support blackhole nexthops - support IEEE 802.1ad (Q-in-Q) bridging - rtw88: major bluetooth co-existence improvements - iwlwifi: support new 6 GHz frequency band - ath11k: Fast Initial Link Setup (FILS) - mt7915: dual band concurrent (DBDC) support - net: ipa: add basic support for IPA v4.5 Refactor: - a few pieces of in_interrupt() cleanup work from Sebastian Andrzej Siewior - phy: add support for shared interrupts; get rid of multiple driver APIs and have the drivers write a full IRQ handler, slight growth of driver code should be compensated by the simpler API which also allows shared IRQs - add common code for handling netdev per-cpu counters - move TX packet re-allocation from Ethernet switch tag drivers to a central place - improve efficiency and rename nla_strlcpy - number of W=1 warning cleanups as we now catch those in a patchwork build bot Old code removal: - wan: delete the DLCI / SDLA drivers - wimax: move to staging - wifi: remove old WDS wifi bridging support" * tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1922 commits) net: hns3: fix expression that is currently always true net: fix proc_fs init handling in af_packet and tls nfc: pn533: convert comma to semicolon af_vsock: Assign the vsock transport considering the vsock address flags af_vsock: Set VMADDR_FLAG_TO_HOST flag on the receive path vsock_addr: Check for supported flag values vm_sockets: Add VMADDR_FLAG_TO_HOST vsock flag vm_sockets: Add flags field in the vsock address data structure net: Disable NETIF_F_HW_TLS_TX when HW_CSUM is disabled tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit net: mscc: ocelot: install MAC addresses in .ndo_set_rx_mode from process context nfc: s3fwrn5: Release the nfc 
firmware net: vxget: clean up sparse warnings mlxsw: spectrum_router: Use eXtended mezzanine to offload IPv4 router mlxsw: spectrum: Set KVH XLT cache mode for Spectrum2/3 mlxsw: spectrum_router_xm: Introduce basic XM cache flushing mlxsw: reg: Add Router LPM Cache Enable Register mlxsw: reg: Add Router LPM Cache ML Delete Register mlxsw: spectrum_router_xm: Implement L-value tracking for M-index mlxsw: reg: Add XM Router M Table Register ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2020-12-16 00:22:29 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2020-12-16 00:22:29 +0300
commit: d635a69dd4981cc51f90293f5f64268620ed1565 (patch)
tree: 5e0a758b402ea7d624c25c3a343545dd29e80f31 /net/mptcp/subflow.c
parent: ac73e3dc8acd0a3be292755db30388c3580f5674 (diff)
parent: efd5a1584537698220578227e6467638307c2a0b (diff)
download: linux-d635a69dd4981cc51f90293f5f64268620ed1565.tar.xz
1 files changed, 119 insertions, 46 deletions
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index c21a852a6ffa..278cbe3e539e 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li
 	return 0;
 }
 
-static void subflow_init_req(struct request_sock *req,
-			     const struct sock *sk_listener,
-			     struct sk_buff *skb)
+/* Init mptcp request socket.
+ *
+ * Returns an error code if a JOIN has failed and a TCP reset
+ * should be sent.
+ */
+static int subflow_init_req(struct request_sock *req,
+			    const struct sock *sk_listener,
+			    struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
@@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req,
 
 	ret = __subflow_init_req(req, sk_listener);
 	if (ret)
-		return;
+		return 0;
 
 	mptcp_get_options(skb, &mp_opt);
 
@@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req,
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
 
 		if (mp_opt.mp_join)
-			return;
+			return 0;
 	} else if (mp_opt.mp_join) {
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
 	}
@@ -157,7 +162,7 @@ again:
 			} else {
 				subflow_req->mp_capable = 1;
 			}
-			return;
+			return 0;
 		}
 
 		err = mptcp_token_new_request(req);
@@ -175,7 +180,11 @@ again:
 		subflow_req->remote_nonce = mp_opt.nonce;
 		subflow_req->msk = subflow_token_join_request(req, skb);
 
-		if (unlikely(req->syncookie) && subflow_req->msk) {
+		/* Can't fall back to TCP in this case. */
+		if (!subflow_req->msk)
+			return -EPERM;
+
+		if (unlikely(req->syncookie)) {
 			if (mptcp_can_accept_new_subflow(subflow_req->msk))
 				subflow_init_req_cookie_join_save(subflow_req, skb);
 		}
@@ -183,6 +192,8 @@ again:
 		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
 			 subflow_req->remote_nonce, subflow_req->msk);
 	}
+
+	return 0;
 }
 
 int mptcp_subflow_init_cookie_req(struct request_sock *req,
@@ -228,27 +239,53 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
 }
 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
 
-static void subflow_v4_init_req(struct request_sock *req,
-				const struct sock *sk_listener,
-				struct sk_buff *skb)
+static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
+					      struct sk_buff *skb,
+					      struct flowi *fl,
+					      struct request_sock *req)
 {
+	struct dst_entry *dst;
+	int err;
+
 	tcp_rsk(req)->is_mptcp = 1;
 
-	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+	if (!dst)
+		return NULL;
+
+	err = subflow_init_req(req, sk, skb);
+	if (err == 0)
+		return dst;
 
-	subflow_init_req(req, sk_listener, skb);
+	dst_release(dst);
+	if (!req->syncookie)
+		tcp_request_sock_ops.send_reset(sk, skb);
+	return NULL;
 }
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static void subflow_v6_init_req(struct request_sock *req,
-				const struct sock *sk_listener,
-				struct sk_buff *skb)
+static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
+					      struct sk_buff *skb,
+					      struct flowi *fl,
+					      struct request_sock *req)
 {
+	struct dst_entry *dst;
+	int err;
+
 	tcp_rsk(req)->is_mptcp = 1;
 
-	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+	if (!dst)
+		return NULL;
 
-	subflow_init_req(req, sk_listener, skb);
+	err = subflow_init_req(req, sk, skb);
+	if (err == 0)
+		return dst;
+
+	dst_release(dst);
+	if (!req->syncookie)
+		tcp6_request_sock_ops.send_reset(sk, skb);
+	return NULL;
 }
 #endif
 
@@ -276,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct sock *sk = subflow->conn;
 
+	/* must hold: tcp_done() could drop last reference on parent */
+	sock_hold(sk);
+
 	tcp_set_state(ssk, TCP_CLOSE);
 	tcp_send_active_reset(ssk, GFP_ATOMIC);
 	tcp_done(ssk);
 	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
 	    schedule_work(&mptcp_sk(sk)->work))
-		sock_hold(sk);
+		return; /* worker will put sk for us */
+
+	sock_put(sk);
 }
 
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
@@ -577,6 +619,11 @@ create_child:
 			 */
 			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
 
+			/* record the newly created socket as the first msk
+			 * subflow, but don't link it yet into conn_list
+			 */
+			WRITE_ONCE(mptcp_sk(new_msk)->first, child);
+
 			/* new mpc subflow takes ownership of the newly
 			 * created mptcp socket
 			 */
@@ -845,8 +892,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
 		sk_eat_skb(ssk, skb);
 	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
 		subflow->map_valid = 0;
-	if (incr)
-		tcp_cleanup_rbuf(ssk, incr);
 }
 
 static bool subflow_check_data_avail(struct sock *ssk)
@@ -968,7 +1013,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
 	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	const struct sock *sk = subflow->conn;
 
-	*space = tcp_space(sk);
+	*space = __mptcp_space(sk);
 	*full_space = tcp_full_space(sk);
 }
 
@@ -993,20 +1038,9 @@ static void subflow_data_ready(struct sock *sk)
 		mptcp_data_ready(parent, sk);
 }
 
-static void subflow_write_space(struct sock *sk)
+static void subflow_write_space(struct sock *ssk)
 {
-	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-	struct sock *parent = subflow->conn;
-
-	if (!sk_stream_is_writeable(sk))
-		return;
-
-	if (sk_stream_is_writeable(parent)) {
-		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
-		smp_mb__after_atomic();
-		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
-		sk_stream_write_space(parent);
-	}
+	/* we take action in __mptcp_clean_una() */
 }
 
 static struct inet_connection_sock_af_ops *
@@ -1120,21 +1154,48 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
 	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
 	mptcp_info2sockaddr(remote, &addr);
 
+	mptcp_add_pending_subflow(msk, subflow);
 	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
 	if (err && err != -EINPROGRESS)
-		goto failed;
+		goto failed_unlink;
 
+	return err;
+
+failed_unlink:
 	spin_lock_bh(&msk->join_list_lock);
-	list_add_tail(&subflow->node, &msk->join_list);
+	list_del(&subflow->node);
 	spin_unlock_bh(&msk->join_list_lock);
 
-	return err;
-
 failed:
+	subflow->disposable = 1;
 	sock_release(sf);
 	return err;
 }
 
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
+				*child_skcd = &child->sk_cgrp_data;
+
+	/* only the additional subflows created by kworkers have to be modified */
+	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
+	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
+#ifdef CONFIG_MEMCG
+		struct mem_cgroup *memcg = parent->sk_memcg;
+
+		mem_cgroup_sk_free(child);
+		if (memcg && css_tryget(&memcg->css))
+			child->sk_memcg = memcg;
+#endif /* CONFIG_MEMCG */
+
+		cgroup_sk_free(child_skcd);
+		*child_skcd = *parent_skcd;
+		cgroup_sk_clone(child_skcd);
+	}
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
 	struct mptcp_subflow_context *subflow;
@@ -1155,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 
 	lock_sock(sf->sk);
 
+	/* the newly created socket has to be in the same cgroup as its parent */
+	mptcp_attach_cgroup(sk, sf->sk);
+
 	/* kernel sockets do not by default acquire net ref, but TCP timer
 	 * needs it.
 	 */
@@ -1253,7 +1317,6 @@ static void subflow_state_change(struct sock *sk)
 		mptcp_data_ready(parent, sk);
 
 	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
-	    !(parent->sk_shutdown & RCV_SHUTDOWN) &&
 	    !subflow->rx_eof && subflow_is_done(sk)) {
 		subflow->rx_eof = 1;
 		mptcp_subflow_eof(parent);
@@ -1296,17 +1359,27 @@ out:
 	return err;
 }
 
-static void subflow_ulp_release(struct sock *sk)
+static void subflow_ulp_release(struct sock *ssk)
 {
-	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+	bool release = true;
+	struct sock *sk;
 
 	if (!ctx)
 		return;
 
-	if (ctx->conn)
-		sock_put(ctx->conn);
+	sk = ctx->conn;
+	if (sk) {
+		/* if the msk has been orphaned, keep the ctx
+		 * alive, will be freed by __mptcp_close_ssk(),
+		 * when the subflow is still unaccepted
+		 */
+		release = ctx->disposable || list_empty(&ctx->node);
+		sock_put(sk);
+	}
 
-	kfree_rcu(ctx, rcu);
+	if (release)
+		kfree_rcu(ctx, rcu);
 }
 
 static void subflow_ulp_clone(const struct request_sock *req,
@@ -1391,7 +1464,7 @@ void __init mptcp_subflow_init(void)
 		panic("MPTCP: failed to init subflow request sock ops\n");
 
 	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
-	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
 
 	subflow_specific = ipv4_specific;
 	subflow_specific.conn_request = subflow_v4_conn_request;
@@ -1400,7 +1473,7 @@ void __init mptcp_subflow_init(void)
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
-	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
 
 	subflow_v6_specific = ipv6_specific;
 	subflow_v6_specific.conn_request = subflow_v6_conn_request;
author	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-16 00:22:29 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-16 00:22:29 +0300
commit	d635a69dd4981cc51f90293f5f64268620ed1565 (patch)
tree	5e0a758b402ea7d624c25c3a343545dd29e80f31 /net/mptcp/subflow.c
parent	ac73e3dc8acd0a3be292755db30388c3580f5674 (diff)
parent	efd5a1584537698220578227e6467638307c2a0b (diff)
download	linux-d635a69dd4981cc51f90293f5f64268620ed1565.tar.xz