diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-16 00:22:29 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-16 00:22:29 +0300 |
commit | d635a69dd4981cc51f90293f5f64268620ed1565 (patch) | |
tree | 5e0a758b402ea7d624c25c3a343545dd29e80f31 /net/mptcp/subflow.c | |
parent | ac73e3dc8acd0a3be292755db30388c3580f5674 (diff) | |
parent | efd5a1584537698220578227e6467638307c2a0b (diff) | |
download | linux-d635a69dd4981cc51f90293f5f64268620ed1565.tar.xz |
Merge tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski:
"Core:
- support "prefer busy polling" NAPI operation mode, where we defer
softirq for some time expecting applications to periodically busy
poll
- AF_XDP: improve efficiency by more batching and hindering the
adjacency cache prefetcher
- af_packet: make packet_fanout.arr size configurable up to 64K
- tcp: optimize TCP zero copy receive in presence of partial or
unaligned reads making zero copy a performance win for much smaller
messages
- XDP: add bulk APIs for returning / freeing frames
- sched: support fragmenting IP packets as they come out of conntrack
- net: allow virtual netdevs to forward UDP L4 and fraglist GSO skbs
BPF:
- BPF switch from crude rlimit-based to memcg-based memory accounting
- BPF type format information for kernel modules and related tracing
enhancements
- BPF implement task local storage for BPF LSM
- allow the FENTRY/FEXIT/RAW_TP tracing programs to use
bpf_sk_storage
Protocols:
- mptcp: improve multiple xmit streams support, memory accounting and
many smaller improvements
- TLS: support CHACHA20-POLY1305 cipher
- seg6: add support for SRv6 End.DT4/DT6 behavior
- sctp: Implement RFC 6951: UDP Encapsulation of SCTP
- ppp_generic: add ability to bridge channels directly
- bridge: Connectivity Fault Management (CFM) support as is defined
in IEEE 802.1Q section 12.14.
Drivers:
- mlx5: make use of the new auxiliary bus to organize the driver
internals
- mlx5: more accurate port TX timestamping support
- mlxsw:
- improve the efficiency of offloaded next hop updates by using
the new nexthop object API
- support blackhole nexthops
- support IEEE 802.1ad (Q-in-Q) bridging
- rtw88: major bluetooth co-existance improvements
- iwlwifi: support new 6 GHz frequency band
- ath11k: Fast Initial Link Setup (FILS)
- mt7915: dual band concurrent (DBDC) support
- net: ipa: add basic support for IPA v4.5
Refactor:
- a few pieces of in_interrupt() cleanup work from Sebastian Andrzej
Siewior
- phy: add support for shared interrupts; get rid of multiple driver
APIs and have the drivers write a full IRQ handler, slight growth
of driver code should be compensated by the simpler API which also
allows shared IRQs
- add common code for handling netdev per-cpu counters
- move TX packet re-allocation from Ethernet switch tag drivers to a
central place
- improve efficiency and rename nla_strlcpy
- number of W=1 warning cleanups as we now catch those in a patchwork
build bot
Old code removal:
- wan: delete the DLCI / SDLA drivers
- wimax: move to staging
- wifi: remove old WDS wifi bridging support"
* tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1922 commits)
net: hns3: fix expression that is currently always true
net: fix proc_fs init handling in af_packet and tls
nfc: pn533: convert comma to semicolon
af_vsock: Assign the vsock transport considering the vsock address flags
af_vsock: Set VMADDR_FLAG_TO_HOST flag on the receive path
vsock_addr: Check for supported flag values
vm_sockets: Add VMADDR_FLAG_TO_HOST vsock flag
vm_sockets: Add flags field in the vsock address data structure
net: Disable NETIF_F_HW_TLS_TX when HW_CSUM is disabled
tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit
net: mscc: ocelot: install MAC addresses in .ndo_set_rx_mode from process context
nfc: s3fwrn5: Release the nfc firmware
net: vxget: clean up sparse warnings
mlxsw: spectrum_router: Use eXtended mezzanine to offload IPv4 router
mlxsw: spectrum: Set KVH XLT cache mode for Spectrum2/3
mlxsw: spectrum_router_xm: Introduce basic XM cache flushing
mlxsw: reg: Add Router LPM Cache Enable Register
mlxsw: reg: Add Router LPM Cache ML Delete Register
mlxsw: spectrum_router_xm: Implement L-value tracking for M-index
mlxsw: reg: Add XM Router M Table Register
...
Diffstat (limited to 'net/mptcp/subflow.c')
-rw-r--r-- | net/mptcp/subflow.c | 165 |
1 files changed, 119 insertions, 46 deletions
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c21a852a6ffa..278cbe3e539e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li return 0; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +/* Init mptcp request socket. + * + * Returns an error code if a JOIN has failed and a TCP reset + * should be sent. + */ +static int subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req, ret = __subflow_init_req(req, sk_listener); if (ret) - return; + return 0; mptcp_get_options(skb, &mp_opt); @@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req, SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (mp_opt.mp_join) - return; + return 0; } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } @@ -157,7 +162,7 @@ again: } else { subflow_req->mp_capable = 1; } - return; + return 0; } err = mptcp_token_new_request(req); @@ -175,7 +180,11 @@ again: subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); - if (unlikely(req->syncookie) && subflow_req->msk) { + /* Can't fall back to TCP in this case. */ + if (!subflow_req->msk) + return -EPERM; + + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); } @@ -183,6 +192,8 @@ again: pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } + + return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, @@ -228,27 +239,53 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); -static void subflow_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v4_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; + + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; - subflow_init_req(req, sk_listener, skb); + dst_release(dst); + if (!req->syncookie) + tcp_request_sock_ops.send_reset(sk, skb); + return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static void subflow_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v6_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - subflow_init_req(req, sk_listener, skb); + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; + + dst_release(dst); + if (!req->syncookie) + tcp6_request_sock_ops.send_reset(sk, skb); + return NULL; } #endif @@ -276,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) @@ -577,6 +619,11 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* record the newly created socket as the first msk + * subflow, but don't link it yet into conn_list + */ + WRITE_ONCE(mptcp_sk(new_msk)->first, child); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ @@ -845,8 +892,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; - if (incr) - tcp_cleanup_rbuf(ssk, incr); } static bool subflow_check_data_avail(struct sock *ssk) @@ -968,7 +1013,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; - *space = tcp_space(sk); + *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } @@ -993,20 +1038,9 @@ static void subflow_data_ready(struct sock *sk) mptcp_data_ready(parent, sk); } -static void subflow_write_space(struct sock *sk) +static void subflow_write_space(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; - - if (!sk_stream_is_writeable(sk)) - return; - - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); - } + /* we take action in __mptcp_clean_una() */ } static struct inet_connection_sock_af_ops * @@ -1120,21 +1154,48 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr); + mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) - goto failed; + goto failed_unlink; + return err; + +failed_unlink: spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); + list_del(&subflow->node); spin_unlock_bh(&msk->join_list_lock); - return err; - failed: + subflow->disposable = 1; sock_release(sf); return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1155,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ @@ -1253,7 +1317,6 @@ static void subflow_state_change(struct sock *sk) mptcp_data_ready(parent, sk); if (__mptcp_check_fallback(mptcp_sk(parent)) && - !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1296,17 +1359,27 @@ out: return err; } -static void subflow_ulp_release(struct sock *sk) +static void subflow_ulp_release(struct sock *ssk) { - struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + bool release = true; + struct sock *sk; if (!ctx) return; - if (ctx->conn) - sock_put(ctx->conn); + sk = ctx->conn; + if (sk) { + /* if the msk has been orphaned, keep the ctx + * alive, will be freed by __mptcp_close_ssk(), + * when the subflow is still unaccepted + */ + release = ctx->disposable || list_empty(&ctx->node); + sock_put(sk); + } - kfree_rcu(ctx, rcu); + if (release) + kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, @@ -1391,7 +1464,7 @@ void __init mptcp_subflow_init(void) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1400,7 +1473,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; |