author     Linus Torvalds <torvalds@linux-foundation.org>   2020-12-16 00:22:29 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-12-16 00:22:29 +0300
commit     d635a69dd4981cc51f90293f5f64268620ed1565 (patch)
tree       5e0a758b402ea7d624c25c3a343545dd29e80f31 /net/mptcp/options.c
parent     ac73e3dc8acd0a3be292755db30388c3580f5674 (diff)
parent     efd5a1584537698220578227e6467638307c2a0b (diff)
Merge tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski:
"Core:
- support "prefer busy polling" NAPI operation mode, where we defer
softirq for some time expecting applications to periodically busy
poll
- AF_XDP: improve efficiency by more batching and hindering the
adjacency cache prefetcher
- af_packet: make packet_fanout.arr size configurable up to 64K
- tcp: optimize TCP zero copy receive in presence of partial or
unaligned reads making zero copy a performance win for much smaller
messages
- XDP: add bulk APIs for returning / freeing frames
- sched: support fragmenting IP packets as they come out of conntrack
- net: allow virtual netdevs to forward UDP L4 and fraglist GSO skbs
BPF:
- BPF switch from crude rlimit-based to memcg-based memory accounting
- BPF type format information for kernel modules and related tracing
enhancements
- BPF implement task local storage for BPF LSM
- allow the FENTRY/FEXIT/RAW_TP tracing programs to use
bpf_sk_storage
Protocols:
- mptcp: improve multiple xmit streams support, memory accounting and
many smaller improvements
- TLS: support CHACHA20-POLY1305 cipher
- seg6: add support for SRv6 End.DT4/DT6 behavior
- sctp: Implement RFC 6951: UDP Encapsulation of SCTP
- ppp_generic: add ability to bridge channels directly
- bridge: Connectivity Fault Management (CFM) support as is defined
in IEEE 802.1Q section 12.14.
Drivers:
- mlx5: make use of the new auxiliary bus to organize the driver
internals
- mlx5: more accurate port TX timestamping support
- mlxsw:
- improve the efficiency of offloaded next hop updates by using
the new nexthop object API
- support blackhole nexthops
- support IEEE 802.1ad (Q-in-Q) bridging
- rtw88: major bluetooth co-existence improvements
- iwlwifi: support new 6 GHz frequency band
- ath11k: Fast Initial Link Setup (FILS)
- mt7915: dual band concurrent (DBDC) support
- net: ipa: add basic support for IPA v4.5
Refactor:
- a few pieces of in_interrupt() cleanup work from Sebastian Andrzej
Siewior
- phy: add support for shared interrupts; get rid of multiple driver
APIs and have the drivers write a full IRQ handler, slight growth
of driver code should be compensated by the simpler API which also
allows shared IRQs
- add common code for handling netdev per-cpu counters
- move TX packet re-allocation from Ethernet switch tag drivers to a
central place
- improve efficiency and rename nla_strlcpy
- number of W=1 warning cleanups as we now catch those in a patchwork
build bot
Old code removal:
- wan: delete the DLCI / SDLA drivers
- wimax: move to staging
- wifi: remove old WDS wifi bridging support"
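
The "prefer busy polling" item in the Core list above is application-facing: a socket opts in via new socket options and the kernel then defers softirq processing while the application keeps polling. Below is a minimal, hedged sketch of how a receiver might enable it; the numeric fallback values are taken from the uapi socket headers of this cycle and should be treated as an assumption if your libc headers already define the names.

/* Sketch: opt a socket into preferred busy polling (assumes the
 * SO_PREFER_BUSY_POLL / SO_BUSY_POLL_BUDGET options added this cycle).
 */
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL 69   /* assumed value from asm-generic/socket.h */
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET 70   /* assumed value from asm-generic/socket.h */
#endif

static int enable_prefer_busy_poll(int fd)
{
	int usecs = 50;   /* SO_BUSY_POLL: spin up to 50us per receive attempt */
	int prefer = 1;   /* defer softirq processing while the app busy polls */
	int budget = 16;  /* packets handled per busy-poll iteration */

	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) ||
	    setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefer, sizeof(prefer)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget))) {
		perror("setsockopt");
		return -1;
	}
	return 0;
}

The helper name and the chosen timeout/budget values are illustrative only; the effect is that the application drives packet processing from its own polling loop instead of competing with softirqs.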
* tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1922 commits)
net: hns3: fix expression that is currently always true
net: fix proc_fs init handling in af_packet and tls
nfc: pn533: convert comma to semicolon
af_vsock: Assign the vsock transport considering the vsock address flags
af_vsock: Set VMADDR_FLAG_TO_HOST flag on the receive path
vsock_addr: Check for supported flag values
vm_sockets: Add VMADDR_FLAG_TO_HOST vsock flag
vm_sockets: Add flags field in the vsock address data structure
net: Disable NETIF_F_HW_TLS_TX when HW_CSUM is disabled
tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit
net: mscc: ocelot: install MAC addresses in .ndo_set_rx_mode from process context
nfc: s3fwrn5: Release the nfc firmware
net: vxge: clean up sparse warnings
mlxsw: spectrum_router: Use eXtended mezzanine to offload IPv4 router
mlxsw: spectrum: Set KVH XLT cache mode for Spectrum2/3
mlxsw: spectrum_router_xm: Introduce basic XM cache flushing
mlxsw: reg: Add Router LPM Cache Enable Register
mlxsw: reg: Add Router LPM Cache ML Delete Register
mlxsw: spectrum_router_xm: Implement L-value tracking for M-index
mlxsw: reg: Add XM Router M Table Register
...
Diffstat (limited to 'net/mptcp/options.c')
-rw-r--r--   net/mptcp/options.c   218
1 file changed, 163 insertions(+), 55 deletions(-)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 90cd52df99a6..c5328f407aab 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -242,7 +242,6 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		mp_opt->add_addr = 1;
 		mp_opt->addr_id = *ptr++;
-		pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo);
 		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
 			memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
 			ptr += 4;
@@ -267,6 +266,9 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 			mp_opt->ahmac = get_unaligned_be64(ptr);
 			ptr += 8;
 		}
+		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+			 (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "",
+			 mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port);
 		break;
 	case MPTCPOPT_RM_ADDR:
@@ -280,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
 		break;
+	case MPTCPOPT_MP_FASTCLOSE:
+		if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
+			break;
+
+		ptr += 2;
+		mp_opt->rcvr_key = get_unaligned_be64(ptr);
+		ptr += 8;
+		mp_opt->fastclose = 1;
+		break;
+
 	default:
 		break;
 	}
@@ -297,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb,
 	mp_opt->mp_join = 0;
 	mp_opt->add_addr = 0;
 	mp_opt->ahmac = 0;
+	mp_opt->fastclose = 0;
 	mp_opt->port = 0;
 	mp_opt->rm_addr = 0;
 	mp_opt->dss = 0;
@@ -492,7 +505,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 	bool ret = false;
 	mpext = skb ? mptcp_get_ext(skb) : NULL;
-	snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable);
+	snd_data_fin_enable = mptcp_data_fin_enabled(msk);
 	if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
 		unsigned int map_size;
@@ -528,6 +541,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 			opts->ext_copy.ack64 = 0;
 		}
 		opts->ext_copy.use_ack = 1;
+		WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
 	if (dss_size == 0)
@@ -573,27 +587,43 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
 }
 #endif
-static bool mptcp_established_options_add_addr(struct sock *sk,
+static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
 					       unsigned int *size,
 					       unsigned int remaining,
 					       struct mptcp_out_options *opts)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+	bool drop_other_suboptions = false;
+	unsigned int opt_size = *size;
 	struct mptcp_addr_info saddr;
 	bool echo;
+	bool port;
 	int len;
+	if ((mptcp_pm_should_add_signal_ipv6(msk) ||
+	     mptcp_pm_should_add_signal_port(msk)) &&
+	    skb && skb_is_tcp_pure_ack(skb)) {
+		pr_debug("drop other suboptions");
+		opts->suboptions = 0;
+		remaining += opt_size;
+		drop_other_suboptions = true;
+	}
+
 	if (!mptcp_pm_should_add_signal(msk) ||
-	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo)))
+	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port)))
 		return false;
-	len = mptcp_add_addr_len(saddr.family, echo);
+	len = mptcp_add_addr_len(saddr.family, echo, port);
 	if (remaining < len)
 		return false;
 	*size = len;
+	if (drop_other_suboptions)
+		*size -= opt_size;
 	opts->addr_id = saddr.id;
+	if (port)
+		opts->port = ntohs(saddr.port);
 	if (saddr.family == AF_INET) {
 		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
 		opts->addr = saddr.addr;
@@ -616,7 +646,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk,
 		}
 	}
 #endif
-	pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo);
+	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+		 opts->addr_id, opts->ahmac, echo, opts->port);
 	return true;
 }
@@ -678,7 +709,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 		*size += opt_size;
 		remaining -= opt_size;
-	if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) {
+	if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
 		*size += opt_size;
 		remaining -= opt_size;
 		ret = true;
@@ -759,6 +790,11 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
 		goto fully_established;
 	}
+	if (mp_opt->add_addr) {
+		WRITE_ONCE(msk->fully_established, true);
+		return true;
+	}
+
 	/* If the first established packet does not contain MP_CAPABLE + data
 	 * then fallback to TCP. Fallback scenarios requires a reset for
 	 * MP_JOIN subflows.
@@ -777,7 +813,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
 	mptcp_subflow_fully_established(subflow, mp_opt);
 fully_established:
-	if (likely(subflow->pm_notified))
+	/* if the subflow is not already linked into the conn_list, we can't
+	 * notify the PM: this subflow is still on the listener queue
+	 * and the PM possibly acquiring the subflow lock could race with
+	 * the listener close
+	 */
+	if (likely(subflow->pm_notified) || list_empty(&subflow->node))
 		return true;
 	subflow->pm_notified = 1;
@@ -809,31 +850,39 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
 	return cur_ack;
 }
-static void update_una(struct mptcp_sock *msk,
-		       struct mptcp_options_received *mp_opt)
+static void ack_update_msk(struct mptcp_sock *msk,
+			   struct sock *ssk,
+			   struct mptcp_options_received *mp_opt)
 {
-	u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
-	u64 write_seq = READ_ONCE(msk->write_seq);
+	u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
+	struct sock *sk = (struct sock *)msk;
+	u64 old_snd_una;
+
+	mptcp_data_lock(sk);
 	/* avoid ack expansion on update conflict, to reduce the risk of
 	 * wrongly expanding to a future ack sequence number, which is way
 	 * more dangerous than missing an ack
 	 */
+	old_snd_una = msk->snd_una;
 	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
 	/* ACK for data not even sent yet? Ignore. */
-	if (after64(new_snd_una, write_seq))
+	if (after64(new_snd_una, snd_nxt))
 		new_snd_una = old_snd_una;
-	while (after64(new_snd_una, old_snd_una)) {
-		snd_una = old_snd_una;
-		old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
-					       new_snd_una);
-		if (old_snd_una == snd_una) {
-			mptcp_data_acked((struct sock *)msk);
-			break;
-		}
+	new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
+
+	if (after64(new_wnd_end, msk->wnd_end)) {
+		msk->wnd_end = new_wnd_end;
+		__mptcp_wnd_updated(sk, ssk);
+	}
+
+	if (after64(new_snd_una, old_snd_una)) {
+		msk->snd_una = new_snd_una;
+		__mptcp_data_acked(sk);
 	}
+	mptcp_data_unlock(sk);
 }
 bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@@ -886,13 +935,30 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	struct mptcp_options_received mp_opt;
 	struct mptcp_ext *mpext;
-	if (__mptcp_check_fallback(msk))
+	if (__mptcp_check_fallback(msk)) {
+		/* Keep it simple and unconditionally trigger send data cleanup and
+		 * pending queue spooling. We will need to acquire the data lock
+		 * for more accurate checks, and once the lock is acquired, such
+		 * helpers are cheap.
+		 */
+		mptcp_data_lock(subflow->conn);
+		if (mptcp_send_head(subflow->conn))
+			__mptcp_wnd_updated(subflow->conn, sk);
+		__mptcp_data_acked(subflow->conn);
+		mptcp_data_unlock(subflow->conn);
 		return;
+	}
 	mptcp_get_options(skb, &mp_opt);
 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
 		return;
+	if (mp_opt.fastclose &&
+	    msk->local_key == mp_opt.rcvr_key) {
+		WRITE_ONCE(msk->rcv_fastclose, true);
+		mptcp_schedule_work((struct sock *)msk);
+	}
+
 	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
 		struct mptcp_addr_info addr;
@@ -930,7 +996,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	 * monodirectional flows will stuck
 	 */
 	if (mp_opt.use_ack)
-		update_una(msk, &mp_opt);
+		ack_update_msk(msk, sk, &mp_opt);
 	/* Zero-data-length packets are dropped by the caller and not
 	 * propagated to the MPTCP layer, so the skb extension does not
@@ -975,7 +1041,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	}
 }
-void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
+static void mptcp_set_rwin(const struct tcp_sock *tp)
+{
+	const struct sock *ssk = (const struct sock *)tp;
+	const struct mptcp_subflow_context *subflow;
+	struct mptcp_sock *msk;
+	u64 ack_seq;
+
+	subflow = mptcp_subflow_ctx(ssk);
+	msk = mptcp_sk(subflow->conn);
+
+	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+
+	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
+		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+}
+
+void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+			 struct mptcp_out_options *opts)
 {
 	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
@@ -1014,44 +1097,66 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
 	}
 mp_capable_done:
-	if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
-		if (opts->ahmac)
-			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
-					      TCPOLEN_MPTCP_ADD_ADDR, 0,
-					      opts->addr_id);
-		else
-			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
-					      TCPOLEN_MPTCP_ADD_ADDR_BASE,
-					      MPTCP_ADDR_ECHO,
-					      opts->addr_id);
-		memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
-		ptr += 1;
+	if ((OPTION_MPTCP_ADD_ADDR
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	     | OPTION_MPTCP_ADD_ADDR6
+#endif
+	    ) & opts->suboptions) {
+		u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
+		u8 echo = MPTCP_ADDR_ECHO;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+		if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions)
+			len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
+#endif
+
+		if (opts->port)
+			len += TCPOLEN_MPTCP_PORT_LEN;
+
 		if (opts->ahmac) {
-			put_unaligned_be64(opts->ahmac, ptr);
-			ptr += 2;
+			len += sizeof(opts->ahmac);
+			echo = 0;
 		}
-	}
+		*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+				      len, echo, opts->addr_id);
+		if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+			memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
+			ptr += 1;
+		}
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
-	if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
-		if (opts->ahmac)
-			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
-					      TCPOLEN_MPTCP_ADD_ADDR6, 0,
-					      opts->addr_id);
-		else
-			*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
-					      TCPOLEN_MPTCP_ADD_ADDR6_BASE,
-					      MPTCP_ADDR_ECHO,
-					      opts->addr_id);
-		memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
-		ptr += 4;
-		if (opts->ahmac) {
-			put_unaligned_be64(opts->ahmac, ptr);
-			ptr += 2;
+		else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
+			memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
+			ptr += 4;
 		}
-	}
 #endif
+		if (!opts->port) {
+			if (opts->ahmac) {
+				put_unaligned_be64(opts->ahmac, ptr);
+				ptr += 2;
+			}
+		} else {
+			if (opts->ahmac) {
+				u8 *bptr = (u8 *)ptr;
+
+				put_unaligned_be16(opts->port, bptr);
+				bptr += 2;
+				put_unaligned_be64(opts->ahmac, bptr);
+				bptr += 8;
+				put_unaligned_be16(TCPOPT_NOP << 8 |
						   TCPOPT_NOP, bptr);
+
+				ptr += 3;
+			} else {
+				put_unaligned_be32(opts->port << 16 |
+						   TCPOPT_NOP << 8 |
+						   TCPOPT_NOP, ptr);
+				ptr += 1;
+			}
+		}
+	}
+
 	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
 		*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
 				      TCPOLEN_MPTCP_RM_ADDR_BASE,
@@ -1132,4 +1237,7 @@ mp_capable_done:
 				   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
 		}
 	}
+
+	if (tp)
+		mptcp_set_rwin(tp);
 }
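
The largest hunk above reworks how the ADD_ADDR suboption is serialized once an optional port may be advertised: the option length and echo flag are computed up front, then the address, optional 2-byte port, and optional 8-byte truncated HMAC are written, with TCPOPT_NOP bytes appended so the TCP option space stays 32-bit aligned. The following is a small, user-space illustration of the same sizing rule, not the kernel helper; the field sizes come from the ADD_ADDR layout in RFC 8684, and the struct and function names are invented for the sketch.

/* Sketch of the ADD_ADDR length/padding arithmetic mirrored by the
 * mptcp_add_addr_len()/mptcp_write_options() changes in the diff above.
 */
#include <stdbool.h>
#include <stdio.h>

struct add_addr_opt {
	bool ipv6;   /* 16-byte address instead of 4 */
	bool echo;   /* an echoed ADD_ADDR carries no HMAC */
	bool port;   /* optional 2-byte port follows the address */
};

/* Value that goes into the TCP option length field. */
static unsigned int add_addr_len(const struct add_addr_opt *o)
{
	unsigned int len = 4;            /* kind + length + subtype/flags + addr id */

	len += o->ipv6 ? 16 : 4;         /* advertised address */
	if (o->port)
		len += 2;                /* optional port */
	if (!o->echo)
		len += 8;                /* truncated HMAC, absent on echo */
	return len;
}

/* TCP options are emitted in 32-bit words, so a length that is not a
 * multiple of four needs trailing NOPs - the reason the new port path
 * in mptcp_write_options() appends one or two TCPOPT_NOP bytes.
 */
static unsigned int add_addr_padding(const struct add_addr_opt *o)
{
	return (4 - add_addr_len(o) % 4) % 4;
}

int main(void)
{
	struct add_addr_opt o = { .ipv6 = false, .echo = false, .port = true };

	/* IPv4 + port + HMAC: 18 bytes of option, 2 NOP bytes of padding. */
	printf("len=%u pad=%u\n", add_addr_len(&o), add_addr_padding(&o));
	return 0;
}

The 18-byte case is exactly the branch in the diff that writes the port, the HMAC, and then a final 16-bit pair of NOPs before advancing ptr by three 32-bit words.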