Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/ctrl.c       |  68
-rw-r--r-- | net/mptcp/mib.c        |   2
-rw-r--r-- | net/mptcp/mib.h        |   2
-rw-r--r-- | net/mptcp/mptcp_diag.c |   1
-rw-r--r-- | net/mptcp/options.c    | 198
-rw-r--r-- | net/mptcp/pm.c         |   1
-rw-r--r-- | net/mptcp/pm_netlink.c |  28
-rw-r--r-- | net/mptcp/protocol.c   | 281
-rw-r--r-- | net/mptcp/protocol.h   |  51
-rw-r--r-- | net/mptcp/sockopt.c    | 149
-rw-r--r-- | net/mptcp/subflow.c    | 292
-rw-r--r-- | net/mptcp/token.c      |   9
12 files changed, 669 insertions, 413 deletions
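Most of the churn below wires RFC 8684 DSS checksum support through option parsing, the TX path (mptcp_update_data_checksum(), mptcp_make_csum()) and the RX path (validate_data_csum()). As a rough illustration of the pseudo-header layout the patch builds (64-bit data sequence number, subflow sequence number, data-level length, zeroed checksum field), here is a minimal user-space sketch; the csum16()/dss_csum() helpers are hypothetical stand-ins for the kernel's csum_partial()/csum_fold() and are not part of the patch or of any kernel API.

```c
/* Minimal sketch of the RFC 8684 DSS checksum pseudo-header, assuming
 * glibc/Linux for htobe64(); csum16()/dss_csum() are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>
#include <endian.h>      /* htobe64() */
#include <arpa/inet.h>   /* htonl(), htons() */

struct dss_pseudo_header {
	uint64_t data_seq;     /* 64-bit DSN, network order */
	uint32_t subflow_seq;  /* relative subflow sequence number */
	uint16_t data_len;     /* data-level length, incl. DATA_FIN */
	uint16_t csum;         /* zero while computing */
} __attribute__((packed));

/* plain 1's-complement sum over a buffer, folded to 16 bits */
static uint16_t csum16(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *p = buf;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)                         /* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)                /* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t dss_csum(uint64_t dsn, uint32_t ssn, uint16_t len,
			 const void *data)
{
	struct dss_pseudo_header hdr = {
		.data_seq    = htobe64(dsn),
		.subflow_seq = htonl(ssn),
		.data_len    = htons(len),
		.csum        = 0,
	};
	uint32_t sum = csum16(&hdr, sizeof(hdr), 0);

	/* sender transmits the one's complement of the running sum */
	return (uint16_t)~csum16(data, len, sum);
}

int main(void)
{
	const char payload[] = "hello";

	printf("dss csum = 0x%04x\n",
	       dss_csum(1ULL << 32, 1, sizeof(payload) - 1, payload));
	return 0;
}
```

The receiver-side check in the series accumulates the same sum incrementally across skbs (subflow->map_data_csum/map_csum_len) and only folds it once the whole mapping has been seen, which is why get_mapping_status() defers to validate_data_csum() instead of checksumming per packet.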
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 96ba616f59bf..7d738bd06f2c 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -4,7 +4,9 @@ * Copyright (c) 2019, Tessares SA. */ +#ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#endif #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -15,10 +17,14 @@ static int mptcp_pernet_id; struct mptcp_pernet { +#ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; +#endif - int mptcp_enabled; + u8 mptcp_enabled; unsigned int add_addr_timeout; + u8 checksum_enabled; + u8 allow_join_initial_addr_port; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -36,15 +42,36 @@ unsigned int mptcp_get_add_addr_timeout(struct net *net) return mptcp_get_pernet(net)->add_addr_timeout; } +int mptcp_is_checksum_enabled(struct net *net) +{ + return mptcp_get_pernet(net)->checksum_enabled; +} + +int mptcp_allow_join_id0(struct net *net) +{ + return mptcp_get_pernet(net)->allow_join_initial_addr_port; +} + +static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) +{ + pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; + pernet->checksum_enabled = 0; + pernet->allow_join_initial_addr_port = 1; +} + +#ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", @@ -52,15 +79,25 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "checksum_enabled", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "allow_join_initial_addr_port", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, {} }; -static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) -{ - pernet->mptcp_enabled = 1; - pernet->add_addr_timeout = TCP_RTO_MAX; -} - static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; @@ -75,6 +112,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; + table[2].data = &pernet->checksum_enabled; + table[3].data = &pernet->allow_join_initial_addr_port; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) @@ -100,6 +139,17 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) kfree(table); } +#else + +static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) +{ + return 0; +} + +static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} + +#endif /* CONFIG_SYSCTL */ + static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index eb2dc6dbe212..52ea2517e856 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -25,6 +25,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + 
SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), + SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index f0da4f060fe1..193466c9b549 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -18,6 +18,8 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ + MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */ diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f16d9b5ee978..8f88ddeab6a2 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -144,6 +144,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 6b825fb3fa83..b5850afea343 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -44,7 +44,20 @@ static void mptcp_parse_option(const struct sk_buff *skb, else expected_opsize = TCPOLEN_MPTCP_MPC_SYN; } - if (opsize != expected_opsize) + + /* Cfr RFC 8684 Section 3.3.0: + * If a checksum is present but its use had + * not been negotiated in the MP_CAPABLE handshake, the receiver MUST + * close the subflow with a RST, as it is not behaving as negotiated. + * If a checksum is not present when its use has been negotiated, the + * receiver MUST close the subflow with a RST, as it is considered + * broken + * We parse even option with mismatching csum presence, so that + * later in subflow_data_ready we can trigger the reset. + */ + if (opsize != expected_opsize && + (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || + opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ @@ -66,16 +79,12 @@ static void mptcp_parse_option(const struct sk_buff *skb, * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." - * - * Section 3.3.0: - * "If a checksum is not present when its use has been - * negotiated, the receiver MUST close the subflow with a RST as - * it is considered broken." - * - * We don't implement DSS checksum - fall back to TCP. 
*/ if (flags & MPTCP_CAP_CHECKSUM_REQD) - break; + mp_opt->csum_reqd = 1; + + if (flags & MPTCP_CAP_DENY_JOIN_ID0) + mp_opt->deny_join_id0 = 1; mp_opt->mp_capable = 1; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { @@ -86,7 +95,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } - if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { + if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used @@ -98,9 +107,14 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } - pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", + if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum_reqd = 1; + ptr += 2; + } + pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, - mp_opt->rcvr_key, mp_opt->data_len); + mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: @@ -171,10 +185,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } - /* RFC 6824, Section 3.3: - * If a checksum is present, but its use had - * not been negotiated in the MP_CAPABLE handshake, - * the checksum field MUST be ignored. + /* Always parse any csum presence combination, we will enforce + * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) @@ -209,9 +221,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; - pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", + if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { + mp_opt->csum_reqd = 1; + mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + ptr += 2; + } + + pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len); + mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum); } break; @@ -323,9 +341,12 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; @@ -341,6 +362,8 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->dss = 0; mp_opt->mp_prio = 0; mp_opt->reset = 0; + mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); + mp_opt->deny_join_id0 = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -356,6 +379,8 @@ void mptcp_get_options(const struct sk_buff *skb, length--; continue; default: + if (length < 2) + return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; @@ -380,6 +405,8 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; + opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if 
(subflow->request_join) { @@ -435,8 +462,10 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; + u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than @@ -465,16 +494,27 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); + opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ - if (data_len > 0) - *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); - else + if (data_len > 0) { + len = TCPOLEN_MPTCP_MPC_ACK_DATA; + if (opts->csum_reqd) { + /* we need to propagate more info to csum the pseudo hdr */ + opts->ext_copy.data_seq = mpext->data_seq; + opts->ext_copy.subflow_seq = mpext->subflow_seq; + opts->ext_copy.csum = mpext->csum; + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + *size = ALIGN(len, 4); + } else { *size = TCPOLEN_MPTCP_MPC_ACK; + } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, @@ -535,18 +575,21 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; u64 ack_seq; + opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { - unsigned int map_size; + unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; - map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; + if (mpext) { + if (opts->csum_reqd) + map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; - remaining -= map_size; - dss_size = map_size; - if (mpext) opts->ext_copy = *mpext; + } + remaining -= map_size; + dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; @@ -789,6 +832,8 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; + opts->csum_reqd = subflow_req->csum_reqd; + opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); @@ -867,6 +912,9 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); @@ -894,19 +942,20 @@ reset: return false; } -static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { - u32 old_ack32, cur_ack32; - - if (use_64bit) - return cur_ack; - - old_ack32 = (u32)old_ack; - cur_ack32 = (u32)cur_ack; - cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; - if (unlikely(before(cur_ack32, old_ack32))) - return cur_ack + (1LL << 32); - return cur_ack; + u32 
old_seq32, cur_seq32; + + old_seq32 = (u32)old_seq; + cur_seq32 = (u32)cur_seq; + cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; + if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) + return cur_seq + (1LL << 32); + + /* reverse wrap could happen, too */ + if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) + return cur_seq - (1LL << 32); + return cur_seq; } static void ack_update_msk(struct mptcp_sock *msk, @@ -924,7 +973,7 @@ static void ack_update_msk(struct mptcp_sock *msk, * more dangerous than missing an ack */ old_snd_una = msk->snd_una; - new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ if (after64(new_snd_una, snd_nxt)) @@ -961,7 +1010,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us return false; WRITE_ONCE(msk->rcv_data_fin_seq, - expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); + mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; @@ -1007,7 +1056,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return; } - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; @@ -1099,6 +1148,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; + mpext->csum_reqd = mp_opt.csum_reqd; + + if (mpext->csum_reqd) + mpext->csum = mp_opt.csum; } } @@ -1118,25 +1171,53 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + struct csum_pseudo_header header; + __wsum csum; + + /* cfr RFC 8684 3.3.1.: + * the data sequence number used in the pseudo-header is + * always the 64-bit value, irrespective of what length is used in the + * DSS option itself. 
+ */ + header.data_seq = cpu_to_be64(mpext->data_seq); + header.subflow_seq = htonl(mpext->subflow_seq); + header.data_len = htons(mpext->data_len); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + return (__force u16)csum_fold(csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { - u8 len; + u8 len, flag = MPTCP_CAP_HMAC_SHA256; - if (OPTION_MPTCP_MPC_SYN & opts->suboptions) + if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; - else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) + } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - else if (opts->ext_copy.data_len) + } else if (opts->ext_copy.data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; - else + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } else { len = TCPOLEN_MPTCP_MPC_ACK; + } + + if (opts->csum_reqd) + flag |= MPTCP_CAP_CHECKSUM_REQD; + + if (!opts->allow_join_id0) + flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, - MPTCP_CAP_HMAC_SHA256); + flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -1152,8 +1233,13 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, if (!opts->ext_copy.data_len) goto mp_capable_done; - put_unaligned_be32(opts->ext_copy.data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + mptcp_make_csum(&opts->ext_copy), ptr); + } else { + put_unaligned_be32(opts->ext_copy.data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } ptr += 1; } @@ -1305,6 +1391,9 @@ mp_capable_done: flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); @@ -1324,8 +1413,13 @@ mp_capable_done: ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } } } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 9d00fa6d22e9..639271e09604 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -320,6 +320,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); + WRITE_ONCE(msk->pm.remote_deny_join_id0, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2469e06a3a9d..d2591ebf01d9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -451,7 +451,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check if should create a new subflow */ if (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.subflows < subflows_max && + !READ_ONCE(msk->pm.remote_deny_join_id0)) { local = select_local_address(pernet, msk); if (local) { struct mptcp_addr_info remote = { 0 }; @@ -540,6 +541,7 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) subflow = list_first_entry_or_null(&msk->conn_list, 
typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for %s%s%s", @@ -547,9 +549,9 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); - lock_sock(ssk); + slow = lock_sock_fast(ssk); tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); spin_lock_bh(&msk->pm.lock); } } @@ -566,6 +568,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *sk = (struct sock *)msk; struct mptcp_addr_info local; + bool slow; local_address((struct sock_common *)ssk, &local); if (!addresses_equal(&local, addr, addr->port)) @@ -578,9 +581,9 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - lock_sock(ssk); + slow = lock_sock_fast(ssk); tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); spin_lock_bh(&msk->pm.lock); return 0; @@ -971,8 +974,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); - if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal when using port"); + return -EINVAL; + } entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + } return 0; } @@ -1913,10 +1922,13 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - __reset_counters(pernet); pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. 
+ */ + return 0; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5edc686faff1..7a5afa8c6866 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -39,10 +39,15 @@ struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; + u8 has_rxtstamp:1; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +enum { + MPTCP_CMSG_TS = BIT(0), +}; + static struct percpu_counter mptcp_sockets_allocated; static void __mptcp_destroy_sock(struct sock *sk); @@ -272,6 +277,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; + bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); @@ -280,13 +286,17 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, /* try to fetch required memory from subflow */ if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - if (ssk->sk_forward_alloc < skb->truesize) - goto drop; - __sk_mem_reclaim(ssk, skb->truesize); - if (!sk_rmem_schedule(sk, skb, skb->truesize)) + int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; + + if (ssk->sk_forward_alloc < amount) goto drop; + + ssk->sk_forward_alloc -= amount; + sk->sk_forward_alloc += amount; } + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value @@ -294,6 +304,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -422,56 +433,55 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; - lock_sock(ssk); + slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); - release_sock(ssk); + unlock_sock_fast(ssk, slow); } } -static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) { - int ret; + bool slow; - lock_sock(ssk); - ret = tcp_can_send_ack(ssk); - if (ret) + slow = lock_sock_fast(ssk); + if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); - release_sock(ssk); - return ret; + unlock_sock_fast(ssk, slow); +} + +static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) +{ + const struct inet_connection_sock *icsk = inet_csk(ssk); + u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); + const struct tcp_sock *tp = tcp_sk(ssk); + + return (ack_pending & ICSK_ACK_SCHED) && + ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > + READ_ONCE(icsk->icsk_ack.rcv_mss)) || + (rx_empty && ack_pending & + (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) { - struct sock *ack_hint = READ_ONCE(msk->ack_hint); int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - bool cleanup; + int space = __mptcp_space(sk); + bool cleanup, rx_empty; - /* this is a simple superset of what tcp_cleanup_rbuf() implements - * so that we don't have to acquire the ssk socket lock most of the time - * to do actually nothing - */ - cleanup = __mptcp_space(sk) - old_space >= max(0, old_space); - if (!cleanup) - 
return; + cleanup = (space > 0) && (space >= (old_space << 1)); + rx_empty = !atomic_read(&sk->sk_rmem_alloc); - /* if the hinted ssk is still active, try to use it */ - if (likely(ack_hint)) { - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) - return; - } + if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) + mptcp_subflow_cleanup_rbuf(ssk); } - - /* otherwise pick the first active subflow */ - mptcp_for_each_subflow(msk, subflow) - if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) - return; } static bool mptcp_check_data_fin(struct sock *sk) @@ -616,7 +626,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, break; } } while (more_data_avail); - WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; return done; @@ -668,18 +677,19 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - mptcp_data_lock(sk); - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); + if (unlikely(ssk->sk_err)) { + if (!sock_owned_by_user(sk)) + __mptcp_error_report(sk); + else + set_bit(MPTCP_ERROR_REPORT, &msk->flags); + } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but @@ -688,7 +698,7 @@ static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - mptcp_data_unlock(sk); + return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) @@ -696,7 +706,6 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; - bool wake; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -705,28 +714,22 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - /* move_skbs_to_msk below can legitly clear the data_avail flag, - * but we will need later to properly woke the reader, cache its - * value - */ - wake = subflow->data_avail == MPTCP_SUBFLOW_DATA_AVAIL; - if (wake) - set_bit(MPTCP_DATA_READY, &msk->flags); - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; - /* over limit? can't append more skbs to msk */ + /* over limit? 
can't append more skbs to msk, Also, no need to wake-up*/ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) - goto wake; - - move_skbs_to_msk(msk, ssk); + return; -wake: - if (wake) + /* Wake-up the reader only for in-sequence data */ + mptcp_data_lock(sk); + if (move_skbs_to_msk(msk, ssk)) { + set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); + } + mptcp_data_unlock(sk); } static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) @@ -858,7 +861,7 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) sock_owned_by_me(sk); mptcp_for_each_subflow(msk, subflow) { - if (subflow->data_avail) + if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } @@ -894,22 +897,14 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(struct sock *sk, int size) +static int mptcp_wmem_with_overhead(int size) { - struct mptcp_sock *msk = mptcp_sk(sk); - int ret, skbs; - - ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); - skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; - if (skbs < msk->skb_tx_cache.qlen) - return ret; - - return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); + return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); } static void __mptcp_wmem_reserve(struct sock *sk, int size) { - int amount = mptcp_wmem_with_overhead(sk, size); + int amount = mptcp_wmem_with_overhead(size); struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); @@ -1204,49 +1199,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool mptcp_tx_cache_refill(struct sock *sk, int size, - struct sk_buff_head *skbs, int *total_ts) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sk_buff *skb; - int space_needed; - - if (unlikely(tcp_under_memory_pressure(sk))) { - mptcp_mem_reclaim_partial(sk); - - /* under pressure pre-allocate at most a single skb */ - if (msk->skb_tx_cache.qlen) - return true; - space_needed = msk->size_goal_cache; - } else { - space_needed = msk->tx_pending_data + size - - msk->skb_tx_cache.qlen * msk->size_goal_cache; - } - - while (space_needed > 0) { - skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); - if (unlikely(!skb)) { - /* under memory pressure, try to pass the caller a - * single skb to allow forward progress - */ - while (skbs->qlen > 1) { - skb = __skb_dequeue_tail(skbs); - *total_ts -= skb->truesize; - __kfree_skb(skb); - } - return skbs->qlen > 0; - } - - *total_ts += skb->truesize; - __skb_queue_tail(skbs, skb); - space_needed -= msk->size_goal_cache; - } - return true; -} - static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { - struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb; if (ssk->sk_tx_skb_cache) { @@ -1257,22 +1211,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) return true; } - skb = skb_peek(&msk->skb_tx_cache); - if (skb) { - if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - skb = __skb_dequeue(&msk->skb_tx_cache); - if (WARN_ON_ONCE(!skb)) - return false; - - mptcp_wmem_uncharge(sk, skb->truesize); - ssk->sk_tx_skb_cache = skb; - return true; - } - - /* over memory limit, no point to try to allocate a new skb */ - return false; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return false; @@ -1288,7 +1226,6 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) 
static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) { return !ssk->sk_tx_skb_cache && - !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && tcp_under_memory_pressure(sk); } @@ -1299,6 +1236,18 @@ static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); } +/* note: this always recompute the csum on the whole skb, even + * if we just appended a single frag. More status info needed + */ +static void mptcp_update_data_checksum(struct sk_buff *skb, int added) +{ + struct mptcp_ext *mpext = mptcp_get_ext(skb); + __wsum csum = ~csum_unfold(mpext->csum); + int offset = skb->len - added; + + mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1319,7 +1268,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); avail_size = info->size_goal; - msk->size_goal_cache = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb) { /* Limit the write to the size available in the @@ -1393,10 +1341,14 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += ret; mpext->frozen = 1; - ret = 0; + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); tcp_push_pending_frames(ssk); + return 0; } out: + if (READ_ONCE(msk->csum_enabled)) + mptcp_update_data_checksum(tail, ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } @@ -1664,7 +1616,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; - struct sk_buff_head skbs; bool dfrag_collapsed; size_t psize, offset; @@ -1697,16 +1648,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); total_ts = psize + frag_truesize; - __skb_queue_head_init(&skbs); - if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) - goto wait_for_memory; - if (!mptcp_wmem_alloc(sk, total_ts)) { - __skb_queue_purge(&skbs); + if (!mptcp_wmem_alloc(sk, total_ts)) goto wait_for_memory; - } - skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { mptcp_wmem_uncharge(sk, psize + frag_truesize); @@ -1763,7 +1708,7 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, timeo, - test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait); + test_bit(MPTCP_DATA_READY, &msk->flags), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1771,7 +1716,9 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len, int flags) + size_t len, int flags, + struct scm_timestamping_internal *tss, + int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; @@ -1791,6 +1738,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } } + if (MPTCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, tss); + *cmsg_flags |= MPTCP_CMSG_TS; + } + copied += count; if (count < data_len) { @@ -1954,7 +1906,9 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) 
__mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); - tcp_cleanup_rbuf(ssk, moved); + + if (unlikely(ssk->sk_err)) + __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); @@ -1967,7 +1921,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); - mptcp_cleanup_rbuf(msk); } if (ret) mptcp_check_data_fin((struct sock *)msk); @@ -1978,7 +1931,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); - int copied = 0; + struct scm_timestamping_internal tss; + int copied = 0, cmsg_flags = 0; int target; long timeo; @@ -2000,7 +1954,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2076,11 +2030,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, */ if (unlikely(__mptcp_move_skbs(msk))) set_bit(MPTCP_DATA_READY, &msk->flags); - } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { - /* data to read but mptcp_wait_data() cleared DATA_READY */ - set_bit(MPTCP_DATA_READY, &msk->flags); } + out_err: + if (cmsg_flags && copied >= 0) { + if (cmsg_flags & MPTCP_CMSG_TS) + tcp_recv_timestamp(msg, sk, &tss); + } + pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); @@ -2212,9 +2169,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk == msk->last_snd) msk->last_snd = NULL; - if (ssk == msk->ack_hint) - msk->ack_hint = NULL; - if (ssk == msk->first) msk->first = NULL; @@ -2286,13 +2240,14 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + bool slow; - lock_sock(tcp_sk); + slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { tcp_send_active_reset(tcp_sk, GFP_ATOMIC); tcp_set_state(tcp_sk, TCP_CLOSE); } - release_sock(tcp_sk); + unlock_sock_fast(tcp_sk, slow); } inet_sk_state_store(sk, TCP_CLOSE); @@ -2337,8 +2292,8 @@ static void __mptcp_retrans(struct sock *sk) /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; - info.limit = dfrag->already_sent; - while (info.sent < dfrag->already_sent) { + info.limit = READ_ONCE(msk->csum_enabled) ? 
dfrag->data_len : dfrag->already_sent; + while (info.sent < info.limit) { if (!mptcp_alloc_tx_skb(sk, ssk)) break; @@ -2350,9 +2305,11 @@ static void __mptcp_retrans(struct sock *sk) copied += ret; info.sent += ret; } - if (copied) + if (copied) { + dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + } mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -2420,17 +2377,15 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); - __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->wmem_reserved = 0; msk->rmem_released = 0; msk->tx_pending_data = 0; - msk->size_goal_cache = TCP_BASE_MSS; - msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_init(msk); @@ -2482,15 +2437,10 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - struct sk_buff *skb; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); - while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { - sk->sk_forward_alloc += skb->truesize; - kfree_skb(skb); - } } static void mptcp_cancel_work(struct sock *sk) @@ -2771,6 +2721,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); + if (mp_opt->csum_reqd) + WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; msk->snd_nxt = msk->write_seq; @@ -2944,6 +2896,11 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } + /* be sure to set the current sk state before tacking actions + * depending on sk_state + */ + if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) + __mptcp_set_connected(sk); if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) __mptcp_clean_una_wakeup(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0c6f99c67345..426ed80fe72f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -68,6 +68,8 @@ #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) + /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_HMAC_LEN 20 @@ -77,8 +79,9 @@ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) +#define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) -#define MPTCP_CAP_FLAG_MASK (0x3F) +#define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) @@ -109,6 +112,7 @@ #define MPTCP_ERROR_REPORT 8 #define MPTCP_RETRANSMIT 9 #define MPTCP_WORK_SYNC_SETSOCKOPT 10 +#define MPTCP_CONNECTED 11 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -124,6 +128,7 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; + __sum16 csum; u16 mp_capable : 1, mp_join : 1, fastclose : 1, @@ -133,7 +138,9 @@ struct mptcp_options_received { rm_addr : 1, mp_prio : 1, echo : 1, - backup : 1; + csum_reqd : 1, + backup : 1, + deny_join_id0 : 1; u32 token; u32 nonce; u64 thmac; @@ 
-188,6 +195,7 @@ struct mptcp_pm_data { bool work_pending; bool accept_addr; bool accept_subflow; + bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; @@ -234,15 +242,13 @@ struct mptcp_sock { bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ + bool csum_enabled; spinlock_t join_list_lock; - struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ int tx_pending_data; - int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -335,11 +341,20 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } +struct csum_pseudo_header { + __be64 data_seq; + __be32 subflow_seq; + __be16 data_len; + __sum16 csum; +}; + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, - backup : 1; + backup : 1, + csum_reqd : 1, + allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; @@ -362,7 +377,6 @@ mptcp_subflow_rsk(const struct request_sock *rsk) enum mptcp_data_avail { MPTCP_SUBFLOW_NODATA, MPTCP_SUBFLOW_DATA_AVAIL, - MPTCP_SUBFLOW_OOO_DATA }; struct mptcp_delegated_action { @@ -387,6 +401,8 @@ struct mptcp_subflow_context { u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; + __wsum map_data_csum; + u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, @@ -396,6 +412,8 @@ struct mptcp_subflow_context { pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, + map_csum_reqd : 1, + map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, @@ -525,6 +543,8 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su int mptcp_is_enabled(struct net *net); unsigned int mptcp_get_add_addr_timeout(struct net *net); +int mptcp_is_checksum_enabled(struct net *net); +int mptcp_allow_join_id0(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); @@ -576,10 +596,12 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sk_buff *skb, +void mptcp_get_options(const struct sock *sk, + const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void __mptcp_set_connected(struct sock *sk); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && @@ -594,6 +616,14 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); +static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) +{ + if (use_64bit) + return cur_seq; + + return __mptcp_expand_seq(old_seq, cur_seq); +} void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); @@ -627,6 +657,8 @@ static inline void mptcp_write_space(struct 
sock *sk) void mptcp_destroy_common(struct mptcp_sock *msk); +#define MPTCP_TOKEN_MAX_RETRIES 4 + void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { @@ -753,9 +785,6 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); - void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_all(struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a79798189599..092d1f635d27 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -140,6 +140,43 @@ static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); } +static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + sock_set_timestamp(sk, optname, !!val); + break; + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: + sock_set_timestamping(sk, optname, val); + break; + } + + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -164,6 +201,13 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, case SO_INCOMING_CPU: mptcp_so_incoming_cpu(msk, val); return 0; + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val); } return -ENOPROTOOPT; @@ -251,9 +295,23 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_MARK: case SO_INCOMING_CPU: case SO_DEBUG: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); case SO_LINGER: return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + /* No need to copy: only relevant for msk */ + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); case SO_NO_CHECK: case SO_DONTROUTE: case SO_BROADCAST: @@ -267,7 +325,24 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return 0; } - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); + /* SO_OOBINLINE is not supported, let's avoid the related mess + * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + * + * 
SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + * + * SO_PEEK_OFF is unsupported, as it is for plain TCP + * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows + * SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + * + * SO_ZEROCOPY is currently unsupported, TODO in sndmsg + * SO_TXTIME is currently unsupported + */ + + return -EOPNOTSUPP; } static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, @@ -299,72 +374,6 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, static bool mptcp_supported_sockopt(int level, int optname) { - if (level == SOL_SOCKET) { - switch (optname) { - case SO_DEBUG: - case SO_REUSEPORT: - case SO_REUSEADDR: - - /* the following ones need a better implementation, - * but are quite common we want to preserve them - */ - case SO_BINDTODEVICE: - case SO_SNDBUF: - case SO_SNDBUFFORCE: - case SO_RCVBUF: - case SO_RCVBUFFORCE: - case SO_KEEPALIVE: - case SO_PRIORITY: - case SO_LINGER: - case SO_TIMESTAMP_OLD: - case SO_TIMESTAMP_NEW: - case SO_TIMESTAMPNS_OLD: - case SO_TIMESTAMPNS_NEW: - case SO_TIMESTAMPING_OLD: - case SO_TIMESTAMPING_NEW: - case SO_RCVLOWAT: - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - case SO_MARK: - case SO_INCOMING_CPU: - case SO_BINDTOIFINDEX: - case SO_BUSY_POLL: - case SO_PREFER_BUSY_POLL: - case SO_BUSY_POLL_BUDGET: - - /* next ones are no-op for plain TCP */ - case SO_NO_CHECK: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_BSDCOMPAT: - case SO_PASSCRED: - case SO_PASSSEC: - case SO_RXQ_OVFL: - case SO_WIFI_STATUS: - case SO_NOFCS: - case SO_SELECT_ERR_QUEUE: - return true; - } - - /* SO_OOBINLINE is not supported, let's avoid the related mess */ - /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, - * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, - * we must be careful with subflows - */ - /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks - * explicitly the sk_protocol field - */ - /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ - /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ - /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, - * but likely needs careful design - */ - /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ - /* SO_TXTIME is currently unsupported */ - return false; - } if (level == SOL_IP) { switch (optname) { /* should work fine */ @@ -574,12 +583,12 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, pr_debug("msk=%p", msk); - if (!mptcp_supported_sockopt(level, optname)) - return -ENOPROTOOPT; - if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. 
It is up to the * MPTCP-level socket to configure the subflows until the subflow diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ef3d037f984a..66d0b1893d26 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -108,6 +108,8 @@ static void subflow_init_req(struct request_sock *req, const struct sock *sk_lis subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); + subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } @@ -150,7 +152,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -162,7 +164,7 @@ static int subflow_check_req(struct request_sock *req, } if (mp_opt.mp_capable && listener->request_mptcp) { - int err, retries = 4; + int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: @@ -247,7 +249,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk_listener, skb, &mp_opt); if (mp_opt.mp_capable && mp_opt.mp_join) return -EINVAL; @@ -371,6 +373,24 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } +void __mptcp_set_connected(struct sock *sk) +{ + if (sk->sk_state == TCP_SYN_SENT) { + inet_sk_state_store(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + } +} + +static void mptcp_set_connected(struct sock *sk) +{ + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_set_connected(sk); + else + set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -379,10 +399,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) @@ -394,7 +410,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { if (!mp_opt.mp_capable) { MPTCP_INC_STATS(sock_net(sk), @@ -404,6 +420,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } + if (mp_opt.csum_reqd) + WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); + if (mp_opt.deny_join_id0) + WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = mp_opt.sndr_key; @@ -411,6 +431,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -430,15 +451,15 @@ static void 
subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto do_reset; } + if (!mptcp_finish_join(sk)) + goto do_reset; + subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); - if (!mptcp_finish_join(sk)) - goto do_reset; - subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); @@ -451,6 +472,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_set_connected(parent); } return; @@ -558,6 +580,7 @@ static void mptcp_sock_destruct(struct sock *sk) static void mptcp_force_close(struct sock *sk) { + /* the msk is not yet exposed to user-space */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); } @@ -638,7 +661,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_capable) { fallback = true; goto create_child; @@ -648,7 +671,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(skb, &mp_opt); + mptcp_get_options(sk, skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); @@ -775,19 +798,10 @@ enum mapping_status { MAPPING_DUMMY }; -static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) -{ - if ((u32)seq == (u32)old_seq) - return old_seq; - - /* Assume map covers data not mapped yet. */ - return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); -} - -static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) +static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { - WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d", - ssn, subflow->map_subflow_seq, subflow->map_data_len); + pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) @@ -812,22 +826,104 @@ static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) /* Mapping covers data later in the subflow stream, * currently unsupported. */ - warn_bad_map(subflow, ssn); + dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ - warn_bad_map(subflow, ssn + skb->len); + dbg_bad_map(subflow, ssn); return false; } return true; } +static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, + bool csum_reqd) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct csum_pseudo_header header; + u32 offset, seq, delta; + __wsum csum; + int len; + + if (!csum_reqd) + return MAPPING_OK; + + /* mapping already validated on previous traversal */ + if (subflow->map_csum_len == subflow->map_data_len) + return MAPPING_OK; + + /* traverse the receive queue, ensuring it contains a full + * DSS mapping and accumulating the related csum. 
+ * Preserve the accoumlate csum across multiple calls, to compute + * the csum only once + */ + delta = subflow->map_data_len - subflow->map_csum_len; + for (;;) { + seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; + offset = seq - TCP_SKB_CB(skb)->seq; + + /* if the current skb has not been accounted yet, csum its contents + * up to the amount covered by the current DSS + */ + if (offset < skb->len) { + __wsum csum; + + len = min(skb->len - offset, delta); + csum = skb_checksum(skb, offset, len, 0); + subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, + subflow->map_csum_len); + + delta -= len; + subflow->map_csum_len += len; + } + if (delta == 0) + break; + + if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { + /* if this subflow is closed, the partial mapping + * will be never completed; flush the pending skbs, so + * that subflow_sched_work_if_closed() can kick in + */ + if (unlikely(ssk->sk_state == TCP_CLOSE)) + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + + /* not enough data to validate the csum */ + return MAPPING_EMPTY; + } + + /* the DSS mapping for next skbs will be validated later, + * when a get_mapping_status call will process such skb + */ + skb = skb->next; + } + + /* note that 'map_data_len' accounts only for the carried data, does + * not include the eventual seq increment due to the data fin, + * while the pseudo header requires the original DSS data len, + * including that + */ + header.data_seq = cpu_to_be64(subflow->map_seq); + header.subflow_seq = htonl(subflow->map_subflow_seq); + header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); + header.csum = 0; + + csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); + if (unlikely(csum_fold(csum))) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); + return subflow->mp_join ? 
MAPPING_INVALID : MAPPING_DUMMY; + } + + return MAPPING_OK; +} + static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -907,22 +1003,17 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len--; } - if (!mpext->dsn64) { - map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, - mpext->data_seq); - pr_debug("expanded seq=%llu", subflow->map_seq); - } else { - map_seq = mpext->data_seq; - } + map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && - subflow->map_data_len == data_len) { + subflow->map_data_len == data_len && + subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + goto validate_csum; } /* If this skb data are fully covered by the current mapping, @@ -934,27 +1025,41 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } /* will validate the next map after consuming the current one */ - return MAPPING_OK; + goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; + subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u", + subflow->map_csum_reqd = mpext->csum_reqd; + subflow->map_csum_len = 0; + subflow->map_data_csum = csum_unfold(mpext->csum); + + /* Cfr RFC 8684 Section 3.3.0 */ + if (unlikely(subflow->map_csum_reqd != csum_reqd)) + return MAPPING_INVALID; + + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", subflow->map_seq, subflow->map_subflow_seq, - subflow->map_data_len); + subflow->map_data_len, subflow->map_csum_reqd, + subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ - if (!validate_mapping(ssk, skb)) + if (!validate_mapping(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; + } skb_ext_del(skb, SKB_EXT_MPTCP); - return MAPPING_OK; + +validate_csum: + return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, @@ -1000,7 +1105,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); if (subflow->data_avail) return true; @@ -1039,18 +1144,13 @@ static bool subflow_check_data_avail(struct sock *ssk) ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, ack_seq); - if (ack_seq == old_ack) { - subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; - break; - } else if (after64(ack_seq, old_ack)) { - subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA; - break; + if (unlikely(before64(ack_seq, old_ack))) { + mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); + continue; } - /* only accept in-sequence mapping. 
Old values are spurious - * retransmission - */ - mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + break; } return true; @@ -1065,12 +1165,11 @@ fallback: * subflow_error_report() will introduce the appropriate barriers */ ssk->sk_err = EBADMSG; - ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); return false; } @@ -1080,7 +1179,7 @@ fallback: subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; - subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL; + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } @@ -1092,7 +1191,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; - subflow->data_avail = 0; + WRITE_ONCE(subflow->data_avail, 0); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, @@ -1120,41 +1219,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) *full_space = tcp_full_space(sk); } -static void subflow_data_ready(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - u16 state = 1 << inet_sk_state_load(sk); - struct sock *parent = subflow->conn; - struct mptcp_sock *msk; - - msk = mptcp_sk(parent); - if (state & TCPF_LISTEN) { - /* MPJ subflow are removed from accept queue before reaching here, - * avoid stray wakeups - */ - if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) - return; - - set_bit(MPTCP_DATA_READY, &msk->flags); - parent->sk_data_ready(parent); - return; - } - - WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && - !subflow->mp_join && !(state & TCPF_CLOSE)); - - if (mptcp_subflow_data_available(sk)) - mptcp_data_ready(parent, sk); -} - -static void subflow_write_space(struct sock *ssk) -{ - struct sock *sk = mptcp_subflow_ctx(ssk)->conn; - - mptcp_propagate_sndbuf(sk, ssk); - mptcp_write_space(sk); -} - void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; @@ -1178,7 +1242,7 @@ void __mptcp_error_report(struct sock *sk) /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); break; } } @@ -1195,6 +1259,43 @@ static void subflow_error_report(struct sock *ssk) mptcp_data_unlock(sk); } +static void subflow_data_ready(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + u16 state = 1 << inet_sk_state_load(sk); + struct sock *parent = subflow->conn; + struct mptcp_sock *msk; + + msk = mptcp_sk(parent); + if (state & TCPF_LISTEN) { + /* MPJ subflow are removed from accept queue before reaching here, + * avoid stray wakeups + */ + if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) + return; + + set_bit(MPTCP_DATA_READY, &msk->flags); + parent->sk_data_ready(parent); + return; + } + + WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && + !subflow->mp_join && !(state & TCPF_CLOSE)); + + if (mptcp_subflow_data_available(sk)) + mptcp_data_ready(parent, sk); + else if (unlikely(sk->sk_err)) + subflow_error_report(sk); +} + +static void subflow_write_space(struct sock *ssk) +{ + struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + + 
mptcp_propagate_sndbuf(sk, ssk); + mptcp_write_space(sk); +} + static struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { @@ -1493,10 +1594,7 @@ static void subflow_state_change(struct sock *sk) mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } + mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection @@ -1505,6 +1603,8 @@ static void subflow_state_change(struct sock *sk) */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); + else if (unlikely(sk->sk_err)) + subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 8f0270a780ce..a98e554b034f 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -33,7 +33,6 @@ #include <net/mptcp.h> #include "protocol.h" -#define TOKEN_MAX_RETRIES 4 #define TOKEN_MAX_CHAIN_LEN 4 struct token_bucket { @@ -153,12 +152,9 @@ int mptcp_token_new_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - int retries = TOKEN_MAX_RETRIES; + int retries = MPTCP_TOKEN_MAX_RETRIES; struct token_bucket *bucket; - pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", - sk, subflow->local_key, subflow->token, subflow->idsn); - again: mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, &subflow->idsn); @@ -172,6 +168,9 @@ again: goto again; } + pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", + sk, subflow->local_key, subflow->token, subflow->idsn); + WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; |
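
The DSS checksum that the new validate_data_csum() hunk verifies follows RFC 8684: a standard Internet (one's complement) checksum computed over a pseudo-header (64-bit data sequence number, relative subflow sequence number, data-level length, zeroed checksum field) plus the mapped payload. The kernel accumulates the payload part incrementally with skb_checksum() and checks the result with csum_fold(); the user-space sketch below shows the same computation. The helper names (dss_csum, dss_csum_valid, csum_add, csum_fold32) and the open-coded checksum are illustrative assumptions, not kernel APIs.

    /*
     * Minimal user-space sketch of the RFC 8684 DSS checksum, mirroring
     * the pseudo-header used by validate_data_csum() above.  Assumed
     * helpers for illustration only.
     */
    #include <stdint.h>
    #include <stddef.h>
    #include <endian.h>     /* htobe64() */
    #include <arpa/inet.h>  /* htonl(), htons() */

    struct dss_pseudo_header {
        uint64_t data_seq;      /* full 64-bit data sequence number */
        uint32_t subflow_seq;   /* relative subflow sequence number */
        uint16_t data_len;      /* data-level length, incl. a DATA_FIN */
        uint16_t csum;          /* zero on compute, peer's value on verify */
    } __attribute__((packed));

    /* accumulate the one's complement sum of an arbitrary buffer */
    static uint32_t csum_add(uint32_t sum, const void *buf, size_t len)
    {
        const uint8_t *p = buf;

        while (len > 1) {
            sum += ((uint32_t)p[0] << 8) | p[1];
            p += 2;
            len -= 2;
        }
        if (len)
            sum += (uint32_t)p[0] << 8;     /* pad odd trailing byte */
        return sum;
    }

    /* fold to 16 bits and complement, as csum_fold() does in the kernel */
    static uint16_t csum_fold32(uint32_t sum)
    {
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    /* sender side: checksum to place in the DSS option (host order) */
    static uint16_t dss_csum(uint64_t data_seq, uint32_t subflow_seq,
                             uint16_t data_len, const void *data, size_t len)
    {
        struct dss_pseudo_header hdr = {
            .data_seq    = htobe64(data_seq),
            .subflow_seq = htonl(subflow_seq),
            .data_len    = htons(data_len),
            .csum        = 0,
        };
        uint32_t sum = csum_add(0, &hdr, sizeof(hdr));

        return csum_fold32(csum_add(sum, data, len));
    }

    /* receiver side: mirrors the csum_fold(csum) test in the hunk above,
     * the sum over the pseudo-header (peer checksum included) plus the
     * data must fold to zero
     */
    static int dss_csum_valid(uint64_t data_seq, uint32_t subflow_seq,
                              uint16_t data_len, uint16_t peer_csum,
                              const void *data, size_t len)
    {
        struct dss_pseudo_header hdr = {
            .data_seq    = htobe64(data_seq),
            .subflow_seq = htonl(subflow_seq),
            .data_len    = htons(data_len),
            .csum        = htons(peer_csum),
        };
        uint32_t sum = csum_add(0, &hdr, sizeof(hdr));

        return csum_fold32(csum_add(sum, data, len)) == 0;
    }

Note that, as in the kernel comment above, data_len is the data-level length carried in the DSS option (including an eventual DATA_FIN), while only the actual payload bytes are summed via data/len.

Similarly, get_mapping_status() now expands a possibly 32-bit data sequence number against the current msk->ack_seq via mptcp_expand_seq() instead of the removed expand_seq() helper. A minimal sketch of that kind of expansion (not the kernel's exact implementation) is the usual "pick the 64-bit candidate closest to the last known data-level ack":

    static uint64_t expand_dsn(uint64_t last_ack, uint32_t dsn32)
    {
        uint64_t cand = (last_ack & ~0xffffffffULL) | dsn32;

        if (last_ack > cand && last_ack - cand > (1ULL << 31))
            cand += 1ULL << 32;         /* mapping past a 32-bit wrap */
        else if (cand > last_ack && cand - last_ack > (1ULL << 31) &&
                 cand >= (1ULL << 32))
            cand -= 1ULL << 32;         /* mapping behind last_ack */

        return cand;
    }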