diff options
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 502 |
1 files changed, 267 insertions, 235 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3613489eb6e3..933b257eee02 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -67,11 +67,11 @@ static bool mptcp_is_tcpsk(struct sock *sk) * Hand the socket over to tcp so all further socket ops * bypass mptcp. */ - sock->ops = &inet_stream_ops; + WRITE_ONCE(sock->ops, &inet_stream_ops); return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - sock->ops = &inet6_stream_ops; + WRITE_ONCE(sock->ops, &inet6_stream_ops); return true; #endif } @@ -90,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; + msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); - WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -101,6 +101,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* This is the first subflow, always with id 0 */ subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); + iput(SOCK_INODE(ssock)); return 0; } @@ -108,7 +109,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. */ -struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) +struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; @@ -116,10 +117,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); - if (!msk->subflow) { - if (msk->first) - return ERR_PTR(-EINVAL); - + if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); @@ -127,7 +125,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) mptcp_sockopt_sync(msk, msk->first); } - return msk->subflow; + return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) @@ -1368,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; @@ -1379,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) { - if (!msk->first) - return NULL; - return __tcp_can_send(msk->first) && - sk_stream_memory_free(msk->first) ? msk->first : NULL; - } - - /* re-use last subflow, if the burst allow that */ - if (msk->last_snd && msk->snd_burst > 0 && - sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_set_timeout(sk); - return msk->last_snd; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1448,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); - if (!burst) { - msk->last_snd = NULL; + if (!burst) return ssk; - } subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); - msk->last_snd = ssk; msk->snd_burst = burst; return ssk; } @@ -1501,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } -void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static int __subflow_push_pending(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info = { - .flags = flags, - }; - bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len; + int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; + info->sent = dfrag->already_sent; + info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); - - /* First check. If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; - - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); + err = copied ? : ret; goto out; } - do_check_data_fin = true; - info.sent += ret; + info->sent += ret; + copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + if (msk->snd_burst <= 0 || + !sk_stream_memory_free(ssk) || + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { + err = copied; + goto out; + } + mptcp_set_timeout(sk); + } + err = copied; + +out: + return err; +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + bool do_check_data_fin = false; + int push_count = 1; + + while (mptcp_send_head(sk) && (push_count > 0)) { + struct mptcp_subflow_context *subflow; + int ret = 0; + + if (mptcp_sched_get_send(msk)) + break; + + push_count = 0; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + + prev_ssk = ssk; + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk != prev_ssk) { + /* First check. If the ssk has changed since + * the last round, release prev_ssk + */ + if (prev_ssk) + mptcp_push_release(prev_ssk, &info); + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + lock_sock(ssk); + } + + push_count++; + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret != -EAGAIN || + (1 << ssk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) + push_count--; + continue; + } + do_check_data_fin = true; + } + } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); -out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -1572,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; - struct mptcp_data_frag *dfrag; + bool keep_pushing = true; struct sock *xmit_ssk; - int len, copied = 0; + int copied = 0; info.flags = 0; - while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; - len = dfrag->data_len - dfrag->already_sent; - while (len > 0) { - int ret = 0; + while (mptcp_send_head(sk) && keep_pushing) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + int ret = 0; - /* check for a different subflow usage only after - * spooling the first chunk of data - */ - xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); - if (!xmit_ssk) - goto out; - if (xmit_ssk != ssk) { - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), - MPTCP_DELEGATE_SEND); - goto out; - } - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + /* check for a different subflow usage only after + * spooling the first chunk of data + */ + if (first) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + first = false; if (ret <= 0) - goto out; + break; + copied += ret; + continue; + } + + if (mptcp_sched_get_send(msk)) + goto out; - info.sent += ret; + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) + keep_pushing = false; copied += ret; - len -= ret; - first = false; + } - mptcp_update_post_push(msk, dfrag, ret); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + xmit_ssk = mptcp_subflow_tcp_sock(subflow); + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(subflow, + MPTCP_DELEGATE_SEND); + keep_pushing = false; + } + } } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } out: @@ -1642,7 +1669,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; struct sock *ssk; int ret; @@ -1653,9 +1679,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; @@ -1689,7 +1715,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (!mptcp_disconnect(sk, 0)) sk->sk_socket->state = SS_UNCONNECTED; } - inet_sk(sk)->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); return ret; } @@ -1707,7 +1733,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); - if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) { + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || + msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); @@ -1881,6 +1908,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; @@ -1911,9 +1939,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); + scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; + msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; @@ -1922,8 +1952,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; @@ -1932,18 +1962,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < advmss) - rcvmem += 128; - - do_div(rcvwin, advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = tcp_win_from_space(sk, rcvbuf); + window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we @@ -2202,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. */ -static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) - return NULL; - mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2243,14 +2263,6 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) return min_stale_count > 1 ? backup : NULL; } -static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) -{ - if (msk->subflow) { - iput(SOCK_INODE(msk->subflow)); - WRITE_ONCE(msk->subflow, NULL); - } -} - bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; @@ -2329,13 +2341,13 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out_release; } - dispose_it = !msk->subflow || ssk != msk->subflow->sk; + dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) { + if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_disconnect() path, * to generate the egress reset */ @@ -2350,7 +2362,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); - msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2383,9 +2394,6 @@ out_release: WRITE_ONCE(msk->first, NULL); out: - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (need_push) __mptcp_push_pending(sk, 0); } @@ -2502,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - size_t copied = 0; struct sock *ssk; - int ret; + int ret, err; + u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ - ssk = mptcp_subflow_get_retrans(msk); + err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2530,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - if (!ssk) + if (err) goto reset_timer; - lock_sock(ssk); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + u16 copied = 0; - /* limit retransmission to the bytes already sent on some subflows */ - info.sent = 0; - info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; - while (info.sent < info.limit) { - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - break; + mptcp_subflow_set_scheduled(subflow, false); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); - copied += ret; - info.sent += ret; - } - if (copied) { - dfrag->already_sent = max(dfrag->already_sent, info.sent); - msk->bytes_retrans += copied; - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, - info.size_goal); - WRITE_ONCE(msk->allow_infinite_fallback, false); + ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : + dfrag->already_sent; + while (info.sent < info.limit) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + info.sent += ret; + } + if (copied) { + len = max(copied, len); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); + } + + release_sock(ssk); + } } - release_sock(ssk); + msk->bytes_retrans += len; + dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); @@ -2663,7 +2685,7 @@ unlock: sock_put(sk); } -static int __mptcp_init_sock(struct sock *sk) +static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2690,8 +2712,6 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); - - return 0; } static void mptcp_ca_reset(struct sock *sk) @@ -2711,9 +2731,7 @@ static int mptcp_init_sock(struct sock *sk) struct net *net = sock_net(sk); int ret; - ret = __mptcp_init_sock(sk); - if (ret) - return ret; + __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; @@ -2721,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; + ret = mptcp_init_sched(mptcp_sk(sk), + mptcp_sched_find(mptcp_get_scheduler(net))); + if (ret) + return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2866,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk) mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; + mptcp_release_sched(msk); sk->sk_prot->destroy(sk); @@ -3055,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); - msk->last_snd = NULL; WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->push_pending = 0; @@ -3111,7 +3134,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - WRITE_ONCE(msk->subflow, NULL); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) @@ -3122,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; @@ -3175,25 +3198,17 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } -static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, +static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, bool kern) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *listener; struct sock *newsk; - listener = READ_ONCE(msk->subflow); - if (WARN_ON_ONCE(!listener)) { - *err = -EINVAL; - return NULL; - } - - pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); - newsk = inet_csk_accept(listener->sk, flags, err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = inet_csk_accept(ssk, flags, err, kern); if (!newsk) return NULL; - pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -3210,9 +3225,9 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, } newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { - MPTCP_INC_STATS(sock_net(sk), + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } @@ -3253,10 +3268,8 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - /* clears msk->subflow, allowing the following to close - * even the initial subflow - */ - mptcp_dispose_initial_subflow(msk); + /* allow the following to close even the initial subflow */ + msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } @@ -3328,7 +3341,7 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (unlikely(&msk->cb_flags)) { + if (unlikely(msk->cb_flags)) { /* be sure to set the current sk state before tacking actions * depending on sk_state, that is processing MPTCP_ERROR_REPORT */ @@ -3336,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); - if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) - msk->last_snd = NULL; } __mptcp_update_rmem(sk); @@ -3406,14 +3417,12 @@ static void mptcp_unhash(struct sock *sk) static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; - ssock = msk->subflow; - pr_debug("msk=%p, subflow=%p", msk, ssock); - if (WARN_ON_ONCE(!ssock)) + pr_debug("msk=%p, ssk=%p", msk, msk->first); + if (WARN_ON_ONCE(!msk->first)) return -EINVAL; - return inet_csk_get_port(ssock->sk, snum); + return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) @@ -3588,25 +3597,24 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int err = -EINVAL; + struct sock *ssk; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); + subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } if (likely(!__mptcp_check_fallback(msk))) @@ -3615,25 +3623,42 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ - if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); - else - err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); - inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + if (!msk->fastopening) + lock_sock(ssk); + + /* the following mirrors closely a very small chunk of code from + * __inet_stream_connect() + */ + if (ssk->sk_state != TCP_CLOSE) + goto out; + + if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { + err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); + if (err) + goto out; + } + + err = ssk->sk_prot->connect(ssk, uaddr, addr_len); + if (err < 0) + goto out; + + inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); + +out: + if (!msk->fastopening) + release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ - if (unlikely(err && err != -EINPROGRESS)) { - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + if (unlikely(err)) { + /* avoid leaving a dangling token in an unconnected socket */ + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_CLOSE); return err; } - mptcp_copy_inaddrs(sk, ssock->sk); - - /* silence EINPROGRESS and let the caller inet_stream_connect - * handle the connection in progress - */ + mptcp_copy_inaddrs(sk, ssk); return 0; } @@ -3674,22 +3699,27 @@ static struct proto mptcp_prot = { static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct socket *ssock; - int err; + struct sock *ssk, *sk = sock->sk; + int err = -EINVAL; - lock_sock(sock->sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + lock_sock(sk); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - err = ssock->ops->bind(ssock, uaddr, addr_len); + if (sk->sk_family == AF_INET) + err = inet_bind_sk(ssk, uaddr, addr_len); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (sk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, uaddr, addr_len); +#endif if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + mptcp_copy_inaddrs(sk, ssk); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } @@ -3697,7 +3727,7 @@ static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; - struct socket *ssock; + struct sock *ssk; int err; pr_debug("msk=%p", msk); @@ -3708,25 +3738,26 @@ static int mptcp_listen(struct socket *sock, int backlog) if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); - err = ssock->ops->listen(ssock, backlog); - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + lock_sock(ssk); + err = __inet_listen_sk(ssk, backlog); + release_sock(ssk); + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - mptcp_copy_inaddrs(sk, ssock->sk); + mptcp_copy_inaddrs(sk, ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } - mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); - unlock: release_sock(sk); return err; @@ -3736,8 +3767,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct socket *ssock; - struct sock *newsk; + struct sock *ssk, *newsk; int err; pr_debug("msk=%p", msk); @@ -3745,11 +3775,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ - ssock = READ_ONCE(msk->subflow); - if (!ssock) + ssk = READ_ONCE(msk->first); + if (!ssk) return -EINVAL; - newsk = mptcp_accept(sock->sk, flags, &err, kern); + newsk = mptcp_accept(ssk, flags, &err, kern); if (!newsk) return err; @@ -3776,11 +3806,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ - if (msk->first && - unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); - if (unlikely(list_empty(&msk->conn_list))) + if (unlikely(list_is_singular(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } } @@ -3819,12 +3848,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - struct socket *ssock = READ_ONCE(msk->subflow); + struct sock *ssk = READ_ONCE(msk->first); - if (WARN_ON_ONCE(!ssock || !ssock->sk)) + if (WARN_ON_ONCE(!ssk)) return 0; - return inet_csk_listen_poll(ssock->sk); + return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); @@ -3839,7 +3868,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); - } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } @@ -3935,6 +3965,7 @@ void __init mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) @@ -3988,6 +4019,7 @@ int __init mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) |