diff options
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r-- | net/mptcp/protocol.c | 161 |
1 files changed, 119 insertions, 42 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f8897a70c11d..f599ad44ed24 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -662,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { - /* if no data is found, a racing workqueue/recvmsg - * already processed the new data, stop here or we - * can enter an infinite loop + /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), + * a different CPU can have already processed the pending + * data, stop here or we can enter an infinite loop */ if (!moved) done = true; @@ -672,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } if (__mptcp_check_fallback(msk)) { - /* if we are running under the workqueue, TCP could have - * collapsed skbs between dummy map creation and now - * be sure to adjust the size + /* Under fallback skbs have no MPTCP extension and TCP could + * collapse them between the dummy map creation and the + * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; @@ -1544,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; + bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len, copied = 0; + int len; while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@ -1580,8 +1581,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) goto out; } + do_check_data_fin = true; info.sent += ret; - copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); @@ -1597,7 +1598,7 @@ out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - if (copied) + if (do_check_data_fin) __mptcp_check_send_data_fin(sk); } @@ -1676,6 +1677,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; + struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1689,14 +1691,39 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + struct sock *ssk = ssock->sk; + int copied_syn = 0; + + lock_sock(ssk); + + ret = tcp_sendmsg_fastopen(ssk, msg, &copied_syn, len, NULL); + copied += copied_syn; + if (ret == -EINPROGRESS && copied_syn > 0) { + /* reflect the new state on the MPTCP socket */ + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + release_sock(ssk); + goto out; + } else if (ret) { + release_sock(ssk); + goto do_error; + } + release_sock(ssk); + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) - goto out; + goto do_error; } + ret = -EPIPE; + if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) + goto do_error; + pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { @@ -1705,11 +1732,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) bool dfrag_collapsed; size_t psize, offset; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ @@ -1741,7 +1763,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { ret = -EFAULT; - goto out; + goto do_error; } /* data successfully copied into the write queue */ @@ -1773,7 +1795,7 @@ wait_for_memory: __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) - goto out; + goto do_error; } if (copied) @@ -1781,7 +1803,14 @@ wait_for_memory: out: release_sock(sk); - return copied ? : ret; + return copied; + +do_error: + if (copied) + goto out; + + copied = sk_stream_error(sk, msg->msg_flags, ret); + goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, @@ -2284,8 +2313,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) + if (flags & MPTCP_CF_FASTCLOSE) { + /* be sure to force the tcp_disconnect() path, + * to generate the egress reset + */ + ssk->sk_lingertime = 0; + sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; + } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2363,7 +2398,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) might_sleep(); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (inet_sk_state_load(ssk) != TCP_CLOSE) @@ -2406,7 +2441,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_token_destroy(msk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; @@ -2418,12 +2453,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) unlock_sock_fast(tcp_sk, slow); } + /* Mirror the tcp_reset() error propagation */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); - mptcp_close_wake_up(sk); + /* the calling mptcp_worker will properly destroy the socket */ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) @@ -2529,6 +2583,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) mptcp_reset_timeout(msk, 0); } +static void mptcp_do_fastclose(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), + subflow, MPTCP_CF_FASTCLOSE); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2557,11 +2621,15 @@ static void mptcp_worker(struct work_struct *work) * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ - if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { - inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_destroy_sock(sk); - goto unlock; + if (sock_flag(sk, SOCK_DEAD)) { + if (mptcp_check_close_timeout(sk)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; + } } if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2802,6 +2870,18 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; @@ -2815,8 +2895,13 @@ bool __mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_close_state(sk)) + if (mptcp_check_readable(msk)) { + /* the msk has read data, do the MPTCP equivalent of TCP reset */ + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); + } sk_stream_wait_close(sk, timeout); @@ -3063,7 +3148,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) + mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ @@ -3535,6 +3620,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); + inet_sk(sock->sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; sock->state = ssock->state; /* on successful connect, the msk state will be moved to established by @@ -3632,18 +3718,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, return err; } -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) -{ - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. - */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; -} - static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -3685,13 +3759,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* cf tcp_poll() note about TFO */ + mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (sk->sk_err) mask |= EPOLLERR; |