Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c | 476
1 file changed, 314 insertions(+), 162 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 54613f5b7521..f60f01b14fac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -22,6 +22,7 @@
 #endif
 #include <net/mptcp.h>
 #include <net/xfrm.h>
+#include <asm/ioctls.h>
 #include "protocol.h"
 #include "mib.h"

@@ -46,9 +47,10 @@ struct mptcp_skb_cb {

 enum {
 	MPTCP_CMSG_TS = BIT(0),
+	MPTCP_CMSG_INQ = BIT(1),
 };

-static struct percpu_counter mptcp_sockets_allocated;
+static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;

 static void __mptcp_destroy_sock(struct sock *sk);
 static void __mptcp_check_send_data_fin(struct sock *sk);
@@ -738,6 +740,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
 				 delta);
 			MPTCP_SKB_CB(skb)->offset += delta;
+			MPTCP_SKB_CB(skb)->map_seq += delta;
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
 		}
 		msk->ack_seq = end_seq;
@@ -760,7 +763,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 		if (!sock_owned_by_user(sk))
 			__mptcp_error_report(sk);
 		else
-			set_bit(MPTCP_ERROR_REPORT, &msk->flags);
+			__set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
 	}

 	/* If the moves have caught up with the DATA_FIN sequence number
@@ -805,47 +808,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	mptcp_data_unlock(sk);
 }

-static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
+static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
 {
-	struct mptcp_subflow_context *subflow;
-	bool ret = false;
+	struct sock *sk = (struct sock *)msk;

-	if (likely(list_empty(&msk->join_list)))
+	if (sk->sk_state != TCP_ESTABLISHED)
 		return false;

-	spin_lock_bh(&msk->join_list_lock);
-	list_for_each_entry(subflow, &msk->join_list, node) {
-		u32 sseq = READ_ONCE(subflow->setsockopt_seq);
-
-		mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
-		if (READ_ONCE(msk->setsockopt_seq) != sseq)
-			ret = true;
-	}
-	list_splice_tail_init(&msk->join_list, &msk->conn_list);
-	spin_unlock_bh(&msk->join_list_lock);
-
-	return ret;
-}
-
-void __mptcp_flush_join_list(struct mptcp_sock *msk)
-{
-	if (likely(!mptcp_do_flush_join_list(msk)))
-		return;
+	/* attach to msk socket only after we are sure we will deal with it
+	 * at close time
+	 */
+	if (sk->sk_socket && !ssk->sk_socket)
+		mptcp_sock_graft(ssk, sk->sk_socket);

-	if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
-		mptcp_schedule_work((struct sock *)msk);
+	mptcp_propagate_sndbuf((struct sock *)msk, ssk);
+	mptcp_sockopt_sync_locked(msk, ssk);
+	return true;
 }

-static void mptcp_flush_join_list(struct mptcp_sock *msk)
+static void __mptcp_flush_join_list(struct sock *sk)
 {
-	bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
-
-	might_sleep();
+	struct mptcp_subflow_context *tmp, *subflow;
+	struct mptcp_sock *msk = mptcp_sk(sk);

-	if (!mptcp_do_flush_join_list(msk) && !sync_needed)
-		return;
+	list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool slow = lock_sock_fast(ssk);

-	mptcp_sockopt_sync_all(msk);
+		list_move_tail(&subflow->node, &msk->conn_list);
+		if (!__mptcp_finish_join(msk, ssk))
+			mptcp_subflow_reset(ssk);
+		unlock_sock_fast(ssk, slow);
+	}
 }

 static bool mptcp_timer_pending(struct sock *sk)
@@ -972,7 +966,9 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk)

 	lockdep_assert_held_once(&sk->sk_lock.slock);

-	__mptcp_rmem_reclaim(sk, reclaimable - 1);
+	if (reclaimable > SK_MEM_QUANTUM)
+		__mptcp_rmem_reclaim(sk, reclaimable - 1);
+
 	sk_mem_reclaim_partial(sk);
 }

@@ -1369,7 +1365,7 @@ out:

 struct subflow_send_info {
 	struct sock *ssk;
-	u64 ratio;
+	u64 linger_time;
 };

 void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1394,20 +1390,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
 	return __mptcp_subflow_active(subflow);
 }

+#define SSK_MODE_ACTIVE	0
+#define SSK_MODE_BACKUP	1
+#define SSK_MODE_MAX	2
+
 /* implement the mptcp packet scheduler;
  * returns the subflow that will transmit the next DSS
  * additionally updates the rtx timeout
  */
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
-	struct subflow_send_info send_info[2];
+	struct subflow_send_info send_info[SSK_MODE_MAX];
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
+	u32 pace, burst, wmem;
 	int i, nr_active = 0;
 	struct sock *ssk;
+	u64 linger_time;
 	long tout = 0;
-	u64 ratio;
-	u32 pace;

 	sock_owned_by_me(sk);

@@ -1426,10 +1426,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	}

 	/* pick the subflow with the lower wmem/wspace ratio */
-	for (i = 0; i < 2; ++i) {
+	for (i = 0; i < SSK_MODE_MAX; ++i) {
 		send_info[i].ssk = NULL;
-		send_info[i].ratio = -1;
+		send_info[i].linger_time = -1;
 	}
+
 	mptcp_for_each_subflow(msk, subflow) {
 		trace_mptcp_subflow_get_send(subflow);
 		ssk = mptcp_subflow_tcp_sock(subflow);
@@ -1438,34 +1439,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 		tout = max(tout, mptcp_timeout_from_subflow(subflow));
 		nr_active += !subflow->backup;
-		if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
-			continue;
-
-		pace = READ_ONCE(ssk->sk_pacing_rate);
-		if (!pace)
-			continue;
+		pace = subflow->avg_pacing_rate;
+		if (unlikely(!pace)) {
+			/* init pacing rate from socket */
+			subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+			pace = subflow->avg_pacing_rate;
+			if (!pace)
+				continue;
+		}

-		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
-				pace);
-		if (ratio < send_info[subflow->backup].ratio) {
+		linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+		if (linger_time < send_info[subflow->backup].linger_time) {
 			send_info[subflow->backup].ssk = ssk;
-			send_info[subflow->backup].ratio = ratio;
+			send_info[subflow->backup].linger_time = linger_time;
 		}
 	}
 	__mptcp_set_timeout(sk, tout);

 	/* pick the best backup if no other subflow is active */
 	if (!nr_active)
-		send_info[0].ssk = send_info[1].ssk;
-
-	if (send_info[0].ssk) {
-		msk->last_snd = send_info[0].ssk;
-		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
-				       tcp_sk(msk->last_snd)->snd_wnd);
-		return msk->last_snd;
-	}
+		send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+	/* According to the blest algorithm, to avoid HoL blocking for the
+	 * faster flow, we need to:
+	 * - estimate the faster flow linger time
+	 * - use the above to estimate the amount of byte transferred
+	 *   by the faster flow
+	 * - check that the amount of queued data is greter than the above,
+	 *   otherwise do not use the picked, slower, subflow
+	 * We select the subflow with the shorter estimated time to flush
+	 * the queued mem, which basically ensure the above. We just need
+	 * to check that subflow has a non empty cwin.
+	 */
+	ssk = send_info[SSK_MODE_ACTIVE].ssk;
+	if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+		return NULL;

-	return NULL;
+	burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+	wmem = READ_ONCE(ssk->sk_wmem_queued);
+	subflow = mptcp_subflow_ctx(ssk);
+	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+					   READ_ONCE(ssk->sk_pacing_rate) * burst,
+					   burst + wmem);
+	msk->last_snd = ssk;
+	msk->snd_burst = burst;
+	return ssk;
 }

 static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
@@ -1499,11 +1517,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
 		msk->snd_nxt = snd_nxt_new;
 }

-static void mptcp_check_and_set_pending(struct sock *sk)
+void mptcp_check_and_set_pending(struct sock *sk)
 {
-	if (mptcp_send_head(sk) &&
-	    !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
-		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+	if (mptcp_send_head(sk))
+		mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
 }

 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
@@ -1524,7 +1541,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 			int ret = 0;

 			prev_ssk = ssk;
-			__mptcp_flush_join_list(msk);
 			ssk = mptcp_subflow_get_send(msk);

 			/* First check. If the ssk has changed since
@@ -1784,8 +1800,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 		copied += count;

 		if (count < data_len) {
-			if (!(flags & MSG_PEEK))
+			if (!(flags & MSG_PEEK)) {
 				MPTCP_SKB_CB(skb)->offset += count;
+				MPTCP_SKB_CB(skb)->map_seq += count;
+			}
 			break;
 		}

@@ -1927,7 +1945,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool ret, done;

-	mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 		bool slowpath;
@@ -1965,6 +1982,27 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	return !skb_queue_empty(&msk->receive_queue);
 }

+static unsigned int mptcp_inq_hint(const struct sock *sk)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	const struct sk_buff *skb;
+
+	skb = skb_peek(&msk->receive_queue);
+	if (skb) {
+		u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+		if (hint_val >= INT_MAX)
+			return INT_MAX;
+
+		return (unsigned int)hint_val;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+		return 1;
+
+	return 0;
+}
+
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			 int nonblock, int flags, int *addr_len)
 {
@@ -1989,6 +2027,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	len = min_t(size_t, len, INT_MAX);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

+	if (unlikely(msk->recvmsg_inq))
+		cmsg_flags = MPTCP_CMSG_INQ;
+
 	while (copied < len) {
 		int bytes_read;

@@ -2062,6 +2103,12 @@ out_err:
 	if (cmsg_flags && copied >= 0) {
 		if (cmsg_flags & MPTCP_CMSG_TS)
 			tcp_recv_timestamp(msg, sk, &tss);
+
+		if (cmsg_flags & MPTCP_CMSG_INQ) {
+			unsigned int inq = mptcp_inq_hint(sk);
+
+			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+		}
 	}

 	pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
@@ -2088,7 +2135,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
 			mptcp_schedule_work(sk);
 	} else {
 		/* delegate our work to tcp_release_cb() */
-		set_bit(MPTCP_RETRANSMIT, &msk->flags);
+		__set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
 	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
@@ -2196,6 +2243,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
 	return true;
 }

+/* flags for __mptcp_close_ssk() */
+#define MPTCP_CF_PUSH		BIT(1)
+#define MPTCP_CF_FASTCLOSE	BIT(2)
+
 /* subflow sockets can be either outgoing (connect) or incoming
  * (accept).
  *
@@ -2205,22 +2256,37 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
  * parent socket.
  */
 static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
-			      struct mptcp_subflow_context *subflow)
+			      struct mptcp_subflow_context *subflow,
+			      unsigned int flags)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	bool need_push;
+	bool need_push, dispose_it;

-	list_del(&subflow->node);
+	dispose_it = !msk->subflow || ssk != msk->subflow->sk;
+	if (dispose_it)
+		list_del(&subflow->node);

 	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);

+	if (flags & MPTCP_CF_FASTCLOSE)
+		subflow->send_fastclose = 1;
+
+	need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
+	if (!dispose_it) {
+		tcp_disconnect(ssk, 0);
+		msk->subflow->state = SS_UNCONNECTED;
+		mptcp_subflow_ctx_reset(subflow);
+		release_sock(ssk);
+
+		goto out;
+	}
+
 	/* if we are invoked by the msk cleanup code, the subflow is
 	 * already orphaned
 	 */
 	if (ssk->sk_socket)
 		sock_orphan(ssk);

-	need_push = __mptcp_retransmit_pending_data(sk);
 	subflow->disposable = 1;

 	/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2240,14 +2306,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,

 	sock_put(ssk);

-	if (ssk == msk->last_snd)
-		msk->last_snd = NULL;
-
 	if (ssk == msk->first)
 		msk->first = NULL;

-	if (msk->subflow && ssk == msk->subflow->sk)
-		mptcp_dispose_initial_subflow(msk);
+out:
+	if (ssk == msk->last_snd)
+		msk->last_snd = NULL;

 	if (need_push)
 		__mptcp_push_pending(sk, 0);
@@ -2258,7 +2322,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 {
 	if (sk->sk_state == TCP_ESTABLISHED)
 		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
-	__mptcp_close_ssk(sk, ssk, subflow);
+
+	/* subflow aborted before reaching the fully_established status
+	 * attempt the creation of the next subflow
+	 */
+	mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+
+	__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
 }

 static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
@@ -2410,12 +2480,10 @@ static void mptcp_worker(struct work_struct *work)
 		goto unlock;

 	mptcp_check_data_fin_ack(sk);
-	mptcp_flush_join_list(msk);

 	mptcp_check_fastclose(msk);

-	if (msk->pm.status)
-		mptcp_pm_nl_work(msk);
+	mptcp_pm_nl_work(msk);

 	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 		mptcp_check_for_eof(msk);
@@ -2449,8 +2517,6 @@ static int __mptcp_init_sock(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);

-	spin_lock_init(&msk->join_list_lock);
-
 	INIT_LIST_HEAD(&msk->conn_list);
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
@@ -2476,9 +2542,20 @@ static int __mptcp_init_sock(struct sock *sk)
 	return 0;
 }

-static int mptcp_init_sock(struct sock *sk)
+static void mptcp_ca_reset(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_assign_congestion_control(sk);
+	strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+
+	/* no need to keep a reference to the ops, the name will suffice */
+	tcp_cleanup_congestion_control(sk);
+	icsk->icsk_ca_ops = NULL;
+}
+
+static int mptcp_init_sock(struct sock *sk)
+{
 	struct net *net = sock_net(sk);
 	int ret;

@@ -2499,12 +2576,7 @@ static int mptcp_init_sock(struct sock *sk)
 	/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
 	 * propagate the correct value
 	 */
-	tcp_assign_congestion_control(sk);
-	strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
-
-	/* no need to keep a reference to the ops, the name will suffice */
-	tcp_cleanup_congestion_control(sk);
-	icsk->icsk_ca_ops = NULL;
+	mptcp_ca_reset(sk);

 	sk_sockets_allocated_inc(sk);
 	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
@@ -2609,6 +2681,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
 	 * state now
 	 */
 	if (__mptcp_check_fallback(msk)) {
+		WRITE_ONCE(msk->snd_una, msk->write_seq);
 		if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
 			inet_sk_state_store(sk, TCP_CLOSE);
 			mptcp_close_wake_up(sk);
@@ -2617,7 +2690,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
 		}
 	}

-	mptcp_flush_join_list(msk);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

@@ -2650,21 +2722,20 @@ static void __mptcp_destroy_sock(struct sock *sk)

 	might_sleep();

-	/* be sure to always acquire the join list lock, to sync vs
-	 * mptcp_finish_join().
-	 */
-	spin_lock_bh(&msk->join_list_lock);
-	list_splice_tail_init(&msk->join_list, &msk->conn_list);
-	spin_unlock_bh(&msk->join_list_lock);
+	/* join list will be eventually flushed (with rst) at sock lock release time*/
 	list_splice_init(&msk->conn_list, &conn_list);

 	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 	msk->pm.status = 0;

+	/* clears msk->subflow, allowing the following loop to close
+	 * even the initial subflow
+	 */
+	mptcp_dispose_initial_subflow(msk);
 	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-		__mptcp_close_ssk(sk, ssk, subflow);
+		__mptcp_close_ssk(sk, ssk, subflow, 0);
 	}

 	sk->sk_prot->destroy(sk);
@@ -2675,7 +2746,6 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	xfrm_sk_free_policy(sk);
 	sk_refcnt_debug_release(sk);

-	mptcp_dispose_initial_subflow(msk);
 	sock_put(sk);
 }

@@ -2711,6 +2781,9 @@ cleanup:
 	sock_hold(sk);
 	pr_debug("msk=%p state=%d", sk, sk->sk_state);

+	if (mptcp_sk(sk)->token)
+		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
 	if (sk->sk_state == TCP_CLOSE) {
 		__mptcp_destroy_sock(sk);
 		do_cancel_work = true;
@@ -2721,9 +2794,6 @@ cleanup:
 	if (do_cancel_work)
 		mptcp_cancel_work(sk);

-	if (mptcp_sk(sk)->token)
-		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
-
 	sock_put(sk);
 }

@@ -2755,15 +2825,38 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);

-	mptcp_do_flush_join_list(msk);
+	inet_sk_state_store(sk, TCP_CLOSE);
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

-		lock_sock(ssk);
-		tcp_disconnect(ssk, flags);
-		release_sock(ssk);
+		__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
 	}
+
+	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+
+	if (mptcp_sk(sk)->token)
+		mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
+	mptcp_destroy_common(msk);
+	msk->last_snd = NULL;
+	WRITE_ONCE(msk->flags, 0);
+	msk->cb_flags = 0;
+	msk->push_pending = 0;
+	msk->recovery = false;
+	msk->can_ack = false;
+	msk->fully_established = false;
+	msk->rcv_data_fin = false;
+	msk->snd_data_fin_enable = false;
+	msk->rcv_fastclose = false;
+	msk->use_64bit_ack = false;
+	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+	mptcp_pm_data_reset(msk);
+	mptcp_ca_reset(sk);
+
+	sk->sk_shutdown = 0;
+	sk_error_report(sk);
 	return 0;
 }

@@ -2903,9 +2996,11 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
 	__mptcp_clear_xmit(sk);

 	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+	mptcp_data_lock(sk);
 	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
 	__skb_queue_purge(&sk->sk_receive_queue);
 	skb_rbtree_purge(&msk->out_of_order_queue);
+	mptcp_data_unlock(sk);

 	/* move all the rx fwd alloc into the sk_mem_reclaim_final in
 	 * inet_sock_destruct() will dispose it
@@ -2929,7 +3024,7 @@ void __mptcp_data_acked(struct sock *sk)
 	if (!sock_owned_by_user(sk))
 		__mptcp_clean_una(sk);
 	else
-		set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
+		__set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);

 	if (mptcp_pending_data_fin_ack(sk))
 		mptcp_schedule_work(sk);
@@ -2948,20 +3043,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 		else if (xmit_ssk)
 			mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
 	} else {
-		set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+		__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
 	}
 }

+#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
+				      BIT(MPTCP_RETRANSMIT) | \
+				      BIT(MPTCP_FLUSH_JOIN_LIST))
+
 /* processes deferred events and flush wmem */
 static void mptcp_release_cb(struct sock *sk)
+	__must_hold(&sk->sk_lock.slock)
 {
-	for (;;) {
-		unsigned long flags = 0;
+	struct mptcp_sock *msk = mptcp_sk(sk);

-		if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
-			flags |= BIT(MPTCP_PUSH_PENDING);
-		if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags))
-			flags |= BIT(MPTCP_RETRANSMIT);
+	for (;;) {
+		unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
+				      msk->push_pending;
 		if (!flags)
 			break;

@@ -2972,8 +3070,11 @@ static void mptcp_release_cb(struct sock *sk)
 		 *    datapath acquires the msk socket spinlock while helding
 		 *    the subflow socket lock
 		 */
-
+		msk->push_pending = 0;
+		msk->cb_flags &= ~flags;
 		spin_unlock_bh(&sk->sk_lock.slock);
+		if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
+			__mptcp_flush_join_list(sk);
 		if (flags & BIT(MPTCP_PUSH_PENDING))
 			__mptcp_push_pending(sk, 0);
 		if (flags & BIT(MPTCP_RETRANSMIT))
@@ -2986,11 +3087,11 @@ static void mptcp_release_cb(struct sock *sk)
 	/* be sure to set the current sk state before tacking actions
 	 * depending on sk_state
 	 */
-	if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags))
+	if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
 		__mptcp_set_connected(sk);
-	if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+	if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
 		__mptcp_clean_una_wakeup(sk);
-	if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
+	if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 		__mptcp_error_report(sk);

 	__mptcp_update_rmem(sk);
@@ -3032,7 +3133,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
 		if (!sock_owned_by_user(sk))
 			__mptcp_subflow_push_pending(sk, ssk);
 		else
-			set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
 		mptcp_data_unlock(sk);
 		mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
 	}
@@ -3118,8 +3219,7 @@ bool mptcp_finish_join(struct sock *ssk)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
 	struct sock *parent = (void *)msk;
-	struct socket *parent_sock;
-	bool ret;
+	bool ret = true;

 	pr_debug("msk=%p, subflow=%p", msk, subflow);

@@ -3132,35 +3232,38 @@ bool mptcp_finish_join(struct sock *ssk)
 	if (!msk->pm.server_side)
 		goto out;

-	if (!mptcp_pm_allow_new_subflow(msk)) {
-		subflow->reset_reason = MPTCP_RST_EPROHIBIT;
-		return false;
-	}
+	if (!mptcp_pm_allow_new_subflow(msk))
+		goto err_prohibited;
+
+	if (WARN_ON_ONCE(!list_empty(&subflow->node)))
+		goto err_prohibited;

-	/* active connections are already on conn_list, and we can't acquire
-	 * msk lock here.
-	 * use the join list lock as synchronization point and double-check
-	 * msk status to avoid racing with __mptcp_destroy_sock()
+	/* active connections are already on conn_list.
+	 * If we can't acquire msk socket lock here, let the release callback
+	 * handle it
 	 */
-	spin_lock_bh(&msk->join_list_lock);
-	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
-	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
-		list_add_tail(&subflow->node, &msk->join_list);
+	mptcp_data_lock(parent);
+	if (!sock_owned_by_user(parent)) {
+		ret = __mptcp_finish_join(msk, ssk);
+		if (ret) {
+			sock_hold(ssk);
+			list_add_tail(&subflow->node, &msk->conn_list);
+		}
+	} else {
 		sock_hold(ssk);
+		list_add_tail(&subflow->node, &msk->join_list);
+		__set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
 	}
-	spin_unlock_bh(&msk->join_list_lock);
+	mptcp_data_unlock(parent);
+
 	if (!ret) {
+err_prohibited:
 		subflow->reset_reason = MPTCP_RST_EPROHIBIT;
 		return false;
 	}

-	/* attach to msk socket only after we are sure he will deal with us
-	 * at close time
-	 */
-	parent_sock = READ_ONCE(parent->sk_socket);
-	if (parent_sock && !ssk->sk_socket)
-		mptcp_sock_graft(ssk, parent_sock);
 	subflow->map_seq = READ_ONCE(msk->ack_seq);
+
 out:
 	mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
 	return true;
@@ -3179,6 +3282,57 @@ static int mptcp_forward_alloc_get(const struct sock *sk)
 	return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
 }

+static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
+{
+	const struct sock *sk = (void *)msk;
+	u64 delta;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -EINVAL;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+		return 0;
+
+	delta = msk->write_seq - v;
+	if (delta > INT_MAX)
+		delta = INT_MAX;
+
+	return (int)delta;
+}
+
+static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	bool slow;
+	int answ;
+
+	switch (cmd) {
+	case SIOCINQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		__mptcp_move_skbs(msk);
+		answ = mptcp_inq_hint(sk);
+		release_sock(sk);
+		break;
+	case SIOCOUTQ:
+		slow = lock_sock_fast(sk);
+		answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
+		unlock_sock_fast(sk, slow);
+		break;
+	case SIOCOUTQNSD:
+		slow = lock_sock_fast(sk);
+		answ = mptcp_ioctl_outq(msk, msk->snd_nxt);
+		unlock_sock_fast(sk, slow);
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return put_user(answ, (int __user *)arg);
+}
+
 static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
@@ -3191,6 +3345,7 @@ static struct proto mptcp_prot = {
 	.shutdown	= mptcp_shutdown,
 	.destroy	= mptcp_destroy,
 	.sendmsg	= mptcp_sendmsg,
+	.ioctl		= mptcp_ioctl,
 	.recvmsg	= mptcp_recvmsg,
 	.release_cb	= mptcp_release_cb,
 	.hash		= mptcp_hash,
@@ -3243,9 +3398,20 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
 	struct mptcp_subflow_context *subflow;
 	struct socket *ssock;
-	int err;
+	int err = -EINVAL;

 	lock_sock(sock->sk);
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			goto unlock;
+
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = mptcp_disconnect(sock->sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto unlock;
+		}
+	}
+
 	if (sock->state != SS_UNCONNECTED && msk->subflow) {
 		/* pending connection or invalid state, let existing subflow
 		 * cope with that
@@ -3255,10 +3421,8 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	}

 	ssock = __mptcp_nmpc_socket(msk);
-	if (!ssock) {
-		err = -EINVAL;
+	if (!ssock)
 		goto unlock;
-	}

 	mptcp_token_destroy(msk);
 	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
@@ -3332,17 +3496,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,

 	pr_debug("msk=%p", msk);

-	lock_sock(sock->sk);
-	if (sock->sk->sk_state != TCP_LISTEN)
-		goto unlock_fail;
-
 	ssock = __mptcp_nmpc_socket(msk);
 	if (!ssock)
-		goto unlock_fail;
-
-	clear_bit(MPTCP_DATA_READY, &msk->flags);
-	sock_hold(ssock->sk);
-	release_sock(sock->sk);
+		return -EINVAL;

 	err = ssock->ops->accept(sock, newsock, flags, kern);
 	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
@@ -3372,7 +3528,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
 		 * This is needed so NOSPACE flag can be set from tcp stack.
 		 */
-		mptcp_flush_join_list(msk);
 		mptcp_for_each_subflow(msk, subflow) {
 			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

@@ -3382,14 +3537,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		release_sock(newsk);
 	}

-	if (inet_csk_listen_poll(ssock->sk))
-		set_bit(MPTCP_DATA_READY, &msk->flags);
-	sock_put(ssock->sk);
 	return err;
-
-unlock_fail:
-	release_sock(sock->sk);
-	return -EINVAL;
 }

 static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
@@ -3435,8 +3583,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 	state = inet_sk_state_load(sk);
 	pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
-	if (state == TCP_LISTEN)
-		return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0;
+	if (state == TCP_LISTEN) {
+		if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
+			return 0;
+
+		return inet_csk_listen_poll(msk->subflow->sk);
+	}

 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
 		mask |= mptcp_check_readable(msk);
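
The scheduler change above replaces the plain wmem/pace ratio with a "linger time" estimate and a burst-weighted moving average of the pacing rate. The arithmetic can be illustrated outside the kernel with a standalone sketch; the helper names (linger_time(), avg_pacing_rate()) and the sample byte/rate values are invented for illustration and are not part of the patch:

/* Standalone illustration of the scheduler math introduced above:
 * linger_time ~ queued bytes / pacing rate (scaled by 2^32, as in the
 * div_u64() calls), and the per-subflow pacing rate becomes a weighted
 * average of the previous estimate (weight: bytes already queued) and
 * the current TCP pacing rate (weight: the burst about to be queued).
 * All numbers below are made up.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t linger_time(uint64_t wmem, uint64_t pace)
{
	return (wmem << 32) / pace;	/* mirrors div_u64((u64)wmem << 32, pace) */
}

static uint64_t avg_pacing_rate(uint64_t old_avg, uint64_t wmem,
				uint64_t tcp_rate, uint64_t burst)
{
	return (old_avg * wmem + tcp_rate * burst) / (burst + wmem);
}

int main(void)
{
	/* subflow A: fast link with some queued data; subflow B: slow link, less queued */
	uint64_t pace_a = 12500000, wmem_a = 16384;	/* ~100 Mbps, 16 KiB queued */
	uint64_t pace_b = 1250000, wmem_b = 8192;	/* ~10 Mbps, 8 KiB queued */

	printf("linger A=%llu B=%llu -> scheduler picks %s\n",
	       (unsigned long long)linger_time(wmem_a, pace_a),
	       (unsigned long long)linger_time(wmem_b, pace_b),
	       linger_time(wmem_a, pace_a) <= linger_time(wmem_b, pace_b) ? "A" : "B");

	/* after queueing a 64 KiB burst on A, blend the old estimate (10 Mbps)
	 * with the current tcp pacing rate (12.5 Mbps): result is 12 Mbps
	 */
	printf("new avg pacing rate for A: %llu\n",
	       (unsigned long long)avg_pacing_rate(10000000, wmem_a, 12500000, 65536));
	return 0;
}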
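The diff also wires up mptcp_ioctl() (SIOCINQ/SIOCOUTQ/SIOCOUTQNSD) and attaches a TCP_CM_INQ cmsg when msk->recvmsg_inq is set. A minimal userspace sketch of how an application might consume this is below; it is not part of the patch, show_queues() and recv_with_inq() are invented helper names, the TCP_INQ setsockopt handling lives outside this file (not shown in this diff), and IPPROTO_MPTCP/TCP_INQ are given fallback definitions matching the kernel uapi values in case the installed headers lack them:

/* Illustrative only: query queue fill levels on an MPTCP socket via the
 * SIOCINQ/SIOCOUTQ ioctls and the TCP_CM_INQ cmsg added by this patch.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP	262
#endif
#ifndef TCP_INQ
#define TCP_INQ		36
#define TCP_CM_INQ	TCP_INQ
#endif

static void show_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("inq=%d bytes, outq=%d bytes\n", inq, outq);
}

static ssize_t recv_with_inq(int fd, void *buf, size_t len, int *inq)
{
	char cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;
	ssize_t ret = recvmsg(fd, &msg, 0);

	/* the kernel emits the hint at level SOL_TCP (== IPPROTO_TCP) */
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_TCP && cm->cmsg_type == TCP_CM_INQ)
			memcpy(inq, CMSG_DATA(cm), sizeof(*inq));
	return ret;
}

int main(void)
{
	char buf[4096];
	int fd, inq = -1, one = 1;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0)
		return 1;

	/* ask for the inq hint on every recvmsg(); the sockopt side is
	 * handled outside protocol.c and is not visible in this diff
	 */
	setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));

	/* ... connect() to an MPTCP peer would go here ... */

	if (recv_with_inq(fd, buf, sizeof(buf), &inq) >= 0)
		printf("read some data, %d bytes still queued\n", inq);
	show_queues(fd);
	return 0;
}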
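Finally, the mptcp_stream_connect() hunk teaches MPTCP the usual BSD-socket idiom of disconnecting by connect()ing to an AF_UNSPEC address, routed through mptcp_disconnect(). A short sketch of the userspace side (again, not from the patch; IPPROTO_MPTCP falls back to the uapi value if the headers lack it):

/* Illustrative only: reset an established MPTCP connection the same way
 * plain TCP allows, via connect() with sa_family == AF_UNSPEC.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP	262
#endif

int main(void)
{
	struct sockaddr sa;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0)
		return 1;

	/* ... connect(), exchange data ... */

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;	/* ends up in mptcp_disconnect() above */
	return connect(fd, &sa, sizeof(sa)) ? 1 : 0;
}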