Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c  688
1 files changed, 344 insertions, 344 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6bd819047470..2d6b8de35c44 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,6 +11,8 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
+#include <net/aligned_data.h>
+#include <net/rps.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -46,7 +48,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp
static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
-DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
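For context, the per-CPU delegated-action structure now embeds a local_lock_t that must be initialised with INIT_LOCAL_LOCK() at definition time, as the hunk above does. A minimal sketch of the general pattern, with hypothetical names (demo_pcpu, demo_account_one), not code from this patch:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct demo_pcpu {
	local_lock_t	bh_lock;	/* serialises same-CPU BH users */
	int		queued;
};

static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static void demo_account_one(void)
{
	/* a no-op lockdep annotation on !PREEMPT_RT, a real per-CPU lock on PREEMPT_RT */
	local_lock_nested_bh(&demo_pcpu.bh_lock);
	this_cpu_inc(demo_pcpu.queued);
	local_unlock_nested_bh(&demo_pcpu.bh_lock);
}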
@@ -65,6 +69,26 @@ static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
return &inet_stream_ops;
}
+bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib)
+{
+ struct net *net = sock_net((struct sock *)msk);
+
+ if (__mptcp_check_fallback(msk))
+ return true;
+
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_infinite_fallback) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+
+ msk->allow_subflows = false;
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+ __MPTCP_INC_STATS(net, fb_mib);
+ spin_unlock_bh(&msk->fallback_lock);
+ return true;
+}
+
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
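The new __mptcp_try_fallback() pairs with the allow_subflows checks added to the join paths later in this patch: both sides take msk->fallback_lock, so falling back and accepting a new subflow can no longer race. Condensed from the hunks in this patch (simplified, error handling trimmed):

	/* fallback side */
	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_infinite_fallback) {	/* a join got in first */
		spin_unlock_bh(&msk->fallback_lock);
		return false;
	}
	msk->allow_subflows = false;		/* no joins after fallback */
	set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
	spin_unlock_bh(&msk->fallback_lock);

	/* join side (__mptcp_finish_join() / mptcp_finish_join()) */
	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_subflows) {		/* fallback got in first */
		spin_unlock_bh(&msk->fallback_lock);
		return false;
	}
	mptcp_subflow_joined(msk, ssk);
	spin_unlock_bh(&msk->fallback_lock);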
@@ -114,19 +138,27 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
__kfree_skb(skb);
}
-static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size)
+static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from, bool *fragstolen,
+ int *delta)
{
- WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc,
- mptcp_sk(sk)->rmem_fwd_alloc + size);
-}
+ int limit = READ_ONCE(sk->sk_rcvbuf);
-static void mptcp_rmem_charge(struct sock *sk, int size)
-{
- mptcp_rmem_fwd_alloc_add(sk, -size);
+ if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
+ MPTCP_SKB_CB(from)->offset ||
+ ((to->len + from->len) > (limit >> 3)) ||
+ !skb_try_coalesce(to, from, fragstolen, delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ return true;
}
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
@@ -135,22 +167,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
bool fragstolen;
int delta;
- if (MPTCP_SKB_CB(from)->offset ||
- ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
- !skb_try_coalesce(to, from, &fragstolen, &delta))
+ if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
return false;
- pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
- MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
- to->len, MPTCP_SKB_CB(from)->end_seq);
- MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
-
/* note the fwd memory can reach a negative value after accounting
* for the delta, but the later skb free will restore a non
* negative one
*/
atomic_add(delta, &sk->sk_rmem_alloc);
- mptcp_rmem_charge(sk, delta);
+ sk_mem_charge(sk, delta);
kfree_skb_partial(from, fragstolen);
return true;
@@ -165,42 +190,42 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
-static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
-{
- amount >>= PAGE_SHIFT;
- mptcp_rmem_charge(sk, amount << PAGE_SHIFT);
- __sk_mem_reduce_allocated(sk, amount);
-}
-
-static void mptcp_rmem_uncharge(struct sock *sk, int size)
+/* "inspired" by tcp_rcvbuf_grow(), main difference:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int reclaimable;
+ const struct net *net = sock_net(sk);
+ u32 rcvwin, rcvbuf, cap, oldval;
+ u64 grow;
+
+ oldval = msk->rcvq_space.space;
+ msk->rcvq_space.space = newval;
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return false;
- mptcp_rmem_fwd_alloc_add(sk, size);
- reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+ /* DRS is always one RTT late. */
+ rcvwin = newval << 1;
- /* see sk_mem_uncharge() for the rationale behind the following schema */
- if (unlikely(reclaimable >= PAGE_SIZE))
- __mptcp_rmem_reclaim(sk, reclaimable);
-}
+ /* slow start: allow the sender to double its rate. */
+ grow = (u64)rcvwin * (newval - oldval);
+ do_div(grow, oldval);
+ rcvwin += grow << 1;
-static void mptcp_rfree(struct sk_buff *skb)
-{
- unsigned int len = skb->truesize;
- struct sock *sk = skb->sk;
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
- atomic_sub(len, &sk->sk_rmem_alloc);
- mptcp_rmem_uncharge(sk, len);
-}
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
-void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = mptcp_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
- mptcp_rmem_charge(sk, skb->truesize);
+ rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ return true;
+ }
+ return false;
}
/* "inspired" by tcp_data_queue_ofo(), main differences:
@@ -315,58 +340,46 @@ merge_right:
end:
skb_condense(skb);
- mptcp_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
}
-static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
+static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
+ int copy_len)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
- int amt, amount;
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
- if (size <= msk->rmem_fwd_alloc)
- return true;
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
+ */
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
+ MPTCP_SKB_CB(skb)->cant_coalesce = 0;
- size -= msk->rmem_fwd_alloc;
- amt = sk_mem_pages(size);
- amount = amt << PAGE_SHIFT;
- if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
- return false;
+ __skb_unlink(skb, &ssk->sk_receive_queue);
- mptcp_rmem_fwd_alloc_add(sk, amount);
- return true;
+ skb_ext_reset(skb);
+ skb_dst_drop(skb);
}
-static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb, unsigned int offset,
- size_t copy_len)
+static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct sock *sk = (struct sock *)msk;
+ u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+ struct mptcp_sock *msk = mptcp_sk(sk);
struct sk_buff *tail;
- bool has_rxtstamp;
-
- __skb_unlink(skb, &ssk->sk_receive_queue);
-
- skb_ext_reset(skb);
- skb_orphan(skb);
/* try to fetch required memory from subflow */
- if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) {
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
goto drop;
}
- has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
-
- /* the skb map_seq accounts for the skb offset:
- * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
- * value
- */
- MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
- MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
- MPTCP_SKB_CB(skb)->offset = offset;
- MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
-
if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
/* in sequence */
msk->bytes_received += copy_len;
@@ -375,7 +388,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
if (tail && mptcp_try_coalesce(sk, tail, skb))
return true;
- mptcp_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
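With the msk-private rmem_fwd_alloc pool gone, the enqueue path above leans on the stock socket accounting helpers. The sequence they implement, in simplified form:

	if (!sk_rmem_schedule(sk, skb, skb->truesize))	/* reserve forward-allocated memory */
		goto drop;

	skb_set_owner_r(skb, sk);	/* add truesize to sk_rmem_alloc, charge the socket
					 * and install sock_rfree() as the destructor
					 */
	__skb_queue_tail(&sk->sk_receive_queue, skb);

	/* on consumption the destructor (or an explicit sk_mem_uncharge(), as in
	 * __mptcp_recvmsg_mskq() below) releases the memory again
	 */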
@@ -413,6 +426,20 @@ static void mptcp_close_wake_up(struct sock *sk)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
+static void mptcp_shutdown_subflows(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ unlock_sock_fast(ssk, slow);
+ }
+}
+
/* called under the msk socket lock */
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
@@ -437,6 +464,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
break;
case TCP_CLOSING:
case TCP_LAST_ACK:
+ mptcp_shutdown_subflows(msk);
mptcp_set_state(sk, TCP_CLOSE);
break;
}
@@ -487,7 +515,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl
const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
- inet_csk(ssk)->icsk_timeout - jiffies : 0;
+ icsk_timeout(inet_csk(ssk)) - jiffies : 0;
}
static void mptcp_set_timeout(struct sock *sk)
@@ -561,7 +589,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
bool cleanup, rx_empty;
cleanup = (space > 0) && (space >= (old_space << 1)) && copied;
- rx_empty = !__mptcp_rmem(sk) && copied;
+ rx_empty = !sk_rmem_alloc_get(sk) && copied;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -571,11 +599,10 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
}
}
-static bool mptcp_check_data_fin(struct sock *sk)
+static void mptcp_check_data_fin(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
u64 rcv_data_fin_seq;
- bool ret = false;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -605,6 +632,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
mptcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
+ mptcp_shutdown_subflows(msk);
mptcp_set_state(sk, TCP_CLOSE);
break;
default:
@@ -613,48 +641,28 @@ static bool mptcp_check_data_fin(struct sock *sk)
break;
}
- ret = true;
if (!__mptcp_check_fallback(msk))
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
- return ret;
}
static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
{
- if (READ_ONCE(msk->allow_infinite_fallback)) {
- MPTCP_INC_STATS(sock_net(ssk),
- MPTCP_MIB_DSSCORRUPTIONFALLBACK);
- mptcp_do_fallback(ssk);
- } else {
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSCORRUPTIONFALLBACK)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
mptcp_subflow_reset(ssk);
}
}
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
- struct sock *ssk,
- unsigned int *bytes)
+ struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
bool more_data_avail;
struct tcp_sock *tp;
- bool done = false;
- int sk_rbuf;
-
- sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
-
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
-
- if (unlikely(ssk_rbuf > sk_rbuf)) {
- WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
- sk_rbuf = ssk_rbuf;
- }
- }
+ bool ret = false;
pr_debug("msk=%p ssk=%p\n", msk, ssk);
tp = tcp_sk(ssk);
@@ -664,20 +672,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sk_buff *skb;
bool fin;
+ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
+ break;
+
/* try to move as much data as available */
map_remaining = subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue);
- if (!skb) {
- /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
- * a different CPU can have already processed the pending
- * data, stop here or we can enter an infinite loop
- */
- if (!moved)
- done = true;
+ if (unlikely(!skb))
break;
- }
if (__mptcp_check_fallback(msk)) {
/* Under fallback skbs have no MPTCP extension and TCP could
@@ -690,19 +694,15 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
- if (fin) {
- done = true;
+ if (fin)
seq++;
- }
if (offset < skb->len) {
size_t len = skb->len - offset;
- if (tp->urg_data)
- done = true;
-
- if (__mptcp_move_skb(msk, ssk, skb, offset, len))
- moved += len;
+ mptcp_init_skb(ssk, skb, offset, len);
+ skb_orphan(skb);
+ ret = __mptcp_move_skb(sk, skb) || ret;
seq += len;
if (unlikely(map_remaining < len)) {
@@ -716,22 +716,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
sk_eat_skb(ssk, skb);
- done = true;
}
WRITE_ONCE(tp->copied_seq, seq);
more_data_avail = mptcp_subflow_data_available(ssk);
- if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
- done = true;
- break;
- }
} while (more_data_avail);
- if (moved > 0)
+ if (ret)
msk->last_data_recv = tcp_jiffies32;
- *bytes += moved;
- return done;
+ return ret;
}
static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
@@ -825,16 +819,12 @@ void __mptcp_error_report(struct sock *sk)
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
+ bool moved;
- __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ moved = __mptcp_move_skbs_from_subflow(msk, ssk);
__mptcp_ofo_queue(msk);
- if (unlikely(ssk->sk_err)) {
- if (!sock_owned_by_user(sk))
- __mptcp_error_report(sk);
- else
- __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
- }
+ if (unlikely(ssk->sk_err))
+ __mptcp_subflow_error_report(sk, ssk);
/* If the moves have caught up with the DATA_FIN sequence number
* it's time to ack the DATA_FIN and change socket state, but
@@ -843,14 +833,21 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
*/
if (mptcp_pending_data_fin(sk, NULL))
mptcp_schedule_work(sk);
- return moved > 0;
+ return moved;
+}
+
+static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* Wake-up the reader only for in-sequence data */
+ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
+ sk->sk_data_ready(sk);
}
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct mptcp_sock *msk = mptcp_sk(sk);
- int sk_rbuf, ssk_rbuf;
/* The peer can send data while we are shutting down this
* subflow at msk destruction time, but we must avoid enqueuing
@@ -859,26 +856,18 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
if (unlikely(subflow->disposable))
return;
- ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
- sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
- if (unlikely(ssk_rbuf > sk_rbuf))
- sk_rbuf = ssk_rbuf;
-
- /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
- if (__mptcp_rmem(sk) > sk_rbuf)
- return;
-
- /* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
- if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
- sk->sk_data_ready(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_data_ready(sk, ssk);
+ else
+ __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
{
mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ msk->allow_infinite_fallback = false;
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}
@@ -889,6 +878,14 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
if (sk->sk_state != TCP_ESTABLISHED)
return false;
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
+ mptcp_subflow_joined(msk, ssk);
+ spin_unlock_bh(&msk->fallback_lock);
+
/* attach to msk socket only after we are sure we will deal with it
* at close time
*/
@@ -897,7 +894,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
mptcp_sockopt_sync_locked(msk, ssk);
- mptcp_subflow_joined(msk, ssk);
mptcp_stop_tout_timer(sk);
__mptcp_propagate_sndbuf(sk, ssk);
return true;
@@ -950,20 +946,6 @@ bool mptcp_schedule_work(struct sock *sk)
return false;
}
-static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
-
- msk_owned_by_me(msk);
-
- mptcp_for_each_subflow(msk, subflow) {
- if (READ_ONCE(subflow->data_avail))
- return mptcp_subflow_tcp_sock(subflow);
- }
-
- return NULL;
-}
-
static bool mptcp_skb_can_collapse_to(u64 write_seq,
const struct sk_buff *skb,
const struct mptcp_ext *mpext)
@@ -1025,7 +1007,7 @@ static void __mptcp_clean_una(struct sock *sk)
if (WARN_ON_ONCE(!msk->recovery))
break;
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
}
dfrag_clear(sk, dfrag);
@@ -1236,10 +1218,13 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
mpext->infinite_map = 1;
mpext->data_len = 0;
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ if (!mptcp_try_fallback(ssk, MPTCP_MIB_INFINITEMAPTX)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED);
+ mptcp_subflow_reset(ssk);
+ return;
+ }
+
mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
- pr_fallback(msk);
- mptcp_do_fallback(ssk);
}
#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
@@ -1314,7 +1299,12 @@ alloc_skb:
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
- if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
+ /* No need for zero probe if there are any data pending
+ * either at the msk or ssk level; skb is the current write
+ * queue tail and can be empty at this point.
+ */
+ if (snd_una != msk->snd_nxt || skb->len ||
+ skb != tcp_send_head(ssk)) {
tcp_remove_empty_skb(ssk);
return 0;
}
@@ -1365,6 +1355,7 @@ alloc_skb:
mpext->dsn64);
if (zero_window_probe) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
mpext->frozen = 1;
if (READ_ONCE(msk->csum_enabled))
@@ -1476,7 +1467,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
* - estimate the faster flow linger time
* - use the above to estimate the amount of byte transferred
* by the faster flow
- * - check that the amount of queued data is greter than the above,
+ * - check that the amount of queued data is greater than the above,
* otherwise do not use the picked, slower, subflow
* We select the subflow with the shorter estimated time to flush
* the queued mem, which basically ensure the above. We just need
@@ -1567,7 +1558,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
mptcp_update_post_push(msk, dfrag, ret);
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
if (msk->snd_burst <= 0 ||
!sk_stream_memory_free(ssk) ||
@@ -1813,6 +1804,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
return limit - not_sent;
}
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!rfs_is_needed())
+ return;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ sock_rps_record_flow(ssk);
+ }
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
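sock_rps_record_flow() is what lets (accelerated) RFS steer a flow toward the CPU running the application; the msk itself is never hashed into those tables, so the helper above records every TCP subflow instead. For comparison, plain TCP records its single flow directly in the inet fast paths, with the static-key check inside the helper; rfs_is_needed() merely hoists that check so the subflow walk is skipped entirely when RFS is not configured:

	/* plain TCP, e.g. from inet_sendmsg()/inet_recvmsg() */
	sock_rps_record_flow(sk);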
@@ -1826,6 +1831,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ mptcp_rps_record_subflows(msk);
+
if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
@@ -1911,7 +1918,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
if (!msk->first_pending)
- WRITE_ONCE(msk->first_pending, dfrag);
+ msk->first_pending = dfrag;
}
pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
@@ -1944,21 +1951,36 @@ do_error:
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
-static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
- struct msghdr *msg,
- size_t len, int flags,
+static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
+ struct mptcp_sock *msk = mptcp_sk(sk);
struct sk_buff *skb, *tmp;
+ int total_data_len = 0;
int copied = 0;
- skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
- u32 offset = MPTCP_SKB_CB(skb)->offset;
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
+ u32 delta, offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
- u32 count = min_t(size_t, len - copied, data_len);
+ u32 count;
int err;
+ if (flags & MSG_PEEK) {
+ /* skip already peeked skbs */
+ if (total_data_len + data_len <= copied_total) {
+ total_data_len += data_len;
+ continue;
+ }
+
+ /* skip the already peeked data in the current skb */
+ delta = copied_total - total_data_len;
+ offset += delta;
+ data_len -= delta;
+ }
+
+ count = min_t(size_t, len - copied, data_len);
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, count);
if (unlikely(err < 0)) {
@@ -1975,22 +1997,21 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
copied += count;
- if (count < data_len) {
- if (!(flags & MSG_PEEK)) {
+ if (!(flags & MSG_PEEK)) {
+ msk->bytes_consumed += count;
+ if (count < data_len) {
MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count;
- msk->bytes_consumed += count;
+ break;
}
- break;
- }
- if (!(flags & MSG_PEEK)) {
- /* we will bulk release the skb memory later */
+ /* avoid the indirect call, we know the destructor is sock_rfree */
skb->destructor = NULL;
- WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
- __skb_unlink(skb, &msk->receive_queue);
- __kfree_skb(skb);
- msk->bytes_consumed += count;
+ skb->sk = NULL;
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, skb->truesize);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ skb_attempt_defer_free(skb);
}
if (copied >= len)
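The MSG_PEEK branch above re-derives, on every call, how much data earlier peeks already returned (copied_total) and skips it without dequeuing anything. A small worked example with invented sizes:

	/* queue: skb0 = 1000 bytes, skb1 = 1500 bytes; copied_total = 1200
	 *
	 * skb0: 0 + 1000 <= 1200          -> skip whole skb, total_data_len = 1000
	 * skb1: delta = 1200 - 1000 = 200 -> offset += 200, data_len = 1300
	 *
	 * copying resumes 200 bytes into skb1, so at most 1300 more bytes can be
	 * peeked on this call.
	 */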
@@ -2054,113 +2075,84 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- u64 rcvwin, grow;
- int rcvbuf;
-
- rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
- grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
- do_div(grow, msk->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
- if (rcvbuf > sk->sk_rcvbuf) {
- u32 window_clamp;
-
- window_clamp = mptcp_win_from_space(sk, rcvbuf);
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
- /* Make subflows follow along. If we do not do this, we
- * get drops at subflow level if skbs can't be moved to
- * the mptcp rx queue fast enough (announced rcv_win can
- * exceed ssk->sk_rcvbuf).
- */
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk;
- bool slow;
+ if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+ bool slow;
- ssk = mptcp_subflow_tcp_sock(subflow);
- slow = lock_sock_fast(ssk);
- WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
- WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
- if (tcp_can_send_ack(ssk))
- tcp_cleanup_rbuf(ssk, 1);
- unlock_sock_fast(ssk, slow);
- }
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ /* subflows can be added before tcp_init_transfer() */
+ if (tcp_sk(ssk)->rcvq_space.space)
+ tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
+ unlock_sock_fast(ssk, slow);
}
}
- msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
msk->rcvq_space.copied = 0;
msk->rcvq_space.time = mstamp;
}
-static void __mptcp_update_rmem(struct sock *sk)
+static struct mptcp_subflow_context *
+__mptcp_first_ready_from(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *start_subflow = subflow;
- if (!msk->rmem_released)
- return;
-
- atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
- mptcp_rmem_uncharge(sk, msk->rmem_released);
- WRITE_ONCE(msk->rmem_released, 0);
+ while (!READ_ONCE(subflow->data_avail)) {
+ subflow = mptcp_next_subflow(msk, subflow);
+ if (subflow == start_subflow)
+ return NULL;
+ }
+ return subflow;
}
-static void __mptcp_splice_receive_queue(struct sock *sk)
+static bool __mptcp_move_skbs(struct sock *sk)
{
+ struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool ret = false;
- skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
-}
-
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
-{
- struct sock *sk = (struct sock *)msk;
- unsigned int moved = 0;
- bool ret, done;
+ if (list_empty(&msk->conn_list))
+ return false;
- do {
- struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+ subflow = list_first_entry(&msk->conn_list,
+ struct mptcp_subflow_context, node);
+ for (;;) {
+ struct sock *ssk;
bool slowpath;
- /* we can have data pending in the subflows only if the msk
- * receive buffer was full at subflow_data_ready() time,
- * that is an unlikely slow path.
+ /*
+ * As an optimization avoid traversing the subflows list
+ * and ev. acquiring the subflow socket lock before baling out
*/
- if (likely(!ssk))
+ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf)
break;
- slowpath = lock_sock_fast(ssk);
- mptcp_data_lock(sk);
- __mptcp_update_rmem(sk);
- done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
- mptcp_data_unlock(sk);
+ subflow = __mptcp_first_ready_from(msk, subflow);
+ if (!subflow)
+ break;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slowpath = lock_sock_fast(ssk);
+ ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret;
if (unlikely(ssk->sk_err))
__mptcp_error_report(sk);
unlock_sock_fast(ssk, slowpath);
- } while (!done);
- /* acquire the data lock only if some input data is pending */
- ret = moved > 0;
- if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
- !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
- mptcp_data_lock(sk);
- __mptcp_update_rmem(sk);
- ret |= __mptcp_ofo_queue(msk);
- __mptcp_splice_receive_queue(sk);
- mptcp_data_unlock(sk);
+ subflow = mptcp_next_subflow(msk, subflow);
}
+
+ __mptcp_ofo_queue(msk);
if (ret)
mptcp_check_data_fin((struct sock *)msk);
- return !skb_queue_empty(&msk->receive_queue);
+ return ret;
}
static unsigned int mptcp_inq_hint(const struct sock *sk)
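__mptcp_first_ready_from() resumes scanning at the subflow after the one last drained and wraps around at most once, so the receive path does not always favour the first entry in conn_list. The generic shape of such a circular scan using the stock list helper (struct item and first_ready_from() are illustrative names):

#include <linux/list.h>

struct item {
	struct list_head node;
	bool ready;
};

static struct item *first_ready_from(struct list_head *head, struct item *pos)
{
	struct item *start = pos;

	do {
		if (READ_ONCE(pos->ready))
			return pos;
		/* wraps back to the first entry after the last one */
		pos = list_next_entry_circular(pos, head, node);
	} while (pos != start);

	return NULL;
}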
@@ -2168,7 +2160,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
const struct mptcp_sock *msk = mptcp_sk(sk);
const struct sk_buff *skb;
- skb = skb_peek(&msk->receive_queue);
+ skb = skb_peek(&sk->sk_receive_queue);
if (skb) {
u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
@@ -2203,6 +2195,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
+ mptcp_rps_record_subflows(msk);
+
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
@@ -2214,7 +2208,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
while (copied < len) {
int err, bytes_read;
- bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
+ bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags,
+ copied, &tss, &cmsg_flags);
if (unlikely(bytes_read < 0)) {
if (!copied)
copied = bytes_read;
@@ -2223,7 +2218,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
+ if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk))
continue;
/* only the MPTCP socket status is relevant here. The exit
@@ -2245,14 +2240,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
break;
}
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- /* race breaker: the shutdown could be after the
- * previous receive queue check
- */
- if (__mptcp_move_skbs(msk))
- continue;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
- }
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
@@ -2293,9 +2282,8 @@ out_err:
}
}
- pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n",
- msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
- skb_queue_empty(&msk->receive_queue), copied);
+ pr_debug("msk=%p rx queue empty=%d copied=%d\n",
+ msk, skb_queue_empty(&sk->sk_receive_queue), copied);
release_sock(sk);
return copied;
@@ -2303,8 +2291,8 @@ out_err:
static void mptcp_retransmit_timer(struct timer_list *t)
{
- struct inet_connection_sock *icsk = from_timer(icsk, t,
- icsk_retransmit_timer);
+ struct inet_connection_sock *icsk = timer_container_of(icsk, t,
+ icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
struct mptcp_sock *msk = mptcp_sk(sk);
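timer_container_of() is the successor of from_timer(), with identical semantics: recover the structure embedding the struct timer_list handed to the callback. A minimal sketch with a hypothetical struct demo:

#include <linux/timer.h>

struct demo {
	struct timer_list timer;
	unsigned long ticks;
};

static void demo_timer_fn(struct timer_list *t)
{
	/* same container_of() lookup from_timer() used to expand to */
	struct demo *d = timer_container_of(d, t, timer);

	d->ticks++;
}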
@@ -2323,7 +2311,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
static void mptcp_tout_timer(struct timer_list *t)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
+ struct sock *sk = timer_container_of(sk, t, sk_timer);
mptcp_schedule_work(sk);
sock_put(sk);
@@ -2643,9 +2631,9 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
static void __mptcp_retrans(struct sock *sk)
{
+ struct mptcp_sendmsg_info info = { .data_lock_held = true, };
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_subflow_context *subflow;
- struct mptcp_sendmsg_info info = {};
struct mptcp_data_frag *dfrag;
struct sock *ssk;
int ret, err;
@@ -2660,7 +2648,8 @@ static void __mptcp_retrans(struct sock *sk)
if (mptcp_data_fin_enabled(msk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits,
+ icsk->icsk_retransmits + 1);
mptcp_set_datafin_timeout(sk);
mptcp_send_ack(msk);
@@ -2690,6 +2679,18 @@ static void __mptcp_retrans(struct sock *sk)
info.sent = 0;
info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
dfrag->already_sent;
+
+ /*
+ * make the whole retrans decision, xmit, disallow
+ * fallback atomic
+ */
+ spin_lock_bh(&msk->fallback_lock);
+ if (__mptcp_check_fallback(msk)) {
+ spin_unlock_bh(&msk->fallback_lock);
+ release_sock(ssk);
+ return;
+ }
+
while (info.sent < info.limit) {
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0)
@@ -2703,8 +2704,9 @@ static void __mptcp_retrans(struct sock *sk)
len = max(copied, len);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
- WRITE_ONCE(msk->allow_infinite_fallback, false);
+ msk->allow_infinite_fallback = false;
}
+ spin_unlock_bh(&msk->fallback_lock);
release_sock(ssk);
}
@@ -2783,7 +2785,7 @@ static void mptcp_worker(struct work_struct *work)
mptcp_check_fastclose(msk);
- mptcp_pm_nl_work(msk);
+ mptcp_pm_worker(msk);
mptcp_check_send_data_fin(sk);
mptcp_check_data_fin_ack(sk);
@@ -2822,18 +2824,16 @@ static void __mptcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
INIT_WORK(&msk->work, mptcp_worker);
- __skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
- WRITE_ONCE(msk->rmem_fwd_alloc, 0);
- WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
- WRITE_ONCE(msk->allow_infinite_fallback, true);
+ msk->allow_infinite_fallback = true;
+ msk->allow_subflows = true;
msk->recovery = false;
msk->subflow_id = 1;
msk->last_data_sent = tcp_jiffies32;
@@ -2841,6 +2841,7 @@ static void __mptcp_init_sock(struct sock *sk)
msk->last_ack_recv = tcp_jiffies32;
mptcp_pm_data_init(msk);
+ spin_lock_init(&msk->fallback_lock);
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
@@ -2899,7 +2900,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- WRITE_ONCE(msk->first_pending, NULL);
+ msk->first_pending = NULL;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
}
@@ -3052,8 +3053,6 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
- WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc));
- WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
@@ -3222,7 +3221,16 @@ static int mptcp_disconnect(struct sock *sk, int flags)
* subflow
*/
mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+
+ /* The first subflow is already in TCP_CLOSE status, the following
+ * can't overlap with a fallback anymore
+ */
+ spin_lock_bh(&msk->fallback_lock);
+ msk->allow_subflows = true;
+ msk->allow_infinite_fallback = true;
WRITE_ONCE(msk->flags, 0);
+ spin_unlock_bh(&msk->fallback_lock);
+
msk->cb_flags = 0;
msk->recovery = false;
WRITE_ONCE(msk->can_ack, false);
@@ -3249,9 +3257,9 @@ static int mptcp_disconnect(struct sock *sk, int flags)
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
- unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+ struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk);
- return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+ return &msk6->np;
}
static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk)
@@ -3285,12 +3293,9 @@ static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- newopt = sock_kmalloc(newsk, sizeof(*inet_opt) +
+ newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) +
inet_opt->opt.optlen, GFP_ATOMIC);
- if (newopt)
- memcpy(newopt, inet_opt, sizeof(*inet_opt) +
- inet_opt->opt.optlen);
- else
+ if (!newopt)
net_warn_ratelimited("%s: Failed to copy ip options\n", __func__);
}
RCU_INIT_POINTER(newinet->inet_opt, newopt);
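sock_kmemdup() folds the sock_kmalloc() + memcpy() pair the old code open-coded into a single call that is still charged against the socket's option memory. A before/after sketch, assuming the usual sock_kmemdup() argument order (sk, src, size, gfp):

	/* len == sizeof(*inet_opt) + inet_opt->opt.optlen */

	/* before */
	newopt = sock_kmalloc(newsk, len, GFP_ATOMIC);
	if (newopt)
		memcpy(newopt, inet_opt, len);

	/* after */
	newopt = sock_kmemdup(newsk, inet_opt, len, GFP_ATOMIC);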
@@ -3405,21 +3410,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
mptcp_for_each_subflow_safe(msk, subflow, tmp)
__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
- /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
- mptcp_data_lock(sk);
- skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
- mptcp_data_unlock(sk);
/* move all the rx fwd alloc into the sk_mem_reclaim_final in
* inet_sock_destruct() will dispose it
*/
- sk_forward_alloc_add(sk, msk->rmem_fwd_alloc);
- WRITE_ONCE(msk->rmem_fwd_alloc, 0);
mptcp_token_destroy(msk);
- mptcp_pm_free_anno_list(msk);
- mptcp_free_local_addr_list(msk);
+ mptcp_pm_destroy(msk);
}
static void mptcp_destroy(struct sock *sk)
@@ -3442,9 +3440,6 @@ void __mptcp_data_acked(struct sock *sk)
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
- if (!mptcp_send_head(sk))
- return;
-
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk, false);
else
@@ -3453,7 +3448,8 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
BIT(MPTCP_RETRANSMIT) | \
- BIT(MPTCP_FLUSH_JOIN_LIST))
+ BIT(MPTCP_FLUSH_JOIN_LIST) | \
+ BIT(MPTCP_DEQUEUE))
/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
@@ -3487,6 +3483,11 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT))
__mptcp_retrans(sk);
+ if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
+ /* notify ack seq update */
+ mptcp_cleanup_rbuf(msk, 0);
+ sk->sk_data_ready(sk);
+ }
cond_resched();
spin_lock_bh(&sk->sk_lock.slock);
@@ -3506,8 +3507,6 @@ static void mptcp_release_cb(struct sock *sk)
if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
__mptcp_sync_sndbuf(sk);
}
-
- __mptcp_update_rmem(sk);
}
/* MP_JOIN client subflow must wait for 4th ack before sending any data:
@@ -3533,7 +3532,6 @@ static void schedule_3rdack_retransmission(struct sock *ssk)
WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
smp_store_release(&icsk->icsk_ack.pending,
icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
- icsk->icsk_ack.timeout = timeout;
sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout);
}
@@ -3615,7 +3613,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
sk_set_socket(sk, parent);
- sk->sk_uid = SOCK_INODE(parent)->i_uid;
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -3636,13 +3633,21 @@ bool mptcp_finish_join(struct sock *ssk)
/* active subflow, already present inside the conn_list */
if (!list_empty(&subflow->node)) {
+ spin_lock_bh(&msk->fallback_lock);
+ if (!msk->allow_subflows) {
+ spin_unlock_bh(&msk->fallback_lock);
+ return false;
+ }
mptcp_subflow_joined(msk, ssk);
+ spin_unlock_bh(&msk->fallback_lock);
mptcp_propagate_sndbuf(parent, ssk);
return true;
}
- if (!mptcp_pm_allow_new_subflow(msk))
+ if (!mptcp_pm_allow_new_subflow(msk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED);
goto err_prohibited;
+ }
/* If we can't acquire msk socket lock here, let the release callback
* handle it
@@ -3678,12 +3683,6 @@ static void mptcp_shutdown(struct sock *sk, int how)
__mptcp_wr_shutdown(sk);
}
-static int mptcp_forward_alloc_get(const struct sock *sk)
-{
- return READ_ONCE(sk->sk_forward_alloc) +
- READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc);
-}
-
static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
{
const struct sock *sk = (void *)msk;
@@ -3724,7 +3723,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
return -EINVAL;
lock_sock(sk);
- __mptcp_move_skbs(msk);
+ if (__mptcp_move_skbs(sk))
+ mptcp_cleanup_rbuf(msk, 0);
*karg = mptcp_inq_hint(sk);
release_sock(sk);
break;
@@ -3763,16 +3763,15 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* TCP option space.
*/
if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
- mptcp_subflow_early_fallback(msk, subflow);
+ mptcp_early_fallback(msk, subflow, MPTCP_MIB_MD5SIGFALLBACK);
#endif
if (subflow->request_mptcp) {
- if (mptcp_active_should_disable(sk)) {
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED);
- mptcp_subflow_early_fallback(msk, subflow);
- } else if (mptcp_token_new_connect(ssk) < 0) {
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
- }
+ if (mptcp_active_should_disable(sk))
+ mptcp_early_fallback(msk, subflow,
+ MPTCP_MIB_MPCAPABLEACTIVEDISABLED);
+ else if (mptcp_token_new_connect(ssk) < 0)
+ mptcp_early_fallback(msk, subflow,
+ MPTCP_MIB_TOKENFALLBACKINIT);
}
WRITE_ONCE(msk->write_seq, subflow->idsn);
@@ -3841,11 +3840,10 @@ static struct proto mptcp_prot = {
.hash = mptcp_hash,
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
- .forward_alloc_get = mptcp_forward_alloc_get,
.stream_memory_free = mptcp_stream_memory_free,
.sockets_allocated = &mptcp_sockets_allocated,
- .memory_allocated = &tcp_memory_allocated,
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
@@ -3981,6 +3979,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_sock_graft(ssk, newsock);
}
+ mptcp_rps_record_subflows(msk);
+
/* Do late cleanup for the first subflow as necessary. Also
* deal with bad peers not doing a complete shutdown.
*/