summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r-- net/mptcp/options.c 109
-rw-r--r-- net/mptcp/pm.c 34
-rw-r--r-- net/mptcp/pm_netlink.c 215
-rw-r--r-- net/mptcp/protocol.c 472
-rw-r--r-- net/mptcp/protocol.h 69
-rw-r--r-- net/mptcp/sockopt.c 262
-rw-r--r-- net/mptcp/subflow.c 34
-rw-r--r-- net/mptcp/token.c 1
8 files changed, 846 insertions, 350 deletions
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 6661b1d6520f..645dd984fef0 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -768,6 +768,28 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu
return true;
}
+static bool mptcp_established_options_fastclose(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ if (likely(!subflow->send_fastclose))
+ return false;
+
+ if (remaining < TCPOLEN_MPTCP_FASTCLOSE)
+ return false;
+
+ *size = TCPOLEN_MPTCP_FASTCLOSE;
+ opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
+ opts->rcvr_key = msk->remote_key;
+
+ pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ return true;
+}
+
static bool mptcp_established_options_mp_fail(struct sock *sk,
unsigned int *size,
unsigned int remaining,
@@ -806,10 +828,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
return false;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
- if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
+ if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
+ mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
}
+ /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
@@ -1212,7 +1236,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp)
WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
}
-static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum)
+u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
{
struct csum_pseudo_header header;
__wsum csum;
@@ -1227,14 +1251,14 @@ static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum1
header.data_len = htons(data_len);
header.csum = 0;
- csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum));
+ csum = csum_partial(&header, sizeof(header), sum);
return (__force u16)csum_fold(csum);
}
static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
{
return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
- mpext->csum);
+ ~csum_unfold(mpext->csum));
}
void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
@@ -1254,17 +1278,8 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 2;
}
- /* RST is mutually exclusive with everything else */
- if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
- *ptr++ = mptcp_option(MPTCPOPT_RST,
- TCPOLEN_MPTCP_RST,
- opts->reset_transient,
- opts->reset_reason);
- return;
- }
-
- /* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see
- * mptcp_established_options*()
+ /* DSS, MPC, MPJ, ADD_ADDR and FASTCLOSE/RST are mutually exclusive
+ * (RST may follow FASTCLOSE), see mptcp_established_options*()
*/
if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
struct mptcp_ext *mpext = &opts->ext_copy;
@@ -1365,7 +1380,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
__mptcp_make_csum(opts->data_seq,
opts->subflow_seq,
opts->data_len,
- opts->csum), ptr);
+ ~csum_unfold(opts->csum)), ptr);
} else {
put_unaligned_be32(opts->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
@@ -1374,27 +1389,29 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
/* MPC is additionally mutually exclusive with MP_PRIO */
goto mp_capable_done;
- } else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
- *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
- TCPOLEN_MPTCP_MPJ_SYN,
- opts->backup, opts->join_id);
- put_unaligned_be32(opts->token, ptr);
- ptr += 1;
- put_unaligned_be32(opts->nonce, ptr);
- ptr += 1;
- } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
- *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
- TCPOLEN_MPTCP_MPJ_SYNACK,
- opts->backup, opts->join_id);
- put_unaligned_be64(opts->thmac, ptr);
- ptr += 2;
- put_unaligned_be32(opts->nonce, ptr);
- ptr += 1;
- } else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
- *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
- TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
- memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
- ptr += 5;
+ } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
+ if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYN,
+ opts->backup, opts->join_id);
+ put_unaligned_be32(opts->token, ptr);
+ ptr += 1;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYNACK,
+ opts->backup, opts->join_id);
+ put_unaligned_be64(opts->thmac, ptr);
+ ptr += 2;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ } else {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+ memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+ ptr += 5;
+ }
} else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
u8 echo = MPTCP_ADDR_ECHO;
@@ -1451,6 +1468,24 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
ptr += 1;
}
}
+ } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) {
+ /* FASTCLOSE is mutually exclusive with others except RST */
+ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE,
+ TCPOLEN_MPTCP_FASTCLOSE,
+ 0, 0);
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ goto mp_rst;
+ return;
+ } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
+mp_rst:
+ *ptr++ = mptcp_option(MPTCPOPT_RST,
+ TCPOLEN_MPTCP_RST,
+ opts->reset_transient,
+ opts->reset_reason);
+ return;
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 6ab386ff3294..696b2c4613a7 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -172,9 +172,28 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk)
spin_unlock_bh(&pm->lock);
}
-void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+ const struct mptcp_subflow_context *subflow)
{
- pr_debug("msk=%p", msk);
+ struct mptcp_pm_data *pm = &msk->pm;
+ bool update_subflows;
+
+ update_subflows = (ssk->sk_state == TCP_CLOSE) &&
+ (subflow->request_join || subflow->mp_join);
+ if (!READ_ONCE(pm->work_pending) && !update_subflows)
+ return;
+
+ spin_lock_bh(&pm->lock);
+ if (update_subflows)
+ pm->subflows--;
+
+ /* Even if this subflow is not really established, tell the PM to try
+ * to pick the next ones, if possible.
+ */
+ if (mptcp_pm_nl_check_work_pending(msk))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
}
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
@@ -356,7 +375,7 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
}
}
-void mptcp_pm_data_init(struct mptcp_sock *msk)
+void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0;
@@ -370,11 +389,16 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
WRITE_ONCE(msk->pm.accept_subflow, false);
WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
msk->pm.status = 0;
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+
+ mptcp_pm_nl_data_init(msk);
+}
+void mptcp_pm_data_init(struct mptcp_sock *msk)
+{
spin_lock_init(&msk->pm.lock);
INIT_LIST_HEAD(&msk->pm.anno_list);
-
- mptcp_pm_nl_data_init(msk);
+ mptcp_pm_data_reset(msk);
}
void __init mptcp_pm_init(void)
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index f523051f5aef..75af1f701e1d 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -38,9 +38,6 @@ struct mptcp_pm_add_entry {
u8 retrans_times;
};
-#define MAX_ADDR_ID 255
-#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
-
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
@@ -52,14 +49,14 @@ struct pm_nl_pernet {
unsigned int local_addr_max;
unsigned int subflows_max;
unsigned int next_id;
- unsigned long id_bitmap[BITMAP_SZ];
+ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
};
#define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3
static bool addresses_equal(const struct mptcp_addr_info *a,
- struct mptcp_addr_info *b, bool use_port)
+ const struct mptcp_addr_info *b, bool use_port)
{
bool addr_equals = false;
@@ -168,11 +165,13 @@ select_local_address(const struct pm_nl_pernet *pernet,
msk_owned_by_me(msk);
rcu_read_lock();
- __mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
if (entry->addr.family != sk->sk_family) {
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
if ((entry->addr.family == AF_INET &&
@@ -183,23 +182,17 @@ select_local_address(const struct pm_nl_pernet *pernet,
continue;
}
- /* avoid any address already in use by subflows and
- * pending join
- */
- if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
- ret = entry;
- break;
- }
+ ret = entry;
+ break;
}
rcu_read_unlock();
return ret;
}
static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
+select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *ret = NULL;
- int i = 0;
rcu_read_lock();
/* do not keep any additional per socket state, just signal
@@ -208,12 +201,14 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
continue;
- if (i++ == pos) {
- ret = entry;
- break;
- }
+
+ ret = entry;
+ break;
}
rcu_read_unlock();
return ret;
@@ -255,12 +250,17 @@ unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
-static void check_work_pending(struct mptcp_sock *msk)
+bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
- if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) &&
- (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) ||
- msk->pm.subflows == mptcp_pm_get_subflows_max(msk)))
+ struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
+ (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false);
+ return false;
+ }
+ return true;
}
struct mptcp_pm_add_entry *
@@ -429,6 +429,7 @@ static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr
static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh,
struct mptcp_addr_info *addrs)
{
+ bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info remote = { 0 };
@@ -436,22 +437,28 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
int i = 0;
subflows_max = mptcp_pm_get_subflows_max(msk);
+ remote_address((struct sock_common *)sk, &remote);
/* Non-fullmesh endpoint, fill in the single entry
* corresponding to the primary MPC subflow remote address
*/
if (!fullmesh) {
- remote_address((struct sock_common *)sk, &remote);
+ if (deny_id0)
+ return 0;
+
msk->pm.subflows++;
addrs[i++] = remote;
} else {
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
- remote_address((struct sock_common *)ssk, &remote);
- if (!lookup_address_in_vec(addrs, i, &remote) &&
+ remote_address((struct sock_common *)ssk, &addrs[i]);
+ if (deny_id0 && addresses_equal(&addrs[i], &remote, false))
+ continue;
+
+ if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
msk->pm.subflows < subflows_max) {
msk->pm.subflows++;
- addrs[i++] = remote;
+ i++;
}
}
}
@@ -459,6 +466,35 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
return i;
}
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
+static int
+lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_addr_entry *entry;
+ int ret = -1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (addresses_equal(&entry->addr, addr, entry->addr.port)) {
+ ret = entry->addr.id;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
@@ -474,6 +510,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
local_addr_max = mptcp_pm_get_local_addr_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
+ /* do lazy endpoint usage accounting for the MPC subflows */
+ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
+ struct mptcp_addr_info mpc_addr;
+ int mpc_id;
+
+ local_address((struct sock_common *)msk->first, &mpc_addr);
+ mpc_id = lookup_id_by_addr(pernet, &mpc_addr);
+ if (mpc_id >= 0)
+ __clear_bit(mpc_id, msk->pm.id_avail_bitmap);
+
+ msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+ }
+
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, local_addr_max,
msk->pm.add_addr_signaled, add_addr_signal_max,
@@ -481,47 +530,41 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* check first for announce */
if (msk->pm.add_addr_signaled < add_addr_signal_max) {
- local = select_signal_address(pernet,
- msk->pm.add_addr_signaled);
+ local = select_signal_address(pernet, msk);
if (local) {
if (mptcp_pm_alloc_anno_list(msk, local)) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
mptcp_pm_announce_addr(msk, &local->addr, false);
mptcp_pm_nl_addr_send_ack(msk);
}
- } else {
- /* pick failed, avoid fourther attempts later */
- msk->pm.local_addr_used = add_addr_signal_max;
}
-
- check_work_pending(msk);
}
/* check if should create a new subflow */
- if (msk->pm.local_addr_used < local_addr_max &&
- msk->pm.subflows < subflows_max &&
- !READ_ONCE(msk->pm.remote_deny_join_id0)) {
+ while (msk->pm.local_addr_used < local_addr_max &&
+ msk->pm.subflows < subflows_max) {
+ struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+ bool fullmesh;
+ int i, nr;
+
local = select_local_address(pernet, msk);
- if (local) {
- bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
- struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
- int i, nr;
+ if (!local)
+ break;
- msk->pm.local_addr_used++;
- check_work_pending(msk);
- nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
- spin_unlock_bh(&msk->pm.lock);
- for (i = 0; i < nr; i++)
- __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
- spin_lock_bh(&msk->pm.lock);
- return;
- }
+ fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
- /* lookup failed, avoid fourther attempts later */
- msk->pm.local_addr_used = local_addr_max;
- check_work_pending(msk);
+ msk->pm.local_addr_used++;
+ nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
+ if (nr)
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+ spin_unlock_bh(&msk->pm.lock);
+ for (i = 0; i < nr; i++)
+ __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
+ spin_lock_bh(&msk->pm.lock);
}
+ mptcp_pm_nl_check_work_pending(msk);
}
static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
@@ -551,7 +594,6 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
subflows_max = mptcp_pm_get_subflows_max(msk);
rcu_read_lock();
- __mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
@@ -640,7 +682,6 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
!mptcp_pm_should_rm_signal(msk))
return;
- __mptcp_flush_join_list(msk);
subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
if (subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -710,6 +751,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
return;
for (i = 0; i < rm_list->nr; i++) {
+ bool removed = false;
+
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
@@ -726,18 +769,24 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
i, rm_list->ids[i], subflow->local_id, subflow->remote_id);
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
+
+ /* the following takes care of updating the subflows counter */
mptcp_close_ssk(sk, ssk, subflow);
spin_lock_bh(&msk->pm.lock);
- if (rm_type == MPTCP_MIB_RMADDR) {
- msk->pm.add_addr_accepted--;
- WRITE_ONCE(msk->pm.accept_addr, true);
- } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
- msk->pm.local_addr_used--;
- }
- msk->pm.subflows--;
+ removed = true;
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
+ __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap);
+ if (!removed)
+ continue;
+
+ if (rm_type == MPTCP_MIB_RMADDR) {
+ msk->pm.add_addr_accepted--;
+ WRITE_ONCE(msk->pm.accept_addr, true);
+ } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
+ msk->pm.local_addr_used--;
+ }
}
}
@@ -758,6 +807,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)
msk_owned_by_me(msk);
+ if (!(pm->status & MPTCP_PM_WORK_MASK))
+ return;
+
spin_lock_bh(&msk->pm.lock);
pr_debug("msk=%p status=%x", msk, pm->status);
@@ -803,7 +855,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
/* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits
*/
- if (pernet->next_id == MAX_ADDR_ID)
+ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
goto out;
@@ -823,16 +875,15 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
if (!entry->addr.id) {
find_next:
entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
- MAX_ADDR_ID + 1,
+ MPTCP_PM_MAX_ADDR_ID + 1,
pernet->next_id);
- if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) &&
- pernet->next_id != 1) {
+ if (!entry->addr.id && pernet->next_id != 1) {
pernet->next_id = 1;
goto find_next;
}
}
- if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID)
+ if (!entry->addr.id)
goto out;
__set_bit(entry->addr.id, pernet->id_bitmap);
@@ -1191,18 +1242,6 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static struct mptcp_pm_addr_entry *
-__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
-{
- struct mptcp_pm_addr_entry *entry;
-
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (entry->addr.id == id)
- return entry;
- }
- return NULL;
-}
-
int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
u8 *flags, int *ifindex)
{
@@ -1461,7 +1500,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
pernet->next_id = 1;
- bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
+ bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
synchronize_rcu();
@@ -1571,7 +1610,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
pernet = net_generic(net, pm_nl_pernet_id);
spin_lock_bh(&pernet->lock);
- for (i = id; i < MAX_ADDR_ID + 1; i++) {
+ for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
if (test_bit(i, pernet->id_bitmap)) {
entry = __lookup_addr_by_id(pernet, i);
if (!entry)
@@ -1705,22 +1744,28 @@ next:
static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
{
+ struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
- struct mptcp_pm_addr_entry addr, *entry;
struct net *net = sock_net(skb->sk);
- u8 bkup = 0;
+ u8 bkup = 0, lookup_by_id = 0;
int ret;
- ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
if (ret < 0)
return ret;
if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
bkup = 1;
+ if (addr.addr.family == AF_UNSPEC) {
+ lookup_by_id = 1;
+ if (!addr.addr.id)
+ return -EOPNOTSUPP;
+ }
list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (addresses_equal(&entry->addr, &addr.addr, true)) {
+ if ((!lookup_by_id && addresses_equal(&entry->addr, &addr.addr, true)) ||
+ (lookup_by_id && entry->addr.id == addr.addr.id)) {
mptcp_nl_addr_backup(net, &entry->addr, bkup);
if (bkup)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0cd55e4c30fa..f60f01b14fac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -22,6 +22,7 @@
#endif
#include <net/mptcp.h>
#include <net/xfrm.h>
+#include <asm/ioctls.h>
#include "protocol.h"
#include "mib.h"
@@ -46,9 +47,10 @@ struct mptcp_skb_cb {
enum {
MPTCP_CMSG_TS = BIT(0),
+ MPTCP_CMSG_INQ = BIT(1),
};
-static struct percpu_counter mptcp_sockets_allocated;
+static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
@@ -738,6 +740,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
delta);
MPTCP_SKB_CB(skb)->offset += delta;
+ MPTCP_SKB_CB(skb)->map_seq += delta;
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
msk->ack_seq = end_seq;
@@ -760,7 +763,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
if (!sock_owned_by_user(sk))
__mptcp_error_report(sk);
else
- set_bit(MPTCP_ERROR_REPORT, &msk->flags);
+ __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
}
/* If the moves have caught up with the DATA_FIN sequence number
@@ -805,47 +808,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
mptcp_data_unlock(sk);
}
-static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
+static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{
- struct mptcp_subflow_context *subflow;
- bool ret = false;
+ struct sock *sk = (struct sock *)msk;
- if (likely(list_empty(&msk->join_list)))
+ if (sk->sk_state != TCP_ESTABLISHED)
return false;
- spin_lock_bh(&msk->join_list_lock);
- list_for_each_entry(subflow, &msk->join_list, node) {
- u32 sseq = READ_ONCE(subflow->setsockopt_seq);
-
- mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
- if (READ_ONCE(msk->setsockopt_seq) != sseq)
- ret = true;
- }
- list_splice_tail_init(&msk->join_list, &msk->conn_list);
- spin_unlock_bh(&msk->join_list_lock);
-
- return ret;
-}
-
-void __mptcp_flush_join_list(struct mptcp_sock *msk)
-{
- if (likely(!mptcp_do_flush_join_list(msk)))
- return;
+ /* attach to msk socket only after we are sure we will deal with it
+ * at close time
+ */
+ if (sk->sk_socket && !ssk->sk_socket)
+ mptcp_sock_graft(ssk, sk->sk_socket);
- if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
- mptcp_schedule_work((struct sock *)msk);
+ mptcp_propagate_sndbuf((struct sock *)msk, ssk);
+ mptcp_sockopt_sync_locked(msk, ssk);
+ return true;
}
-static void mptcp_flush_join_list(struct mptcp_sock *msk)
+static void __mptcp_flush_join_list(struct sock *sk)
{
- bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
-
- might_sleep();
+ struct mptcp_subflow_context *tmp, *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
- if (!mptcp_do_flush_join_list(msk) && !sync_needed)
- return;
+ list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
- mptcp_sockopt_sync_all(msk);
+ list_move_tail(&subflow->node, &msk->conn_list);
+ if (!__mptcp_finish_join(msk, ssk))
+ mptcp_subflow_reset(ssk);
+ unlock_sock_fast(ssk, slow);
+ }
}
static bool mptcp_timer_pending(struct sock *sk)
@@ -1371,7 +1365,7 @@ out:
struct subflow_send_info {
struct sock *ssk;
- u64 ratio;
+ u64 linger_time;
};
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1396,20 +1390,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return __mptcp_subflow_active(subflow);
}
+#define SSK_MODE_ACTIVE 0
+#define SSK_MODE_BACKUP 1
+#define SSK_MODE_MAX 2
+
/* implement the mptcp packet scheduler;
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
- struct subflow_send_info send_info[2];
+ struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
+ u32 pace, burst, wmem;
int i, nr_active = 0;
struct sock *ssk;
+ u64 linger_time;
long tout = 0;
- u64 ratio;
- u32 pace;
sock_owned_by_me(sk);
@@ -1428,10 +1426,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
/* pick the subflow with the lower wmem/wspace ratio */
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
- send_info[i].ratio = -1;
+ send_info[i].linger_time = -1;
}
+
mptcp_for_each_subflow(msk, subflow) {
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
@@ -1440,34 +1439,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
- if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
- continue;
-
- pace = READ_ONCE(ssk->sk_pacing_rate);
- if (!pace)
- continue;
+ pace = subflow->avg_pacing_rate;
+ if (unlikely(!pace)) {
+ /* init pacing rate from socket */
+ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+ pace = subflow->avg_pacing_rate;
+ if (!pace)
+ continue;
+ }
- ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
- pace);
- if (ratio < send_info[subflow->backup].ratio) {
+ linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+ if (linger_time < send_info[subflow->backup].linger_time) {
send_info[subflow->backup].ssk = ssk;
- send_info[subflow->backup].ratio = ratio;
+ send_info[subflow->backup].linger_time = linger_time;
}
}
__mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
- send_info[0].ssk = send_info[1].ssk;
-
- if (send_info[0].ssk) {
- msk->last_snd = send_info[0].ssk;
- msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- tcp_sk(msk->last_snd)->snd_wnd);
- return msk->last_snd;
- }
+ send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+ /* According to the blest algorithm, to avoid HoL blocking for the
+ * faster flow, we need to:
+ * - estimate the faster flow linger time
+ * - use the above to estimate the amount of byte transferred
+ * by the faster flow
+ * - check that the amount of queued data is greater than the above,
+ * otherwise do not use the picked, slower, subflow
+ * We select the subflow with the shorter estimated time to flush
+ * the queued mem, which basically ensures the above. We just need
+ * to check that subflow has a non-empty cwin.
+ */
+ ssk = send_info[SSK_MODE_ACTIVE].ssk;
+ if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+ return NULL;
- return NULL;
+ burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+ wmem = READ_ONCE(ssk->sk_wmem_queued);
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+ READ_ONCE(ssk->sk_pacing_rate) * burst,
+ burst + wmem);
+ msk->last_snd = ssk;
+ msk->snd_burst = burst;
+ return ssk;
}
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
@@ -1501,11 +1517,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
msk->snd_nxt = snd_nxt_new;
}
-static void mptcp_check_and_set_pending(struct sock *sk)
+void mptcp_check_and_set_pending(struct sock *sk)
{
- if (mptcp_send_head(sk) &&
- !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ if (mptcp_send_head(sk))
+ mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
}
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
@@ -1526,7 +1541,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0;
prev_ssk = ssk;
- __mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
/* First check. If the ssk has changed since
@@ -1786,8 +1800,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
copied += count;
if (count < data_len) {
- if (!(flags & MSG_PEEK))
+ if (!(flags & MSG_PEEK)) {
MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ }
break;
}
@@ -1929,7 +1945,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool ret, done;
- mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
@@ -1967,6 +1982,27 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
return !skb_queue_empty(&msk->receive_queue);
}
+static unsigned int mptcp_inq_hint(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ const struct sk_buff *skb;
+
+ skb = skb_peek(&msk->receive_queue);
+ if (skb) {
+ u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ if (hint_val >= INT_MAX)
+ return INT_MAX;
+
+ return (unsigned int)hint_val;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 1;
+
+ return 0;
+}
+
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -1991,6 +2027,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ if (unlikely(msk->recvmsg_inq))
+ cmsg_flags = MPTCP_CMSG_INQ;
+
while (copied < len) {
int bytes_read;
@@ -2064,6 +2103,12 @@ out_err:
if (cmsg_flags && copied >= 0) {
if (cmsg_flags & MPTCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
+
+ if (cmsg_flags & MPTCP_CMSG_INQ) {
+ unsigned int inq = mptcp_inq_hint(sk);
+
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
}
pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
@@ -2090,7 +2135,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
mptcp_schedule_work(sk);
} else {
/* delegate our work to tcp_release_cb() */
- set_bit(MPTCP_RETRANSMIT, &msk->flags);
+ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
}
bh_unlock_sock(sk);
sock_put(sk);
@@ -2198,6 +2243,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
return true;
}
+/* flags for __mptcp_close_ssk() */
+#define MPTCP_CF_PUSH BIT(1)
+#define MPTCP_CF_FASTCLOSE BIT(2)
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -2207,22 +2256,37 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
* parent socket.
*/
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow)
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- bool need_push;
+ bool need_push, dispose_it;
- list_del(&subflow->node);
+ dispose_it = !msk->subflow || ssk != msk->subflow->sk;
+ if (dispose_it)
+ list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ if (flags & MPTCP_CF_FASTCLOSE)
+ subflow->send_fastclose = 1;
+
+ need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
+ if (!dispose_it) {
+ tcp_disconnect(ssk, 0);
+ msk->subflow->state = SS_UNCONNECTED;
+ mptcp_subflow_ctx_reset(subflow);
+ release_sock(ssk);
+
+ goto out;
+ }
+
/* if we are invoked by the msk cleanup code, the subflow is
* already orphaned
*/
if (ssk->sk_socket)
sock_orphan(ssk);
- need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2242,14 +2306,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
sock_put(ssk);
- if (ssk == msk->last_snd)
- msk->last_snd = NULL;
-
if (ssk == msk->first)
msk->first = NULL;
- if (msk->subflow && ssk == msk->subflow->sk)
- mptcp_dispose_initial_subflow(msk);
+out:
+ if (ssk == msk->last_snd)
+ msk->last_snd = NULL;
if (need_push)
__mptcp_push_pending(sk, 0);
@@ -2260,7 +2322,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
{
if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
- __mptcp_close_ssk(sk, ssk, subflow);
+
+ /* subflow aborted before reaching the fully_established status
+ * attempt the creation of the next subflow
+ */
+ mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+
+ __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
}
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
@@ -2412,12 +2480,10 @@ static void mptcp_worker(struct work_struct *work)
goto unlock;
mptcp_check_data_fin_ack(sk);
- mptcp_flush_join_list(msk);
mptcp_check_fastclose(msk);
- if (msk->pm.status)
- mptcp_pm_nl_work(msk);
+ mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
@@ -2451,8 +2517,6 @@ static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- spin_lock_init(&msk->join_list_lock);
-
INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
@@ -2478,9 +2542,20 @@ static int __mptcp_init_sock(struct sock *sk)
return 0;
}
-static int mptcp_init_sock(struct sock *sk)
+static void mptcp_ca_reset(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_assign_congestion_control(sk);
+ strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+
+ /* no need to keep a reference to the ops, the name will suffice */
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = NULL;
+}
+
+static int mptcp_init_sock(struct sock *sk)
+{
struct net *net = sock_net(sk);
int ret;
@@ -2501,12 +2576,7 @@ static int mptcp_init_sock(struct sock *sk)
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
* propagate the correct value
*/
- tcp_assign_congestion_control(sk);
- strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
-
- /* no need to keep a reference to the ops, the name will suffice */
- tcp_cleanup_congestion_control(sk);
- icsk->icsk_ca_ops = NULL;
+ mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
@@ -2611,6 +2681,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
* state now
*/
if (__mptcp_check_fallback(msk)) {
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_close_wake_up(sk);
@@ -2619,7 +2690,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
}
}
- mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2652,21 +2722,20 @@ static void __mptcp_destroy_sock(struct sock *sk)
might_sleep();
- /* be sure to always acquire the join list lock, to sync vs
- * mptcp_finish_join().
- */
- spin_lock_bh(&msk->join_list_lock);
- list_splice_tail_init(&msk->join_list, &msk->conn_list);
- spin_unlock_bh(&msk->join_list_lock);
+ /* join list will eventually be flushed (with rst) at sock lock release time */
list_splice_init(&msk->conn_list, &conn_list);
sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
+ /* clears msk->subflow, allowing the following loop to close
+ * even the initial subflow
+ */
+ mptcp_dispose_initial_subflow(msk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __mptcp_close_ssk(sk, ssk, subflow);
+ __mptcp_close_ssk(sk, ssk, subflow, 0);
}
sk->sk_prot->destroy(sk);
@@ -2677,7 +2746,6 @@ static void __mptcp_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
- mptcp_dispose_initial_subflow(msk);
sock_put(sk);
}
@@ -2713,6 +2781,9 @@ cleanup:
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
+ if (mptcp_sk(sk)->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
@@ -2723,9 +2794,6 @@ cleanup:
if (do_cancel_work)
mptcp_cancel_work(sk);
- if (mptcp_sk(sk)->token)
- mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
-
sock_put(sk);
}
@@ -2757,15 +2825,38 @@ static int mptcp_disconnect(struct sock *sk, int flags)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_do_flush_join_list(msk);
+ inet_sk_state_store(sk, TCP_CLOSE);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- lock_sock(ssk);
- tcp_disconnect(ssk, flags);
- release_sock(ssk);
+ __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
}
+
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+
+ if (mptcp_sk(sk)->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
+ mptcp_destroy_common(msk);
+ msk->last_snd = NULL;
+ WRITE_ONCE(msk->flags, 0);
+ msk->cb_flags = 0;
+ msk->push_pending = 0;
+ msk->recovery = false;
+ msk->can_ack = false;
+ msk->fully_established = false;
+ msk->rcv_data_fin = false;
+ msk->snd_data_fin_enable = false;
+ msk->rcv_fastclose = false;
+ msk->use_64bit_ack = false;
+ WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ mptcp_pm_data_reset(msk);
+ mptcp_ca_reset(sk);
+
+ sk->sk_shutdown = 0;
+ sk_error_report(sk);
return 0;
}
@@ -2905,9 +2996,11 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
__mptcp_clear_xmit(sk);
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ mptcp_data_lock(sk);
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
+ mptcp_data_unlock(sk);
/* move all the rx fwd alloc into the sk_mem_reclaim_final in
* inet_sock_destruct() will dispose it
@@ -2931,7 +3024,7 @@ void __mptcp_data_acked(struct sock *sk)
if (!sock_owned_by_user(sk))
__mptcp_clean_una(sk);
else
- set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
if (mptcp_pending_data_fin_ack(sk))
mptcp_schedule_work(sk);
@@ -2950,20 +3043,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
else if (xmit_ssk)
mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
} else {
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
}
}
+#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
+ BIT(MPTCP_RETRANSMIT) | \
+ BIT(MPTCP_FLUSH_JOIN_LIST))
+
/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
+ __must_hold(&sk->sk_lock.slock)
{
- for (;;) {
- unsigned long flags = 0;
+ struct mptcp_sock *msk = mptcp_sk(sk);
- if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
- flags |= BIT(MPTCP_PUSH_PENDING);
- if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags))
- flags |= BIT(MPTCP_RETRANSMIT);
+ for (;;) {
+ unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
+ msk->push_pending;
if (!flags)
break;
@@ -2974,8 +3070,11 @@ static void mptcp_release_cb(struct sock *sk)
* datapath acquires the msk socket spinlock while helding
* the subflow socket lock
*/
-
+ msk->push_pending = 0;
+ msk->cb_flags &= ~flags;
spin_unlock_bh(&sk->sk_lock.slock);
+ if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
+ __mptcp_flush_join_list(sk);
if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT))
@@ -2988,11 +3087,11 @@ static void mptcp_release_cb(struct sock *sk)
/* be sure to set the current sk state before tacking actions
* depending on sk_state
*/
- if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags))
+ if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
__mptcp_set_connected(sk);
- if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+ if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
+ if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
__mptcp_update_rmem(sk);
@@ -3034,7 +3133,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk);
else
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
}
@@ -3120,8 +3219,7 @@ bool mptcp_finish_join(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *parent = (void *)msk;
- struct socket *parent_sock;
- bool ret;
+ bool ret = true;
pr_debug("msk=%p, subflow=%p", msk, subflow);
@@ -3134,35 +3232,38 @@ bool mptcp_finish_join(struct sock *ssk)
if (!msk->pm.server_side)
goto out;
- if (!mptcp_pm_allow_new_subflow(msk)) {
- subflow->reset_reason = MPTCP_RST_EPROHIBIT;
- return false;
- }
+ if (!mptcp_pm_allow_new_subflow(msk))
+ goto err_prohibited;
+
+ if (WARN_ON_ONCE(!list_empty(&subflow->node)))
+ goto err_prohibited;
- /* active connections are already on conn_list, and we can't acquire
- * msk lock here.
- * use the join list lock as synchronization point and double-check
- * msk status to avoid racing with __mptcp_destroy_sock()
+ /* active connections are already on conn_list.
+ * If we can't acquire msk socket lock here, let the release callback
+ * handle it
*/
- spin_lock_bh(&msk->join_list_lock);
- ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
- if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
- list_add_tail(&subflow->node, &msk->join_list);
+ mptcp_data_lock(parent);
+ if (!sock_owned_by_user(parent)) {
+ ret = __mptcp_finish_join(msk, ssk);
+ if (ret) {
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->conn_list);
+ }
+ } else {
sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->join_list);
+ __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
}
- spin_unlock_bh(&msk->join_list_lock);
+ mptcp_data_unlock(parent);
+
if (!ret) {
+err_prohibited:
subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
}
- /* attach to msk socket only after we are sure he will deal with us
- * at close time
- */
- parent_sock = READ_ONCE(parent->sk_socket);
- if (parent_sock && !ssk->sk_socket)
- mptcp_sock_graft(ssk, parent_sock);
subflow->map_seq = READ_ONCE(msk->ack_seq);
+
out:
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
return true;
@@ -3181,6 +3282,57 @@ static int mptcp_forward_alloc_get(const struct sock *sk)
return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
}
+static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
+{ /* returns msk->write_seq - v capped at INT_MAX; v is the sequence to measure from */
+ const struct sock *sk = (void *)msk;
+ u64 delta;
+
+ if (sk->sk_state == TCP_LISTEN) /* listening sockets are rejected */
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) /* SYN_SENT/SYN_RECV: report 0 */
+ return 0;
+
+ delta = msk->write_seq - v; /* unsigned 64-bit subtraction */
+ if (delta > INT_MAX) /* keep the result representable in the int return type */
+ delta = INT_MAX;
+
+ return (int)delta;
+}
+
+static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{ /* handles SIOCINQ, SIOCOUTQ and SIOCOUTQNSD; arg is a user pointer that receives an int */
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool slow;
+ int answ;
+
+ switch (cmd) {
+ case SIOCINQ: /* answer comes from mptcp_inq_hint() */
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+ __mptcp_move_skbs(msk); /* NOTE(review): presumably refreshes msk->receive_queue first - confirm */
+ answ = mptcp_inq_hint(sk);
+ release_sock(sk);
+ break;
+ case SIOCOUTQ: /* write_seq measured against snd_una */
+ slow = lock_sock_fast(sk);
+ answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
+ unlock_sock_fast(sk, slow);
+ break;
+ case SIOCOUTQNSD: /* write_seq measured against snd_nxt */
+ slow = lock_sock_fast(sk);
+ answ = mptcp_ioctl_outq(msk, msk->snd_nxt);
+ unlock_sock_fast(sk, slow);
+ break;
+ default: /* command not handled here */
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg); /* NOTE(review): a negative mptcp_ioctl_outq() result is copied out as data, not returned */
+}
+
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
@@ -3193,6 +3345,7 @@ static struct proto mptcp_prot = {
.shutdown = mptcp_shutdown,
.destroy = mptcp_destroy,
.sendmsg = mptcp_sendmsg,
+ .ioctl = mptcp_ioctl,
.recvmsg = mptcp_recvmsg,
.release_cb = mptcp_release_cb,
.hash = mptcp_hash,
@@ -3245,9 +3398,20 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
struct socket *ssock;
- int err;
+ int err = -EINVAL;
lock_sock(sock->sk);
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ goto unlock;
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = mptcp_disconnect(sock->sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto unlock;
+ }
+ }
+
if (sock->state != SS_UNCONNECTED && msk->subflow) {
/* pending connection or invalid state, let existing subflow
* cope with that
@@ -3257,10 +3421,8 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
}
ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
+ if (!ssock)
goto unlock;
- }
mptcp_token_destroy(msk);
inet_sk_state_store(sock->sk, TCP_SYN_SENT);
@@ -3334,17 +3496,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
pr_debug("msk=%p", msk);
- lock_sock(sock->sk);
- if (sock->sk->sk_state != TCP_LISTEN)
- goto unlock_fail;
-
ssock = __mptcp_nmpc_socket(msk);
if (!ssock)
- goto unlock_fail;
-
- clear_bit(MPTCP_DATA_READY, &msk->flags);
- sock_hold(ssock->sk);
- release_sock(sock->sk);
+ return -EINVAL;
err = ssock->ops->accept(sock, newsock, flags, kern);
if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
@@ -3374,7 +3528,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
- mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -3384,14 +3537,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
release_sock(newsk);
}
- if (inet_csk_listen_poll(ssock->sk))
- set_bit(MPTCP_DATA_READY, &msk->flags);
- sock_put(ssock->sk);
return err;
-
-unlock_fail:
- release_sock(sock->sk);
- return -EINVAL;
}
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
@@ -3437,8 +3583,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
state = inet_sk_state_load(sk);
pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
- if (state == TCP_LISTEN)
- return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0;
+ if (state == TCP_LISTEN) {
+ if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
+ return 0;
+
+ return inet_csk_listen_poll(msk->subflow->sk);
+ }
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d87cc040352e..0e6b42c76ea0 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -110,19 +110,20 @@
/* MPTCP TCPRST flags */
#define MPTCP_RST_TRANSIENT BIT(0)
-/* MPTCP socket flags */
-#define MPTCP_DATA_READY 0
+/* MPTCP socket atomic flags */
#define MPTCP_NOSPACE 1
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5
-#define MPTCP_PUSH_PENDING 6
-#define MPTCP_CLEAN_UNA 7
-#define MPTCP_ERROR_REPORT 8
-#define MPTCP_RETRANSMIT 9
-#define MPTCP_WORK_SYNC_SETSOCKOPT 10
-#define MPTCP_CONNECTED 11
+
+/* MPTCP socket release cb flags */
+#define MPTCP_PUSH_PENDING 1
+#define MPTCP_CLEAN_UNA 2
+#define MPTCP_ERROR_REPORT 3
+#define MPTCP_RETRANSMIT 4
+#define MPTCP_FLUSH_JOIN_LIST 5
+#define MPTCP_CONNECTED 6
static inline bool before64(__u64 seq1, __u64 seq2)
{
@@ -174,16 +175,25 @@ enum mptcp_pm_status {
MPTCP_PM_ADD_ADDR_SEND_ACK,
MPTCP_PM_RM_ADDR_RECEIVED,
MPTCP_PM_ESTABLISHED,
- MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
MPTCP_PM_SUBFLOW_ESTABLISHED,
+ MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
+ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
+ * accounted in id_avail_bitmap
+ */
};
+/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
+#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
+
enum mptcp_addr_signal_status {
MPTCP_ADD_ADDR_SIGNAL,
MPTCP_ADD_ADDR_ECHO,
MPTCP_RM_ADDR_SIGNAL,
};
+/* max value of mptcp_addr_info.id */
+#define MPTCP_PM_MAX_ADDR_ID U8_MAX
+
struct mptcp_pm_data {
struct mptcp_addr_info local;
struct mptcp_addr_info remote;
@@ -202,6 +212,7 @@ struct mptcp_pm_data {
u8 local_addr_used;
u8 subflows;
u8 status;
+ DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_rm_list rm_list_tx;
struct mptcp_rm_list rm_list_rx;
};
@@ -241,6 +252,8 @@ struct mptcp_sock {
u32 token;
int rmem_released;
unsigned long flags;
+ unsigned long cb_flags;
+ unsigned long push_pending;
bool recovery; /* closing subflow write queue reinjected */
bool can_ack;
bool fully_established;
@@ -249,7 +262,9 @@ struct mptcp_sock {
bool rcv_fastclose;
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
bool csum_enabled;
- spinlock_t join_list_lock;
+ u8 recvmsg_inq:1,
+ cork:1,
+ nodelay:1;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -392,6 +407,10 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
+
+ char reset_start[0];
+
+ unsigned long avg_pacing_rate; /* protected by msk socket lock */
u64 local_key;
u64 remote_key;
u64 idsn;
@@ -419,6 +438,7 @@ struct mptcp_subflow_context {
backup : 1,
send_mp_prio : 1,
send_mp_fail : 1,
+ send_fastclose : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
disposable : 1, /* ctx can be free at ulp release time */
@@ -437,6 +457,9 @@ struct mptcp_subflow_context {
u8 stale_count;
long delegated_status;
+
+ char reset_end[0];
+
struct list_head delegated_node; /* link into delegated_action, protected by local BH */
u32 setsockopt_seq;
@@ -468,6 +491,13 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
return subflow->tcp_sock;
}
+static inline void
+mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
+{ /* zero the context fields lying between the reset_start and reset_end markers */
+ memset(subflow->reset_start, 0, subflow->reset_end - subflow->reset_start); /* length is the byte distance between the two markers */
+ subflow->request_mptcp = 1; /* set again after the memset above */
+}
+
static inline u64
mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
{
@@ -482,15 +512,6 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
}
-static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- sock_hold(mptcp_subflow_tcp_sock(subflow));
- spin_lock_bh(&msk->join_list_lock);
- list_add_tail(&subflow->node, &msk->join_list);
- spin_unlock_bh(&msk->join_list_lock);
-}
-
void mptcp_subflow_process_delegated(struct sock *ssk);
static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action)
@@ -554,6 +575,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
+void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
@@ -654,7 +676,6 @@ void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
-void __mptcp_flush_join_list(struct mptcp_sock *msk);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{
return READ_ONCE(msk->snd_data_fin_enable) &&
@@ -704,9 +725,11 @@ void mptcp_token_destroy(struct mptcp_sock *msk);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
+u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
+void mptcp_pm_data_reset(struct mptcp_sock *msk);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
@@ -714,7 +737,9 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk,
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk);
-void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
+bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+ const struct mptcp_subflow_context *subflow);
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
@@ -811,7 +836,7 @@ unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
-void mptcp_sockopt_sync_all(struct mptcp_sock *msk);
+void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index f8efd478ac97..dacf3cee0027 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -390,6 +390,8 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
switch (optname) {
case IPV6_V6ONLY:
+ case IPV6_TRANSPARENT:
+ case IPV6_FREEBIND:
lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
if (!ssock) {
@@ -398,8 +400,24 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
}
ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
- if (ret == 0)
+ if (ret != 0) {
+ release_sock(sk);
+ return ret;
+ }
+
+ sockopt_seq_inc(msk);
+
+ switch (optname) {
+ case IPV6_V6ONLY:
sk->sk_ipv6only = ssock->sk->sk_ipv6only;
+ break;
+ case IPV6_TRANSPARENT:
+ inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent;
+ break;
+ case IPV6_FREEBIND:
+ inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind;
+ break;
+ }
release_sock(sk);
break;
@@ -538,6 +556,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
case TCP_TIMESTAMP:
case TCP_NOTSENT_LOWAT:
case TCP_TX_DELAY:
+ case TCP_INQ:
return true;
}
@@ -549,7 +568,6 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE,
* are not supported fastopen is currently unsupported
*/
- /* TCP_INQ is currently unsupported, needs some recvmsg work */
}
return false;
}
@@ -597,14 +615,171 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
return ret;
}
+static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{ /* TCP_CORK: record the setting on the msk and apply it to every subflow */
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int val;
+
+ if (optlen < sizeof(int)) /* the caller must supply at least one int */
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk); /* NOTE(review): presumably flags subflows as needing a sockopt resync - confirm */
+ msk->cork = !!val; /* normalised to 0/1 for the 1-bit field */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk); /* each subflow is updated under its own socket lock */
+ __tcp_sock_set_cork(ssk, !!val);
+ release_sock(ssk);
+ }
+ if (!val) /* only when uncorking */
+ mptcp_check_and_set_pending(sk);
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{ /* TCP_NODELAY: record the setting on the msk and apply it to every subflow */
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int val;
+
+ if (optlen < sizeof(int)) /* the caller must supply at least one int */
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk); /* NOTE(review): presumably flags subflows as needing a sockopt resync - confirm */
+ msk->nodelay = !!val; /* normalised to 0/1 for the 1-bit field */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk); /* each subflow is updated under its own socket lock */
+ __tcp_sock_set_nodelay(ssk, !!val);
+ release_sock(ssk);
+ }
+ if (val) /* only when nodelay is being enabled */
+ mptcp_check_and_set_pending(sk);
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{ /* IP_FREEBIND / IP_TRANSPARENT: set on the msk, then copy to the __mptcp_nmpc_socket() socket */
+ struct sock *sk = (struct sock *)msk;
+ struct inet_sock *issk;
+ struct socket *ssock;
+ int err;
+
+ err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); /* applied to the msk itself first */
+ if (err != 0)
+ return err;
+
+ lock_sock(sk);
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) { /* NOTE(review): the msk-level option set above is not rolled back on this path */
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ issk = inet_sk(ssock->sk);
+
+ switch (optname) {
+ case IP_FREEBIND:
+ issk->freebind = inet_sk(sk)->freebind; /* copy the value the msk ended up with */
+ break;
+ case IP_TRANSPARENT:
+ issk->transparent = inet_sk(sk)->transparent; /* copy the value the msk ended up with */
+ break;
+ default: /* any optname other than the two above */
+ release_sock(sk);
+ WARN_ON_ONCE(1);
+ return -EOPNOTSUPP;
+ }
+
+ sockopt_seq_inc(msk);
+ release_sock(sk);
+ return 0;
+}
+
+static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{ /* IP_TOS: set on the msk, then push the resulting tos value to every subflow */
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int err, val;
+
+ err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); /* applied to the msk itself first */
+
+ if (err != 0)
+ return err;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ val = inet_sk(sk)->tos; /* read back the value stored on the msk */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ __ip_sock_set_tos(ssk, val); /* NOTE(review): no ssk lock taken here, unlike the cork/nodelay setters - confirm this is safe */
+ }
+ release_sock(sk);
+
+ return err; /* err is 0 on this path */
+}
+
+static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{ /* SOL_IP dispatcher: only the options listed below are handled */
+ switch (optname) {
+ case IP_FREEBIND:
+ case IP_TRANSPARENT: /* both share one helper */
+ return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen);
+ case IP_TOS:
+ return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
+ }
+
+ return -EOPNOTSUPP; /* every other SOL_IP option */
+}
+
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen)
{
+ struct sock *sk = (void *)msk;
+ int ret, val;
+
switch (optname) {
+ case TCP_INQ:
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ lock_sock(sk);
+ msk->recvmsg_inq = !!val;
+ release_sock(sk);
+ return 0;
case TCP_ULP:
return -EOPNOTSUPP;
case TCP_CONGESTION:
return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
+ case TCP_CORK:
+ return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
+ case TCP_NODELAY:
+ return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
}
return -EOPNOTSUPP;
@@ -636,6 +811,9 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname,
if (ssk)
return tcp_setsockopt(ssk, level, optname, optval, optlen);
+ if (level == SOL_IP)
+ return mptcp_setsockopt_v4(msk, optname, optval, optlen);
+
if (level == SOL_IPV6)
return mptcp_setsockopt_v6(msk, optname, optval, optlen);
@@ -931,6 +1109,35 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o
return 0;
}
+static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen, int val)
+{ /* copy val out through the user's optval/optlen pair; msk is not referenced in the body */
+ int len;
+
+ if (get_user(len, optlen)) /* size of the user buffer */
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { /* short buffer and the value fits in one byte */
+ unsigned char ucval = (unsigned char)val;
+
+ len = 1; /* report a single byte written */
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ucval, 1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, len, sizeof(int)); /* never copy more than one int */
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len)) /* copies only the first len bytes of val when len < sizeof(int) */
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
char __user *optval, int __user *optlen)
{
@@ -941,7 +1148,26 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_CC_INFO:
return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
+ case TCP_INQ:
+ return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq);
+ case TCP_CORK:
+ return mptcp_put_int_option(msk, optval, optlen, msk->cork);
+ case TCP_NODELAY:
+ return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = (void *)msk;
+
+ switch (optname) {
+ case IP_TOS:
+ return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
}
+
return -EOPNOTSUPP;
}
@@ -980,6 +1206,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
if (ssk)
return tcp_getsockopt(ssk, level, optname, optval, option);
+ if (level == SOL_IP)
+ return mptcp_getsockopt_v4(msk, optname, optval, option);
if (level == SOL_TCP)
return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
if (level == SOL_MPTCP)
@@ -1002,6 +1230,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
ssk->sk_priority = sk->sk_priority;
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
+ __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
@@ -1027,6 +1256,11 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops)
tcp_set_congestion_control(ssk, msk->ca_name, false, true);
+ __tcp_sock_set_cork(ssk, !!msk->cork);
+ __tcp_sock_set_nodelay(ssk, !!msk->nodelay);
+
+ inet_sk(ssk)->transparent = inet_sk(sk)->transparent;
+ inet_sk(ssk)->freebind = inet_sk(sk)->freebind;
}
static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
@@ -1051,27 +1285,15 @@ void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
}
}
-void mptcp_sockopt_sync_all(struct mptcp_sock *msk)
+void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
{
- struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- u32 seq;
-
- seq = sockopt_seq_reset(sk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- u32 sseq = READ_ONCE(subflow->setsockopt_seq);
+ msk_owned_by_me(msk);
- if (sseq != msk->setsockopt_seq) {
- __mptcp_sockopt_sync(msk, ssk);
- WRITE_ONCE(subflow->setsockopt_seq, seq);
- } else if (sseq != seq) {
- WRITE_ONCE(subflow->setsockopt_seq, seq);
- }
+ if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
+ sync_socket_options(msk, ssk);
- cond_resched();
+ subflow->setsockopt_seq = msk->setsockopt_seq;
}
-
- msk->setsockopt_seq = seq;
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 6172f380dfb7..bea47a1180dc 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -388,7 +388,7 @@ static void mptcp_set_connected(struct sock *sk)
if (!sock_owned_by_user(sk))
__mptcp_set_connected(sk);
else
- set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
@@ -845,9 +845,8 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *
bool csum_reqd)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct csum_pseudo_header header;
u32 offset, seq, delta;
- __wsum csum;
+ u16 csum;
int len;
if (!csum_reqd)
@@ -908,13 +907,11 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *
* while the pseudo header requires the original DSS data len,
* including that
*/
- header.data_seq = cpu_to_be64(subflow->map_seq);
- header.subflow_seq = htonl(subflow->map_subflow_seq);
- header.data_len = htons(subflow->map_data_len + subflow->map_data_fin);
- header.csum = 0;
-
- csum = csum_partial(&header, sizeof(header), subflow->map_data_csum);
- if (unlikely(csum_fold(csum))) {
+ csum = __mptcp_make_csum(subflow->map_seq,
+ subflow->map_subflow_seq,
+ subflow->map_data_len + subflow->map_data_fin,
+ subflow->map_data_csum);
+ if (unlikely(csum)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
subflow->send_mp_fail = 1;
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX);
@@ -1274,7 +1271,7 @@ static void subflow_error_report(struct sock *ssk)
if (!sock_owned_by_user(sk))
__mptcp_error_report(sk);
else
- set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
@@ -1293,7 +1290,6 @@ static void subflow_data_ready(struct sock *sk)
if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
return;
- set_bit(MPTCP_DATA_READY, &msk->flags);
parent->sk_data_ready(parent);
return;
}
@@ -1425,6 +1421,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
+ mptcp_sockopt_sync(msk, ssk);
+
ssk->sk_bound_dev_if = ifindex;
err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
if (err)
@@ -1440,8 +1438,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
- mptcp_add_pending_subflow(msk, subflow);
- mptcp_sockopt_sync(msk, ssk);
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->conn_list);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS)
goto failed_unlink;
@@ -1452,9 +1450,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
return err;
failed_unlink:
- spin_lock_bh(&msk->join_list_lock);
list_del(&subflow->node);
- spin_unlock_bh(&msk->join_list_lock);
sock_put(mptcp_subflow_tcp_sock(subflow));
failed:
@@ -1533,10 +1529,8 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
* needs it.
*/
sf->sk->sk_net_refcnt = 1;
- get_net(net);
-#ifdef CONFIG_PROC_FS
- this_cpu_add(*net->core.sock_inuse, 1);
-#endif
+ get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
err = tcp_set_ulp(sf->sk, "mptcp");
release_sock(sf->sk);
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index e581b341c5be..f52ee7b26aed 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -384,6 +384,7 @@ void mptcp_token_destroy(struct mptcp_sock *msk)
bucket->chain_len--;
}
spin_unlock_bh(&bucket->lock);
+ WRITE_ONCE(msk->token, 0);
}
void __init mptcp_token_init(void)