summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/mptcp_diag.c7
-rw-r--r--net/mptcp/pm_netlink.c23
-rw-r--r--net/mptcp/protocol.c161
-rw-r--r--net/mptcp/protocol.h2
-rw-r--r--net/mptcp/sockopt.c19
5 files changed, 156 insertions, 56 deletions
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 7f9a71780437..8df1bdb647e2 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -81,15 +81,18 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct nlattr *bc = cb_data->inet_diag_nla_bc;
struct net *net = sock_net(skb->sk);
+ struct inet_hashinfo *hinfo;
int i;
- for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) {
+ hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+ for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk;
int num = 0;
- ilb = &tcp_hashinfo.lhash2[i];
+ ilb = &hinfo->lhash2[i];
rcu_read_lock();
spin_lock(&ilb->lock);
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 291b5da42fdb..9813ed0fde9b 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -796,7 +796,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
u8 rm_id = rm_list->ids[i];
bool removed = false;
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
u8 id = subflow->local_id;
@@ -1327,7 +1327,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
@@ -2218,17 +2218,17 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
.doit = mptcp_nl_cmd_add_addr,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_DEL_ADDR,
.doit = mptcp_nl_cmd_del_addr,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
.doit = mptcp_nl_cmd_flush_addrs,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_GET_ADDR,
@@ -2238,7 +2238,7 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_SET_LIMITS,
.doit = mptcp_nl_cmd_set_limits,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_GET_LIMITS,
@@ -2247,27 +2247,27 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_SET_FLAGS,
.doit = mptcp_nl_cmd_set_flags,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_ANNOUNCE,
.doit = mptcp_nl_cmd_announce,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_REMOVE,
.doit = mptcp_nl_cmd_remove,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_SUBFLOW_CREATE,
.doit = mptcp_nl_cmd_sf_create,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY,
.doit = mptcp_nl_cmd_sf_destroy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
};
@@ -2280,6 +2280,7 @@ static struct genl_family mptcp_genl_family __ro_after_init = {
.module = THIS_MODULE,
.small_ops = mptcp_pm_ops,
.n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f8897a70c11d..f599ad44ed24 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -662,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
skb = skb_peek(&ssk->sk_receive_queue);
if (!skb) {
- /* if no data is found, a racing workqueue/recvmsg
- * already processed the new data, stop here or we
- * can enter an infinite loop
+ /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+ * a different CPU can have already processed the pending
+ * data, stop here or we can enter an infinite loop
*/
if (!moved)
done = true;
@@ -672,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
if (__mptcp_check_fallback(msk)) {
- /* if we are running under the workqueue, TCP could have
- * collapsed skbs between dummy map creation and now
- * be sure to adjust the size
+ /* Under fallback skbs have no MPTCP extension and TCP could
+ * collapse them between the dummy map creation and the
+ * current dequeue. Be sure to adjust the map size.
*/
map_remaining = skb->len;
subflow->map_data_len = skb->len;
@@ -1544,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
struct mptcp_sendmsg_info info = {
.flags = flags,
};
+ bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len, copied = 0;
+ int len;
while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1580,8 +1581,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
goto out;
}
+ do_check_data_fin = true;
info.sent += ret;
- copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
@@ -1597,7 +1598,7 @@ out:
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
- if (copied)
+ if (do_check_data_fin)
__mptcp_check_send_data_fin(sk);
}
@@ -1676,6 +1677,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
+ struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1689,14 +1691,39 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
+ struct sock *ssk = ssock->sk;
+ int copied_syn = 0;
+
+ lock_sock(ssk);
+
+ ret = tcp_sendmsg_fastopen(ssk, msg, &copied_syn, len, NULL);
+ copied += copied_syn;
+ if (ret == -EINPROGRESS && copied_syn > 0) {
+ /* reflect the new state on the MPTCP socket */
+ inet_sk_state_store(sk, inet_sk_state_load(ssk));
+ release_sock(ssk);
+ goto out;
+ } else if (ret) {
+ release_sock(ssk);
+ goto do_error;
+ }
+ release_sock(ssk);
+ }
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
ret = sk_stream_wait_connect(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
+ ret = -EPIPE;
+ if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+ goto do_error;
+
pfrag = sk_page_frag(sk);
while (msg_data_left(msg)) {
@@ -1705,11 +1732,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
bool dfrag_collapsed;
size_t psize, offset;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
- ret = -EPIPE;
- goto out;
- }
-
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
*/
@@ -1741,7 +1763,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
ret = -EFAULT;
- goto out;
+ goto do_error;
}
/* data successfully copied into the write queue */
@@ -1773,7 +1795,7 @@ wait_for_memory:
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
if (copied)
@@ -1781,7 +1803,14 @@ wait_for_memory:
out:
release_sock(sk);
- return copied ? : ret;
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+
+ copied = sk_stream_error(sk, msg->msg_flags, ret);
+ goto out;
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
@@ -2284,8 +2313,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
- if (flags & MPTCP_CF_FASTCLOSE)
+ if (flags & MPTCP_CF_FASTCLOSE) {
+ /* be sure to force the tcp_disconnect() path,
+ * to generate the egress reset
+ */
+ ssk->sk_lingertime = 0;
+ sock_set_flag(ssk, SOCK_LINGER);
subflow->send_fastclose = 1;
+ }
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
@@ -2363,7 +2398,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
might_sleep();
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE)
@@ -2406,7 +2441,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
mptcp_token_destroy(msk);
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
bool slow;
@@ -2418,12 +2453,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
unlock_sock_fast(tcp_sk, slow);
}
+ /* Mirror the tcp_reset() error propagation */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+
inet_sk_state_store(sk, TCP_CLOSE);
sk->sk_shutdown = SHUTDOWN_MASK;
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
- mptcp_close_wake_up(sk);
+ /* the calling mptcp_worker will properly destroy the socket */
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ sk_error_report(sk);
}
static void __mptcp_retrans(struct sock *sk)
@@ -2529,6 +2583,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
mptcp_reset_timeout(msk, 0);
}
+static void mptcp_do_fastclose(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+ subflow, MPTCP_CF_FASTCLOSE);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -2557,11 +2621,15 @@ static void mptcp_worker(struct work_struct *work)
* closed, but we need the msk around to reply to incoming DATA_FIN,
* even if it is orphaned and in FIN_WAIT2 state
*/
- if (sock_flag(sk, SOCK_DEAD) &&
- (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- __mptcp_destroy_sock(sk);
- goto unlock;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ if (mptcp_check_close_timeout(sk)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ }
+ if (sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
}
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2802,6 +2870,18 @@ static void __mptcp_destroy_sock(struct sock *sk)
sock_put(sk);
}
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ /* Concurrent splices from sk_receive_queue into receive_queue will
+ * always show at least one non-empty queue when checked in this order.
+ */
+ if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+ skb_queue_empty_lockless(&msk->receive_queue))
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
bool __mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
@@ -2815,8 +2895,13 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_close_state(sk))
+ if (mptcp_check_readable(msk)) {
+ /* the msk has read data, do the MPTCP equivalent of TCP reset */
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ } else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
+ }
sk_stream_wait_close(sk, timeout);
@@ -3063,7 +3148,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
__mptcp_clear_xmit(sk);
/* join list will be eventually flushed (with rst) at sock lock release time */
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node)
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
@@ -3535,6 +3620,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
+ inet_sk(sock->sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
sock->state = ssock->state;
/* on successful connect, the msk state will be moved to established by
@@ -3632,18 +3718,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
return err;
}
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
-{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
-
- return EPOLLIN | EPOLLRDNORM;
-}
-
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
@@ -3685,13 +3759,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
mask |= mptcp_check_writeable(msk);
+ } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* cf tcp_poll() note about TFO */
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
if (sk->sk_err)
mask |= EPOLLERR;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 8f372b8f059c..c0b5b4628f65 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -314,6 +314,8 @@ struct mptcp_sock {
#define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
+ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 423d3826ca1e..c7cb68c725b2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -559,6 +559,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
case TCP_NOTSENT_LOWAT:
case TCP_TX_DELAY:
case TCP_INQ:
+ case TCP_FASTOPEN_CONNECT:
return true;
}
@@ -567,7 +568,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
* TCP_REPAIR_WINDOW are not supported, better avoid this mess
*/
- /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE,
+ /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_NO_COOKIE,
* are not supported fastopen is currently unsupported
*/
}
@@ -768,6 +769,19 @@ static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optv
return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen);
}
+static int mptcp_setsockopt_sol_tcp_fastopen_connect(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct socket *sock;
+
+ /* Limit to first subflow */
+ sock = __mptcp_nmpc_socket(msk);
+ if (!sock)
+ return -EINVAL;
+
+ return tcp_setsockopt(sock->sk, SOL_TCP, TCP_FASTOPEN_CONNECT, optval, optlen);
+}
+
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -796,6 +810,8 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
case TCP_DEFER_ACCEPT:
return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen);
+ case TCP_FASTOPEN_CONNECT:
+ return mptcp_setsockopt_sol_tcp_fastopen_connect(msk, optval, optlen);
}
return -EOPNOTSUPP;
@@ -1157,6 +1173,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_INFO:
case TCP_CC_INFO:
case TCP_DEFER_ACCEPT:
+ case TCP_FASTOPEN_CONNECT:
return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
case TCP_INQ: