From 5e2ff6704a275be009be8979af17c52361b79b89 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 8 Jun 2023 22:26:25 +0200 Subject: scm: add SO_PASSPIDFD and SCM_PIDFD Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS, but it contains pidfd instead of plain pid, which allows programmers not to care about PID reuse problem. We mask SO_PASSPIDFD feature if CONFIG_UNIX is not builtin because it depends on a pidfd_prepare() API which is not exported to the kernel modules. Idea comes from UAPI kernel group: https://uapi-group.org/kernel-features/ Big thanks to Christian Brauner and Lennart Poettering for productive discussions about this. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: David Ahern Cc: Arnd Bergmann Cc: Kees Cook Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-arch@vger.kernel.org Tested-by: Luca Boccassi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/mptcp/sockopt.c') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4258869ac48..e172a5848b0d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -355,6 +355,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: + case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: -- cgit v1.2.3 From 38967f424b5be79c4c676712e5640d846efd07e3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:15 +0200 Subject: mptcp: track some aggregate data counters Currently there are no data transfer counters accounting for all the subflows used by a given MPTCP socket. 
The user-space can compute such figures aggregating the subflow info, but that is inaccurate if any subflow is closed before the MPTCP socket itself. Add the new counters in the MPTCP socket itself and expose them via the existing diag and sockopt. While touching mptcp_diag_fill_info(), acquire the relevant locks before fetching the msk data, to ensure better data consistency Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 5 +++++ net/mptcp/options.c | 10 ++++++++-- net/mptcp/protocol.c | 11 ++++++++++- net/mptcp/protocol.h | 4 ++++ net/mptcp/sockopt.c | 25 ++++++++++++++++++++----- 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'net/mptcp/sockopt.c') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 32af2d278cb4..a124be6ebbba 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -123,6 +123,11 @@ struct mptcp_info { __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; __u8 mptcpi_csum_enabled; + __u32 mptcpi_retransmits; + __u64 mptcpi_bytes_retrans; + __u64 mptcpi_bytes_sent; + __u64 mptcpi_bytes_received; + __u64 mptcpi_bytes_acked; }; /* diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4bdcd2b326bd..c254accb14de 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) return cur_seq; } +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) +{ + msk->bytes_acked += new_snd_una - msk->snd_una; + msk->snd_una = new_snd_una; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { - msk->snd_una = new_snd_una; + 
__mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } mptcp_data_unlock(sk); @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ - msk->snd_una = READ_ONCE(msk->snd_nxt); + __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9c756d675d4d..d5b8e488bce1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ + msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } + msk->bytes_received += end_seq - msk->ack_seq; msk->ack_seq = end_seq; moved = true; } @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. 
*/ - if (likely(after64(snd_nxt_new, msk->snd_nxt))) + if (likely(after64(snd_nxt_new, msk->snd_nxt))) { + msk->bytes_sent += snd_nxt_new - msk->snd_nxt; msk->snd_nxt = snd_nxt_new; + } } void mptcp_check_and_set_pending(struct sock *sk) @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk) } if (copied) { dfrag->already_sent = max(dfrag->already_sent, info.sent); + msk->bytes_retrans += copied; tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_acked = 0; + msk->bytes_received = 0; + msk->bytes_sent = 0; + msk->bytes_retrans = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 47b46602870e..27adfcc5aaa2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -262,10 +262,13 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 bytes_sent; u64 snd_nxt; + u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; + u64 bytes_retrans; int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; @@ -274,6 +277,7 @@ struct mptcp_sock { * recovery related fields are under data_lock * protection */ + u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index e172a5848b0d..fa5055d5b029 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -889,7 +889,9 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { + struct sock *sk = (struct sock *)msk; u32 flags = 0; + bool slow; memset(info, 0, sizeof(*info)); @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); 
info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + if (inet_sk_state_load(sk) == TCP_LISTEN) + return; + /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_subflows_max = @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + mptcp_data_lock(sk); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); + + slow = lock_sock_fast(sk); + info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_token = msk->token; + info->mptcpi_write_seq = msk->write_seq; + info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; + info->mptcpi_bytes_sent = msk->bytes_sent; + info->mptcpi_bytes_received = msk->bytes_received; + info->mptcpi_bytes_retrans = msk->bytes_retrans; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); -- cgit v1.2.3 From 492432074e4fce4f8880213bf009b47adbf94a3a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:18 +0200 Subject: mptcp: introduce MPTCP_FULL_INFO getsockopt Some user-space applications want to monitor the subflows utilization. Dumping the per subflow tcp_info is not enough, as the PM could close and re-create the subflows under-the-hood, fooling the accounting. Even checking the src/dst addresses used by each subflow could not be enough, because new subflows could re-use the same address/port of the just closed one. 
This patch introduces a new socket option, which allows dumping all the relevant information all-at-once (everything, everywhere...), in a consistent manner. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/388 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 24 +++++++++ net/mptcp/sockopt.c | 127 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 2 deletions(-) (limited to 'net/mptcp/sockopt.c') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index a124be6ebbba..ee9c49f949a2 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -249,9 +249,33 @@ struct mptcp_subflow_addrs { }; }; +struct mptcp_subflow_info { + __u32 id; + struct mptcp_subflow_addrs addrs; +}; + +struct mptcp_full_info { + __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */ + __u32 size_tcpinfo_user; + __u32 size_sfinfo_kernel; /* must be 0, set by kernel */ + __u32 size_sfinfo_user; + __u32 num_subflows; /* must be 0, set by kernel (real subflow count) */ + __u32 size_arrays_user; /* max subflows that userspace is interested in; + * the buffers at subflow_info/tcp_info + * are respectively at least: + * size_arrays * size_sfinfo_user + * size_arrays * size_tcpinfo_user + * bytes wide + */ + __aligned_u64 subflow_info; + __aligned_u64 tcp_info; + struct mptcp_info mptcp_info; +}; + /* MPTCP socket options */ #define MPTCP_INFO 1 #define MPTCP_TCPINFO 2 #define MPTCP_SUBFLOW_ADDRS 3 +#define MPTCP_FULL_INFO 4 #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index fa5055d5b029..63f7a09335c5 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,7 +14,8 @@ #include #include "protocol.h" -#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -981,7 +982,8 @@ 
static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, - char __user *optval, int __user *optlen) + char __user *optval, + int __user *optlen) { int len, copylen; @@ -1162,6 +1164,125 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_get_full_info(struct mptcp_full_info *mfi, + char __user *optval, + int __user *optlen) +{ + int len; + + BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != + MIN_FULL_INFO_OPTLEN_SIZE); + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < MIN_FULL_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(mfi, 0, sizeof(*mfi)); + if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) + return -EFAULT; + + if (mfi->size_tcpinfo_kernel || + mfi->size_sfinfo_kernel || + mfi->num_subflows) + return -EINVAL; + + if (mfi->size_sfinfo_user > INT_MAX || + mfi->size_tcpinfo_user > INT_MAX) + return -EINVAL; + + return len - MIN_FULL_INFO_OPTLEN_SIZE; +} + +static int mptcp_put_full_info(struct mptcp_full_info *mfi, + char __user *optval, + u32 copylen, + int __user *optlen) +{ + copylen += MIN_FULL_INFO_OPTLEN_SIZE; + if (put_user(copylen, optlen)) + return -EFAULT; + + if (copy_to_user(optval, mfi, copylen)) + return -EFAULT; + return 0; +} + +static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + unsigned int sfcount = 0, copylen = 0; + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + void __user *tcpinfoptr, *sfinfoptr; + struct mptcp_full_info mfi; + int len; + + len = mptcp_get_full_info(&mfi, optval, optlen); + if (len < 0) + return len; + + /* don't bother filling the mptcp info if there is not enough + * user-space-provided storage + */ + if (len > 0) { + mptcp_diag_fill_info(msk, &mfi.mptcp_info); + copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); + } + + mfi.size_tcpinfo_kernel = 
sizeof(struct tcp_info); + mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, + sizeof(struct tcp_info)); + sfinfoptr = u64_to_user_ptr(mfi.subflow_info); + mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); + mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, + sizeof(struct mptcp_subflow_info)); + tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_subflow_info sfinfo; + struct tcp_info tcp_info; + + if (sfcount++ >= mfi.size_arrays_user) + continue; + + /* fetch addr/tcp_info only if the user space buffers + * are wide enough + */ + memset(&sfinfo, 0, sizeof(sfinfo)); + sfinfo.id = subflow->subflow_id; + if (mfi.size_sfinfo_user > + offsetof(struct mptcp_subflow_info, addrs)) + mptcp_get_sub_addrs(ssk, &sfinfo.addrs); + if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) + goto fail_release; + + if (mfi.size_tcpinfo_user) { + tcp_get_info(ssk, &tcp_info); + if (copy_to_user(tcpinfoptr, &tcp_info, + mfi.size_tcpinfo_user)) + goto fail_release; + } + + tcpinfoptr += mfi.size_tcpinfo_user; + sfinfoptr += mfi.size_sfinfo_user; + } + release_sock(sk); + + mfi.num_subflows = sfcount; + if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) + return -EFAULT; + + return 0; + +fail_release: + release_sock(sk); + return -EFAULT; +} + static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { @@ -1235,6 +1356,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_FULL_INFO: + return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: -- cgit v1.2.3