From b841b901c452d92610f739a36e54978453528876 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:10 +0100 Subject: net: Declare MSG_SPLICE_PAGES internal sendmsg() flag Declare MSG_SPLICE_PAGES, an internal sendmsg() flag, that hints to a network protocol that it should splice pages from the source iterator rather than copying the data if it can. This flag is added to a list that is cleared by sendmsg syscalls on entry. This is intended as a replacement for the ->sendpage() op, allowing a way to splice in several multipage folios in one go. Signed-off-by: David Howells Reviewed-by: Willem de Bruijn cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/socket.h') diff --git a/include/linux/socket.h b/include/linux/socket.h index 13c3a237b9c9..bd1cc3238851 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -327,6 +327,7 @@ struct ucred { */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through @@ -337,6 +338,8 @@ struct ucred { #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif +/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ +#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 -- cgit v1.2.3 From 4fe38acdac8a71f7cccf347a2e9902bc818ecef7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:07 +0100 Subject: net: Block MSG_SENDPAGE_* from being passed to sendmsg() by userspace It is necessary to allow MSG_SENDPAGE_* to be passed into ->sendmsg() to allow sendmsg(MSG_SPLICE_PAGES) to replace ->sendpage(). Unblocking them in the network protocol, however, allows these flags to be passed in by userspace too[1]. Fix this by marking MSG_SENDPAGE_NOPOLICY, MSG_SENDPAGE_NOTLAST and MSG_SENDPAGE_DECRYPTED as internal flags, which causes sendmsg() to object if they are passed to sendmsg() by userspace. Network protocol ->sendmsg() implementations can then allow them through. Note that it should be possible to remove MSG_SENDPAGE_NOTLAST once sendpage is removed as a whole slew of pages will be passed in in one go by splice through sendmsg, with MSG_MORE being set if it has more data waiting in the pipe. Signed-off-by: David Howells cc: Chuck Lever cc: Boris Pismenny cc: John Fastabend cc: Jens Axboe cc: Matthew Wilcox Link: https://lore.kernel.org/r/20230526181338.03a99016@kernel.org/ [1] Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/socket.h') diff --git a/include/linux/socket.h b/include/linux/socket.h index bd1cc3238851..3fd3436bc09f 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -339,7 +339,9 @@ struct ucred { #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ -#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) +#define MSG_INTERNAL_SENDMSG_FLAGS \ + (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \ + MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 -- cgit v1.2.3 From 5e2ff6704a275be009be8979af17c52361b79b89 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 8 Jun 2023 22:26:25 +0200 Subject: scm: add SO_PASSPIDFD and SCM_PIDFD Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS, but it contains pidfd instead of plain pid, which allows programmers not to care about PID reuse problem. We mask SO_PASSPIDFD feature if CONFIG_UNIX is not builtin because it depends on a pidfd_prepare() API which is not exported to the kernel modules. Idea comes from UAPI kernel group: https://uapi-group.org/kernel-features/ Big thanks to Christian Brauner and Lennart Poettering for productive discussions about this. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: David Ahern Cc: Arnd Bergmann Cc: Kees Cook Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-arch@vger.kernel.org Tested-by: Luca Boccassi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/linux/net.h | 1 + include/linux/socket.h | 1 + include/net/scm.h | 39 +++++++++++++++++++++++++++++++-- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 11 ++++++++++ net/mptcp/sockopt.c | 1 + net/unix/af_unix.c | 18 ++++++++++----- tools/include/uapi/asm-generic/socket.h | 2 ++ 12 files changed, 76 insertions(+), 7 deletions(-) (limited to 'include/linux/socket.h') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 739891b94136..ff310613ae64 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -137,6 +137,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 18f3d95ecfec..762dcb80e4ec 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -148,6 +148,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index f486d3dfb6bb..df16a3e16d64 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -129,6 +129,8 @@ #define SO_RCVMARK 0x4049 +#define SO_PASSPIDFD 0x404A + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 2fda57a3ea86..6e2847804fea 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -130,6 +130,8 @@ #define SO_RCVMARK 0x0054 +#define SO_PASSPIDFD 0x0055 + #if !defined(__KERNEL__) diff --git a/include/linux/net.h b/include/linux/net.h index 8defc8f1d82e..23324e9a2b3d 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -43,6 +43,7 @@ struct net; #define SOCK_PASSSEC 4 #define SOCK_SUPPORT_ZC 5 #define SOCK_CUSTOM_SOCKOPT 6 +#define SOCK_PASSPIDFD 7 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/include/linux/socket.h b/include/linux/socket.h index 3fd3436bc09f..58204700018a 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg) #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ +#define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; diff --git a/include/net/scm.h b/include/net/scm.h index 585adc1346bd..c67f765a165b 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock) } #endif /* CONFIG_SECURITY_NETWORK */ +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int pidfd; + + /* + * put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. + */ + if ((msg->msg_controllen <= sizeof(struct cmsghdr)) || + (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + WARN_ON_ONCE(!scm->pid); + pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp || - scm_has_secdata(sock)) + if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || + scm->fp || scm_has_secdata(sock)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return; @@ -141,6 +173,9 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } + if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + scm_pidfd_recv(msg, scm); + scm_destroy_cred(scm); scm_passec(sock, msg, scm); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 638230899e98..b76169fdb80b 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -132,6 +132,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 24f2761bdb1d..ed4eb4ba738b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1246,6 +1246,13 @@ set_sndbuf: clear_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + if (valbool) + set_bit(SOCK_PASSPIDFD, &sock->flags); + else + clear_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1732,6 +1739,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_PEERCRED: { struct ucred peercred; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4258869ac48..e172a5848b0d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -355,6 +355,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: + case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 653136d68b32..c46c2f5d860c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1361,7 +1361,8 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { err = unix_autobind(sk); if (err) @@ -1469,7 +1470,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -1670,6 +1672,8 @@ static void unix_sock_inherit_flags(const struct socket *old, { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); + if (test_bit(SOCK_PASSPIDFD, &old->flags)) + set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } @@ -1819,8 +1823,10 @@ static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags); + test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || + test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* @@ -1904,7 +1910,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -2718,7 +2725,8 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { + } else if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index 8756df13be50..fbbc4bf53ee3 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -121,6 +121,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From b848b26c6672c9b977890ba85f5a155e5eb221f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2023 23:55:13 +0100 Subject: net: Kill MSG_SENDPAGE_NOTLAST Now that ->sendpage() has been removed, MSG_SENDPAGE_NOTLAST can be cleaned up. Things were converted to use MSG_MORE instead, but the protocol sendpage stubs still convert MSG_SENDPAGE_NOTLAST to MSG_MORE, which is now unnecessary. Signed-off-by: David Howells cc: Jens Axboe cc: Matthew Wilcox cc: linux-afs@lists.infradead.org cc: mptcp@lists.linux.dev cc: rds-devel@oss.oracle.com cc: tipc-discussion@lists.sourceforge.net cc: virtualization@lists.linux-foundation.org Link: https://lore.kernel.org/r/20230623225513.2732256-17-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 4 +--- net/tls/tls_device.c | 3 +-- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 2 +- tools/perf/trace/beauty/include/linux/socket.h | 1 - tools/perf/trace/beauty/msg_flags.c | 5 +---- 6 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux/socket.h') diff --git a/include/linux/socket.h b/include/linux/socket.h index 58204700018a..39b74d83c7c4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -319,7 +319,6 @@ struct ucred { #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ -#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ @@ -341,8 +340,7 @@ struct ucred { /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ #define MSG_INTERNAL_SENDMSG_FLAGS \ - (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \ - MSG_SENDPAGE_DECRYPTED) + (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 840ee06f1708..2021fe557e50 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -441,8 +441,7 @@ static int tls_push_data(struct sock *sk, long timeo; if (flags & - ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | - MSG_SPLICE_PAGES)) + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES)) return -EOPNOTSUPP; if (unlikely(sk->sk_err)) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d5ed4d47b16e..b6896126bb92 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -127,7 +127,7 @@ int tls_push_sg(struct sock *sk, { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SENDPAGE_NOTLAST | MSG_SPLICE_PAGES | flags, + .msg_flags = MSG_SPLICE_PAGES | flags, }; int ret = 0; struct page *p; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9b3aa89a4292..53f944e6d8ef 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1194,7 +1194,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) + MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; ret = mutex_lock_interruptible(&tls_ctx->tx_lock); diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 13c3a237b9c9..3bef212a24d7 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -318,7 +318,6 @@ struct ucred { #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ -#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c index ea68db08b8e7..5cdebd7ece7e 100644 --- a/tools/perf/trace/beauty/msg_flags.c +++ b/tools/perf/trace/beauty/msg_flags.c @@ -8,9 +8,6 @@ #ifndef MSG_WAITFORONE #define MSG_WAITFORONE 0x10000 #endif -#ifndef MSG_SENDPAGE_NOTLAST -#define MSG_SENDPAGE_NOTLAST 0x20000 -#endif #ifndef MSG_FASTOPEN #define MSG_FASTOPEN 0x20000000 #endif @@ -50,7 +47,7 @@ static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, P_MSG_FLAG(NOSIGNAL); P_MSG_FLAG(MORE); P_MSG_FLAG(WAITFORONE); - P_MSG_FLAG(SENDPAGE_NOTLAST); + P_MSG_FLAG(SPLICE_PAGES); P_MSG_FLAG(FASTOPEN); P_MSG_FLAG(CMSG_CLOEXEC); #undef P_MSG_FLAG -- cgit v1.2.3