From 78fa0d61d97a728d306b0c23d353c0e340756437 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:05 -0700 Subject: bpf, sockmap: Pass skb ownership through read_skb The read_skb hook calls consume_skb() now, but this means that if the recv_actor program wants to use the skb it needs to inc the ref cnt so that the consume_skb() doesn't kfree the sk_buff. This is problematic because in some error cases under memory pressure we may need to linearize the sk_buff from sk_psock_skb_ingress_enqueue(). Then we get this, skb_linearize() __pskb_pull_tail() pskb_expand_head() BUG_ON(skb_shared(skb)) Because we incremented users refcnt from sk_psock_verdict_recv() we hit the bug on with refcnt > 1 and trip it. To fix lets simply pass ownership of the sk_buff through the skb_read call. Then we can drop the consume from read_skb handlers and assume the verdict recv does any required kfree. Bug found while testing in our CI which runs in VMs that hit memory constraints rather regularly. William tested TCP read_skb handlers. [ 106.536188] ------------[ cut here ]------------ [ 106.536197] kernel BUG at net/core/skbuff.c:1693! [ 106.536479] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 106.536726] CPU: 3 PID: 1495 Comm: curl Not tainted 5.19.0-rc5 #1 [ 106.537023] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.16.0-1 04/01/2014 [ 106.537467] RIP: 0010:pskb_expand_head+0x269/0x330 [ 106.538585] RSP: 0018:ffffc90000138b68 EFLAGS: 00010202 [ 106.538839] RAX: 000000000000003f RBX: ffff8881048940e8 RCX: 0000000000000a20 [ 106.539186] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff8881048940e8 [ 106.539529] RBP: ffffc90000138be8 R08: 00000000e161fd1a R09: 0000000000000000 [ 106.539877] R10: 0000000000000018 R11: 0000000000000000 R12: ffff8881048940e8 [ 106.540222] R13: 0000000000000003 R14: 0000000000000000 R15: ffff8881048940e8 [ 106.540568] FS: 00007f277dde9f00(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 106.540954] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 106.541227] CR2: 00007f277eeede64 CR3: 000000000ad3e000 CR4: 00000000000006e0 [ 106.541569] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 106.541915] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 106.542255] Call Trace: [ 106.542383] [ 106.542487] __pskb_pull_tail+0x4b/0x3e0 [ 106.542681] skb_ensure_writable+0x85/0xa0 [ 106.542882] sk_skb_pull_data+0x18/0x20 [ 106.543084] bpf_prog_b517a65a242018b0_bpf_skskb_http_verdict+0x3a9/0x4aa9 [ 106.543536] ? migrate_disable+0x66/0x80 [ 106.543871] sk_psock_verdict_recv+0xe2/0x310 [ 106.544258] ? sk_psock_write_space+0x1f0/0x1f0 [ 106.544561] tcp_read_skb+0x7b/0x120 [ 106.544740] tcp_data_queue+0x904/0xee0 [ 106.544931] tcp_rcv_established+0x212/0x7c0 [ 106.545142] tcp_v4_do_rcv+0x174/0x2a0 [ 106.545326] tcp_v4_rcv+0xe70/0xf60 [ 106.545500] ip_protocol_deliver_rcu+0x48/0x290 [ 106.545744] ip_local_deliver_finish+0xa7/0x150 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reported-by: William Findlay Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-2-john.fastabend@gmail.com --- net/unix/af_unix.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cc695c9f09ec..e7728b57a8c7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2553,7 +2553,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; - int err, copied; + int err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); @@ -2561,10 +2561,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (!skb) return err; - copied = recv_actor(sk, skb); - kfree_skb(skb); - - return copied; + return recv_actor(sk, skb); } /* -- cgit v1.2.3 From 96449f90240713bd9bd653d6b15266a1044cfa7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:11 +0100 Subject: net: Pass max frags into skb_append_pagefrags() Pass the maximum number of fragments into skb_append_pagefrags() rather than using MAX_SKB_FRAGS so that it can be used from code that wants to specify sysctl_max_skb_frags. Signed-off-by: David Howells cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/unix/af_unix.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8cff3d817131..15011408c47c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1383,7 +1383,7 @@ static inline int skb_pad(struct sk_buff *skb, int pad) #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size); + int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6724a84ebb09..7f53dcb26ad3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4188,13 +4188,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size) + int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - } else if (i < MAX_SKB_FRAGS) { + } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 61892268e8a6..52fc840898d8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1450,7 +1450,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (len > size) len = size; - if (skb_append_pagefrags(skb, page, offset, len)) { + if (skb_append_pagefrags(skb, page, offset, len, + MAX_SKB_FRAGS)) { err = -EMSGSIZE; goto error; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cc695c9f09ec..dd55506b4632 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2349,7 +2349,7 @@ alloc_skb: newskb = NULL; } - if (skb_append_pagefrags(skb, page, offset, size)) { + if (skb_append_pagefrags(skb, page, offset, size, MAX_SKB_FRAGS)) { tail = skb; goto alloc_skb; } -- cgit v1.2.3 From a0dbf5f818f9082d2262c1f8a8c0aab68364341f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:24 +0100 Subject: af_unix: Support MSG_SPLICE_PAGES Make AF_UNIX sendmsg() support MSG_SPLICE_PAGES, splicing in pages from the source iterator if possible and copying the data in otherwise. This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction. Signed-off-by: David Howells cc: Kuniyuki Iwashima cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dd55506b4632..976bc1c5e11b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2200,19 +2200,25 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, while (sent < len) { size = len - sent; - /* Keep two messages in the pipe so it schedules better */ - size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { + skb = sock_alloc_send_pskb(sk, 0, 0, + msg->msg_flags & MSG_DONTWAIT, + &err, 0); + } else { + /* Keep two messages in the pipe so it schedules better */ + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - /* allow fallback to order-0 allocations */ - size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); + data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); - skb = sock_alloc_send_pskb(sk, size - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err, - get_order(UNIX_SKB_FRAGS_SZ)); + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); + } if (!skb) goto out_err; @@ -2224,13 +2230,24 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } fds_sent = true; - skb_put(skb, size - data_len); - skb->data_len = data_len; - skb->len = size; - err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); - if (err) { - kfree_skb(skb); - goto out_err; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { + err = skb_splice_from_iter(skb, &msg->msg_iter, size, + sk->sk_allocation); + if (err < 0) { + kfree_skb(skb); + goto out_err; + } + size = err; + refcount_add(size, &sk->sk_wmem_alloc); + } else { + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) { + kfree_skb(skb); + goto out_err; + } } unix_state_lock(other); -- cgit v1.2.3 From 57d44a354a43edba4ef9963327d4657d12edbfbc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:25 +0100 Subject: unix: Convert unix_stream_sendpage() to use MSG_SPLICE_PAGES Convert unix_stream_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than directly splicing in the pages itself. This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction. Signed-off-by: David Howells cc: Kuniyuki Iwashima cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 134 +++-------------------------------------------------- 1 file changed, 7 insertions(+), 127 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 976bc1c5e11b..115436ce1f8a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1839,24 +1839,6 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, } } -static int maybe_init_creds(struct scm_cookie *scm, - struct socket *socket, - const struct sock *other) -{ - int err; - struct msghdr msg = { .msg_controllen = 0 }; - - err = scm_send(socket, &msg, scm, false); - if (err) - return err; - - if (unix_passcred_enabled(socket, other)) { - scm->pid = get_pid(task_tgid(current)); - current_uid_gid(&scm->creds.uid, &scm->creds.gid); - } - return err; -} - static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { @@ -2292,117 +2274,15 @@ out_err: static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, int offset, size_t size, int flags) { - int err; - bool send_sigpipe = false; - bool init_scm = true; - struct scm_cookie scm; - struct sock *other, *sk = socket->sk; - struct sk_buff *skb, *newskb = NULL, *tail = NULL; - - if (flags & MSG_OOB) - return -EOPNOTSUPP; + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - other = unix_peer(sk); - if (!other || sk->sk_state != TCP_ESTABLISHED) - return -ENOTCONN; - - if (false) { -alloc_skb: - unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->iolock); - newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, - &err, 0); - if (!newskb) - goto err; - } - - /* we must acquire iolock as we modify already present - * skbs in the sk_receive_queue and mess with skb->len - */ - err = mutex_lock_interruptible(&unix_sk(other)->iolock); - if (err) { - err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; - goto err; - } - - if (sk->sk_shutdown & SEND_SHUTDOWN) { - err = -EPIPE; - send_sigpipe = true; - goto err_unlock; - } - - unix_state_lock(other); + if (flags & MSG_SENDPAGE_NOTLAST) + msg.msg_flags |= MSG_MORE; - if (sock_flag(other, SOCK_DEAD) || - other->sk_shutdown & RCV_SHUTDOWN) { - err = -EPIPE; - send_sigpipe = true; - goto err_state_unlock; - } - - if (init_scm) { - err = maybe_init_creds(&scm, socket, other); - if (err) - goto err_state_unlock; - init_scm = false; - } - - skb = skb_peek_tail(&other->sk_receive_queue); - if (tail && tail == skb) { - skb = newskb; - } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { - if (newskb) { - skb = newskb; - } else { - tail = skb; - goto alloc_skb; - } - } else if (newskb) { - /* this is fast path, we don't necessarily need to - * call to kfree_skb even though with newskb == NULL - * this - does no harm - */ - consume_skb(newskb); - newskb = NULL; - } - - if (skb_append_pagefrags(skb, page, offset, size, MAX_SKB_FRAGS)) { - tail = skb; - goto alloc_skb; - } - - skb->len += size; - skb->data_len += size; - skb->truesize += size; - refcount_add(size, &sk->sk_wmem_alloc); - - if (newskb) { - err = unix_scm_to_skb(&scm, skb, false); - if (err) - goto err_state_unlock; - spin_lock(&other->sk_receive_queue.lock); - __skb_queue_tail(&other->sk_receive_queue, newskb); - spin_unlock(&other->sk_receive_queue.lock); - } - - unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->iolock); - - other->sk_data_ready(other); - scm_destroy(&scm); - return size; - -err_state_unlock: - unix_state_unlock(other); -err_unlock: - mutex_unlock(&unix_sk(other)->iolock); -err: - kfree_skb(newskb); - if (send_sigpipe && !(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - if (!init_scm) - scm_destroy(&scm); - return err; + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return unix_stream_sendmsg(socket, &msg, size); } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, -- cgit v1.2.3 From 5e2ff6704a275be009be8979af17c52361b79b89 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 8 Jun 2023 22:26:25 +0200 Subject: scm: add SO_PASSPIDFD and SCM_PIDFD Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS, but it contains pidfd instead of plain pid, which allows programmers not to care about PID reuse problem. We mask SO_PASSPIDFD feature if CONFIG_UNIX is not builtin because it depends on a pidfd_prepare() API which is not exported to the kernel modules. Idea comes from UAPI kernel group: https://uapi-group.org/kernel-features/ Big thanks to Christian Brauner and Lennart Poettering for productive discussions about this. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: David Ahern Cc: Arnd Bergmann Cc: Kees Cook Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-arch@vger.kernel.org Tested-by: Luca Boccassi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/linux/net.h | 1 + include/linux/socket.h | 1 + include/net/scm.h | 39 +++++++++++++++++++++++++++++++-- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 11 ++++++++++ net/mptcp/sockopt.c | 1 + net/unix/af_unix.c | 18 ++++++++++----- tools/include/uapi/asm-generic/socket.h | 2 ++ 12 files changed, 76 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 739891b94136..ff310613ae64 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -137,6 +137,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 18f3d95ecfec..762dcb80e4ec 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -148,6 +148,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index f486d3dfb6bb..df16a3e16d64 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -129,6 +129,8 @@ #define SO_RCVMARK 0x4049 +#define SO_PASSPIDFD 0x404A + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 2fda57a3ea86..6e2847804fea 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -130,6 +130,8 @@ #define SO_RCVMARK 0x0054 +#define SO_PASSPIDFD 0x0055 + #if !defined(__KERNEL__) diff --git a/include/linux/net.h b/include/linux/net.h index 8defc8f1d82e..23324e9a2b3d 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -43,6 +43,7 @@ struct net; #define SOCK_PASSSEC 4 #define SOCK_SUPPORT_ZC 5 #define SOCK_CUSTOM_SOCKOPT 6 +#define SOCK_PASSPIDFD 7 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/include/linux/socket.h b/include/linux/socket.h index 3fd3436bc09f..58204700018a 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg) #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ +#define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; diff --git a/include/net/scm.h b/include/net/scm.h index 585adc1346bd..c67f765a165b 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock) } #endif /* CONFIG_SECURITY_NETWORK */ +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int pidfd; + + /* + * put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. + */ + if ((msg->msg_controllen <= sizeof(struct cmsghdr)) || + (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + WARN_ON_ONCE(!scm->pid); + pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp || - scm_has_secdata(sock)) + if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || + scm->fp || scm_has_secdata(sock)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return; @@ -141,6 +173,9 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } + if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + scm_pidfd_recv(msg, scm); + scm_destroy_cred(scm); scm_passec(sock, msg, scm); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 638230899e98..b76169fdb80b 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -132,6 +132,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 24f2761bdb1d..ed4eb4ba738b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1246,6 +1246,13 @@ set_sndbuf: clear_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + if (valbool) + set_bit(SOCK_PASSPIDFD, &sock->flags); + else + clear_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1732,6 +1739,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; + case SO_PASSPIDFD: + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + break; + case SO_PEERCRED: { struct ucred peercred; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4258869ac48..e172a5848b0d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -355,6 +355,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BROADCAST: case SO_BSDCOMPAT: case SO_PASSCRED: + case SO_PASSPIDFD: case SO_PASSSEC: case SO_RXQ_OVFL: case SO_WIFI_STATUS: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 653136d68b32..c46c2f5d860c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1361,7 +1361,8 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { err = unix_autobind(sk); if (err) @@ -1469,7 +1470,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -1670,6 +1672,8 @@ static void unix_sock_inherit_flags(const struct socket *old, { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); + if (test_bit(SOCK_PASSPIDFD, &old->flags)) + set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } @@ -1819,8 +1823,10 @@ static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags); + test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || + test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* @@ -1904,7 +1910,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + if ((test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); if (err) goto out; @@ -2718,7 +2725,8 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { + } else if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index 8756df13be50..fbbc4bf53ee3 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -121,6 +121,8 @@ #define SO_RCVMARK 75 +#define SO_PASSPIDFD 76 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From 7b26952a91cf65ff1cc867a2382a8964d8c0ee7d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 8 Jun 2023 22:26:26 +0200 Subject: net: core: add getsockopt SO_PEERPIDFD Add SO_PEERPIDFD which allows to get pidfd of peer socket holder pidfd. This thing is direct analog of SO_PEERCRED which allows to get plain PID. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: David Ahern Cc: Arnd Bergmann Cc: Kees Cook Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: Daniel Borkmann Cc: Stanislav Fomichev Cc: bpf@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-arch@vger.kernel.org Reviewed-by: Christian Brauner Acked-by: Stanislav Fomichev Tested-by: Luca Boccassi Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 1 + arch/mips/include/uapi/asm/socket.h | 1 + arch/parisc/include/uapi/asm/socket.h | 1 + arch/sparc/include/uapi/asm/socket.h | 1 + include/uapi/asm-generic/socket.h | 1 + net/core/sock.c | 33 +++++++++++++++++++++++++++++++++ net/unix/af_unix.c | 16 ++++++++++++++++ tools/include/uapi/asm-generic/socket.h | 1 + 8 files changed, 55 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index ff310613ae64..e94f621903fe 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -138,6 +138,7 @@ #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 +#define SO_PEERPIDFD 77 #if !defined(__KERNEL__) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 762dcb80e4ec..60ebaed28a4c 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -149,6 +149,7 @@ #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 +#define SO_PEERPIDFD 77 #if !defined(__KERNEL__) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index df16a3e16d64..be264c2b1a11 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -130,6 +130,7 @@ #define SO_RCVMARK 0x4049 #define SO_PASSPIDFD 0x404A +#define SO_PEERPIDFD 0x404B #if !defined(__KERNEL__) diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 6e2847804fea..682da3714686 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -131,6 +131,7 @@ #define SO_RCVMARK 0x0054 #define SO_PASSPIDFD 0x0055 +#define SO_PEERPIDFD 0x0056 #if !defined(__KERNEL__) diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index b76169fdb80b..8ce8a39a1e5f 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -133,6 +133,7 @@ #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 +#define SO_PEERPIDFD 77 #if !defined(__KERNEL__) diff --git a/net/core/sock.c b/net/core/sock.c index ed4eb4ba738b..ea66b1afadd0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1758,6 +1758,39 @@ int sk_getsockopt(struct sock *sk, int level, int optname, goto lenout; } + case SO_PEERPIDFD: + { + struct pid *peer_pid; + struct file *pidfd_file = NULL; + int pidfd; + + if (len > sizeof(pidfd)) + len = sizeof(pidfd); + + spin_lock(&sk->sk_peer_lock); + peer_pid = get_pid(sk->sk_peer_pid); + spin_unlock(&sk->sk_peer_lock); + + if (!peer_pid) + return -ESRCH; + + pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + put_pid(peer_pid); + if (pidfd < 0) + return pidfd; + + if (copy_to_sockptr(optval, &pidfd, len) || + copy_to_sockptr(optlen, &len, sizeof(int))) { + put_unused_fd(pidfd); + fput(pidfd_file); + + return -EFAULT; + } + + fd_install(pidfd, pidfd_file); + return 0; + } + case SO_PEERGROUPS: { const struct cred *cred; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c46c2f5d860c..73c61a010b01 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -921,11 +921,26 @@ static void unix_unhash(struct sock *sk) */ } +static bool unix_bpf_bypass_getsockopt(int level, int optname) +{ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_PEERPIDFD: + return true; + default: + return false; + } + } + + return false; +} + struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, + .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif @@ -937,6 +952,7 @@ struct proto unix_stream_proto = { .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, + .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index fbbc4bf53ee3..54d9c8bf7c55 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -122,6 +122,7 @@ #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 +#define SO_PEERPIDFD 77 #if !defined(__KERNEL__) -- cgit v1.2.3 From 3f5f118bb657f94641ea383c7c1b8c09a5d46ea2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 22 Jun 2023 11:43:51 -0700 Subject: af_unix: Call scm_recv() only after scm_set_cred(). syzkaller hit a WARN_ON_ONCE(!scm->pid) in scm_pidfd_recv(). In unix_stream_read_generic(), if there is no skb in the queue, we could bail out the do-while loop without calling scm_set_cred(): 1. No skb in the queue 2. sk is non-blocking or shutdown(sk, RCV_SHUTDOWN) is called concurrently or peer calls close() If the socket is configured with SO_PASSCRED or SO_PASSPIDFD, scm_recv() would populate cmsg with garbage. Let's not call scm_recv() unless there is skb to receive. WARNING: CPU: 1 PID: 3245 at include/net/scm.h:138 scm_pidfd_recv include/net/scm.h:138 [inline] WARNING: CPU: 1 PID: 3245 at include/net/scm.h:138 scm_recv.constprop.0+0x754/0x850 include/net/scm.h:177 Modules linked in: CPU: 1 PID: 3245 Comm: syz-executor.1 Not tainted 6.4.0-rc5-01219-gfa0e21fa4443 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:scm_pidfd_recv include/net/scm.h:138 [inline] RIP: 0010:scm_recv.constprop.0+0x754/0x850 include/net/scm.h:177 Code: 67 fd e9 55 fd ff ff e8 4a 70 67 fd e9 7f fd ff ff e8 40 70 67 fd e9 3e fb ff ff e8 36 70 67 fd e9 02 fd ff ff e8 8c 3a 20 fd <0f> 0b e9 fe fb ff ff e8 50 70 67 fd e9 2e f9 ff ff e8 46 70 67 fd RSP: 0018:ffffc90009af7660 EFLAGS: 00010216 RAX: 00000000000000a1 RBX: ffff888041e58a80 RCX: ffffc90003852000 RDX: 0000000000040000 RSI: ffffffff842675b4 RDI: 0000000000000007 RBP: ffffc90009af7810 R08: 0000000000000007 R09: 0000000000000013 R10: 00000000000000f8 R11: 0000000000000001 R12: ffffc90009af7db0 R13: 0000000000000000 R14: ffff888041e58a88 R15: 1ffff9200135eecc FS: 00007f6b7113f640(0000) GS:ffff88806cf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6b7111de38 CR3: 0000000012a6e002 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: unix_stream_read_generic+0x5fe/0x1f50 net/unix/af_unix.c:2830 unix_stream_recvmsg+0x194/0x1c0 net/unix/af_unix.c:2880 sock_recvmsg_nosec net/socket.c:1019 [inline] sock_recvmsg+0x188/0x1d0 net/socket.c:1040 ____sys_recvmsg+0x210/0x610 net/socket.c:2712 ___sys_recvmsg+0xff/0x190 net/socket.c:2754 do_recvmmsg+0x25d/0x6c0 net/socket.c:2848 __sys_recvmmsg net/socket.c:2927 [inline] __do_sys_recvmmsg net/socket.c:2950 [inline] __se_sys_recvmmsg net/socket.c:2943 [inline] __x64_sys_recvmmsg+0x224/0x290 net/socket.c:2943 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3f/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f6b71da2e5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 73 9f 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007f6b7113ecc8 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00000000004bc050 RCX: 00007f6b71da2e5d RDX: 0000000000000007 RSI: 0000000020006600 RDI: 000000000000000b RBP: 00000000004bc050 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000120 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f6b71e03530 R15: 0000000000000000 Fixes: 5e2ff6704a27 ("scm: add SO_PASSPIDFD and SCM_PIDFD") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20230622184351.91544-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 73c61a010b01..f9d196439b49 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2826,7 +2826,7 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (state->msg) + if (state->msg && check_creds) scm_recv(sock, state->msg, &scm, flags); else scm_destroy(&scm); -- cgit v1.2.3 From dc97391e661009eab46783030d2404c9b6e6f2e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2023 23:55:12 +0100 Subject: sock: Remove ->sendpage*() in favour of sendmsg(MSG_SPLICE_PAGES) Remove ->sendpage() and ->sendpage_locked(). sendmsg() with MSG_SPLICE_PAGES should be used instead. This allows multiple pages and multipage folios to be passed through. Signed-off-by: David Howells Acked-by: Marc Kleine-Budde # for net/can cc: Jens Axboe cc: Matthew Wilcox cc: linux-afs@lists.infradead.org cc: mptcp@lists.linux.dev cc: rds-devel@oss.oracle.com cc: tipc-discussion@lists.sourceforge.net cc: virtualization@lists.linux-foundation.org Link: https://lore.kernel.org/r/20230623225513.2732256-16-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/bpf/map_sockmap.rst | 10 ++--- Documentation/filesystems/locking.rst | 2 - Documentation/filesystems/vfs.rst | 1 - Documentation/networking/scaling.rst | 4 +- crypto/af_alg.c | 28 ------------- crypto/algif_aead.c | 22 ++-------- crypto/algif_rng.c | 2 - crypto/algif_skcipher.c | 14 ------- .../ethernet/chelsio/inline_crypto/chtls/chtls.h | 2 - .../chelsio/inline_crypto/chtls/chtls_io.c | 14 ------- .../chelsio/inline_crypto/chtls/chtls_main.c | 1 - fs/nfsd/vfs.c | 2 +- include/crypto/if_alg.h | 2 - include/linux/net.h | 8 ---- include/net/inet_common.h | 2 - include/net/sock.h | 6 --- include/net/tcp.h | 4 -- net/appletalk/ddp.c | 1 - net/atm/pvc.c | 1 - net/atm/svc.c | 1 - net/ax25/af_ax25.c | 1 - net/caif/caif_socket.c | 2 - net/can/bcm.c | 1 - net/can/isotp.c | 1 - net/can/j1939/socket.c | 1 - net/can/raw.c | 1 - net/core/sock.c | 35 +--------------- net/dccp/ipv4.c | 1 - net/dccp/ipv6.c | 1 - net/ieee802154/socket.c | 2 - net/ipv4/af_inet.c | 21 ---------- net/ipv4/tcp.c | 43 ++----------------- net/ipv4/tcp_bpf.c | 23 +---------- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/udp.c | 15 ------- net/ipv4/udp_impl.h | 2 - net/ipv4/udplite.c | 1 - net/ipv6/af_inet6.c | 3 -- net/ipv6/raw.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/kcm/kcmsock.c | 20 --------- net/key/af_key.c | 1 - net/l2tp/l2tp_ip.c | 1 - net/l2tp/l2tp_ip6.c | 1 - net/llc/af_llc.c | 1 - net/mctp/af_mctp.c | 1 - net/mptcp/protocol.c | 2 - net/netlink/af_netlink.c | 1 - net/netrom/af_netrom.c | 1 - net/packet/af_packet.c | 2 - net/phonet/socket.c | 2 - net/qrtr/af_qrtr.c | 1 - net/rds/af_rds.c | 1 - net/rose/af_rose.c | 1 - net/rxrpc/af_rxrpc.c | 1 - net/sctp/protocol.c | 1 - net/socket.c | 48 ---------------------- net/tipc/socket.c | 3 -- net/tls/tls.h | 6 --- net/tls/tls_device.c | 17 -------- net/tls/tls_main.c | 7 ---- net/tls/tls_sw.c | 35 ---------------- net/unix/af_unix.c | 19 --------- net/vmw_vsock/af_vsock.c | 3 -- net/x25/af_x25.c | 1 - net/xdp/xsk.c | 1 - 66 files changed, 20 insertions(+), 442 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/Documentation/bpf/map_sockmap.rst b/Documentation/bpf/map_sockmap.rst index cc92047c6630..2d630686a00b 100644 --- a/Documentation/bpf/map_sockmap.rst +++ b/Documentation/bpf/map_sockmap.rst @@ -240,11 +240,11 @@ offsets into ``msg``, respectively. If a program of type ``BPF_PROG_TYPE_SK_MSG`` is run on a ``msg`` it can only parse data that the (``data``, ``data_end``) pointers have already consumed. For ``sendmsg()`` hooks this is likely the first scatterlist element. But for -calls relying on the ``sendpage`` handler (e.g., ``sendfile()``) this will be -the range (**0**, **0**) because the data is shared with user space and by -default the objective is to avoid allowing user space to modify data while (or -after) BPF verdict is being decided. This helper can be used to pull in data -and to set the start and end pointers to given values. Data will be copied if +calls relying on MSG_SPLICE_PAGES (e.g., ``sendfile()``) this will be the +range (**0**, **0**) because the data is shared with user space and by default +the objective is to avoid allowing user space to modify data while (or after) +BPF verdict is being decided. This helper can be used to pull in data and to +set the start and end pointers to given values. Data will be copied if necessary (i.e., if data was not linear and if start and end pointers do not point to the same chunk). diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa1a233b0fa8..ed148919e11a 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -521,8 +521,6 @@ prototypes:: int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, - loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 769be5230210..cb2a97e49872 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1086,7 +1086,6 @@ This describes how the VFS can manipulate an open file. As of kernel int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 3d435caa3ef2..92c9fb46d6a2 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -269,8 +269,8 @@ a single application thread handles flows with many different flow hashes. rps_sock_flow_table is a global flow table that contains the *desired* CPU for flows: the CPU that is currently processing the flow in userspace. Each table value is a CPU index that is updated during calls to recvmsg -and sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() -and tcp_splice_read()). +and sendmsg (specifically, inet_recvmsg(), inet_sendmsg() and +tcp_splice_read()). When the scheduler moves a thread to a new CPU while it has outstanding receive packets on the old CPU, packets may arrive out of order. To diff --git a/crypto/af_alg.c b/crypto/af_alg.c index cdb1dcc5dd1a..6218c773d71c 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -482,7 +482,6 @@ static const struct proto_ops alg_proto_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, @@ -1106,33 +1105,6 @@ unlock: } EXPORT_SYMBOL_GPL(af_alg_sendmsg); -/** - * af_alg_sendpage - sendpage system call handler - * @sock: socket of connection to user space to write to - * @page: data to send - * @offset: offset into page to begin sending - * @size: length of data - * @flags: message send/receive flags - * - * This is a generic implementation of sendpage to fill ctx->tsgl_list. - */ -ssize_t af_alg_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { - .msg_flags = flags | MSG_SPLICE_PAGES, - }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return sock_sendmsg(sock, &msg); -} -EXPORT_SYMBOL_GPL(af_alg_sendpage); - /** * af_alg_free_resources - release resources required for crypto request * @areq: Request holding the TX and RX SGL diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 35bfa283748d..7d58cbbce4af 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -9,10 +9,10 @@ * The following concept of the memory management is used: * * The kernel maintains two SGLs, the TX SGL and the RX SGL. The TX SGL is - * filled by user space with the data submitted via sendpage. Filling up - * the TX SGL does not cause a crypto operation -- the data will only be - * tracked by the kernel. Upon receipt of one recvmsg call, the caller must - * provide a buffer which is tracked with the RX SGL. + * filled by user space with the data submitted via sendmsg (maybe with + * MSG_SPLICE_PAGES). Filling up the TX SGL does not cause a crypto operation + * -- the data will only be tracked by the kernel. Upon receipt of one recvmsg + * call, the caller must provide a buffer which is tracked with the RX SGL. * * During the processing of the recvmsg operation, the cipher request is * allocated and prepared. As part of the recvmsg operation, the processed @@ -370,7 +370,6 @@ static struct proto_ops algif_aead_ops = { .release = af_alg_release, .sendmsg = aead_sendmsg, - .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, .poll = af_alg_poll, }; @@ -422,18 +421,6 @@ static int aead_sendmsg_nokey(struct socket *sock, struct msghdr *msg, return aead_sendmsg(sock, msg, size); } -static ssize_t aead_sendpage_nokey(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - int err; - - err = aead_check_key(sock); - if (err) - return err; - - return af_alg_sendpage(sock, page, offset, size, flags); -} - static int aead_recvmsg_nokey(struct socket *sock, struct msghdr *msg, size_t ignored, int flags) { @@ -461,7 +448,6 @@ static struct proto_ops algif_aead_ops_nokey = { .release = af_alg_release, .sendmsg = aead_sendmsg_nokey, - .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, .poll = af_alg_poll, }; diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c index 407408c43730..10c41adac3b1 100644 --- a/crypto/algif_rng.c +++ b/crypto/algif_rng.c @@ -174,7 +174,6 @@ static struct proto_ops algif_rng_ops = { .bind = sock_no_bind, .accept = sock_no_accept, .sendmsg = sock_no_sendmsg, - .sendpage = sock_no_sendpage, .release = af_alg_release, .recvmsg = rng_recvmsg, @@ -192,7 +191,6 @@ static struct proto_ops __maybe_unused algif_rng_test_ops = { .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, - .sendpage = sock_no_sendpage, .release = af_alg_release, .recvmsg = rng_test_recvmsg, diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index b1f321b9f846..9ada9b741af8 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -194,7 +194,6 @@ static struct proto_ops algif_skcipher_ops = { .release = af_alg_release, .sendmsg = skcipher_sendmsg, - .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, .poll = af_alg_poll, }; @@ -246,18 +245,6 @@ static int skcipher_sendmsg_nokey(struct socket *sock, struct msghdr *msg, return skcipher_sendmsg(sock, msg, size); } -static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - int err; - - err = skcipher_check_key(sock); - if (err) - return err; - - return af_alg_sendpage(sock, page, offset, size, flags); -} - static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg, size_t ignored, int flags) { @@ -285,7 +272,6 @@ static struct proto_ops algif_skcipher_ops_nokey = { .release = af_alg_release, .sendmsg = skcipher_sendmsg_nokey, - .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, .poll = af_alg_poll, }; diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index da4818d2c856..68562a82d036 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -569,8 +569,6 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void chtls_splice_eof(struct socket *sock); -int chtls_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int send_tx_flowc_wr(struct sock *sk, int compl, u32 snd_nxt, u32 rcv_nxt); void chtls_tcp_push(struct sock *sk, int flags); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index e08ac960c967..5fc64e47568a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -1246,20 +1246,6 @@ void chtls_splice_eof(struct socket *sock) release_sock(sk); } -int chtls_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - struct bio_vec bvec; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return chtls_sendmsg(sk, &msg, size); -} - static void chtls_select_window(struct sock *sk) { struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 6b6787eafd2f..455a54708be4 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -607,7 +607,6 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.shutdown = chtls_shutdown; chtls_cpl_prot.sendmsg = chtls_sendmsg; chtls_cpl_prot.splice_eof = chtls_splice_eof; - chtls_cpl_prot.sendpage = chtls_sendpage; chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index db67f8e19344..8879e207ff5a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -936,7 +936,7 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, /* * Grab and keep cached pages associated with a file in the svc_rqst - * so that they can be passed to the network sendmsg/sendpage routines + * so that they can be passed to the network sendmsg routines * directly. They will be released after the sending has completed. * * Return values: Number of bytes consumed, or -EIO if there are no diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 34224e77f5a2..ef8ce86b1f78 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -229,8 +229,6 @@ void af_alg_wmem_wakeup(struct sock *sk); int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min); int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unsigned int ivsize); -ssize_t af_alg_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(void *data, int err); __poll_t af_alg_poll(struct file *file, struct socket *sock, diff --git a/include/linux/net.h b/include/linux/net.h index 23324e9a2b3d..41c608c1b02c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -207,8 +207,6 @@ struct proto_ops { size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); - ssize_t (*sendpage) (struct socket *sock, struct page *page, - int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void (*splice_eof)(struct socket *sock); @@ -222,8 +220,6 @@ struct proto_ops { sk_read_actor_t recv_actor); /* This is different from read_sock(), it reads an entire skb at a time. */ int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); - int (*sendpage_locked)(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); @@ -341,10 +337,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); -int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); /* Routine returns the IP overhead imposed by a (caller-protected) socket. */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index a75333342c4e..b86b8e21de7f 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -36,8 +36,6 @@ void __inet_accept(struct socket *sock, struct socket *newsock, int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); void inet_splice_eof(struct socket *sock); -ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); diff --git a/include/net/sock.h b/include/net/sock.h index 62a1b99da349..121284f455a8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1277,8 +1277,6 @@ struct proto { size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); - int (*sendpage)(struct sock *sk, struct page *page, - int offset, size_t size, int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); @@ -1919,10 +1917,6 @@ int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); /* * Functions to fill in entries in struct proto_ops when a protocol diff --git a/include/net/tcp.h b/include/net/tcp.h index 31b534370787..226bce6d1e8c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -329,10 +329,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); -int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index a06f4d4a6f47..8978fb6212ff 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1929,7 +1929,6 @@ static const struct proto_ops atalk_dgram_ops = { .sendmsg = atalk_sendmsg, .recvmsg = atalk_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block ddp_notifier = { diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 53e7d3f39e26..66d9a9bd5896 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -126,7 +126,6 @@ static const struct proto_ops pvc_proto_ops = { .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; diff --git a/net/atm/svc.c b/net/atm/svc.c index d83556d8beb9..36a814f1fbd1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -654,7 +654,6 @@ static const struct proto_ops svc_proto_ops = { .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d8da400cb4de..5db805d5f74d 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -2022,7 +2022,6 @@ static const struct proto_ops ax25_proto_ops = { .sendmsg = ax25_sendmsg, .recvmsg = ax25_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4eebcc66c19a..9c82698da4f5 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -976,7 +976,6 @@ static const struct proto_ops caif_seqpacket_ops = { .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct proto_ops caif_stream_ops = { @@ -996,7 +995,6 @@ static const struct proto_ops caif_stream_ops = { .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* This function is called when a socket is finally destroyed. */ diff --git a/net/can/bcm.c b/net/can/bcm.c index a962ec2b8ba5..9ba35685b043 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1703,7 +1703,6 @@ static const struct proto_ops bcm_ops = { .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto bcm_proto __read_mostly = { diff --git a/net/can/isotp.c b/net/can/isotp.c index 84f9aba02901..1f25b45868cf 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1699,7 +1699,6 @@ static const struct proto_ops isotp_ops = { .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto isotp_proto __read_mostly = { diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 35970c25496a..feaec4ad6d16 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1306,7 +1306,6 @@ static const struct proto_ops j1939_ops = { .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto j1939_proto __read_mostly = { diff --git a/net/can/raw.c b/net/can/raw.c index f64469b98260..15c79b079184 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -962,7 +962,6 @@ static const struct proto_ops raw_ops = { .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto raw_proto __read_mostly = { diff --git a/net/core/sock.c b/net/core/sock.c index 5f1747c12004..de719094b804 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3261,36 +3261,6 @@ void __receive_sock(struct file *file) } } -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg(sock, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage); - -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage_locked); - /* * Default Socket Callbacks */ @@ -4046,7 +4016,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), @@ -4067,7 +4037,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), - proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), @@ -4088,7 +4057,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) "maxhdr", "slab", "module", - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3ab68415d121..fa8079303cb0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1010,7 +1010,6 @@ static const struct proto_ops inet_dccp_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct inet_protosw dccp_v4_protosw = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 93c98990d726..7249ef218178 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1087,7 +1087,6 @@ static const struct proto_ops inet6_dccp_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 9c124705120d..00302e8b9615 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -426,7 +426,6 @@ static const struct proto_ops ieee802154_raw_ops = { .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* DGRAM Sockets (802.15.4 dataframes) */ @@ -989,7 +988,6 @@ static const struct proto_ops ieee802154_dgram_ops = { .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static void ieee802154_sock_destruct(struct sock *sk) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 38e649fb4474..9b2ca2fcc5a1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -847,23 +847,6 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); -ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) -{ - struct sock *sk = sock->sk; - const struct proto *prot; - - if (unlikely(inet_send_prepare(sk))) - return -EAGAIN; - - /* IPV6_ADDRFORM can change sk->sk_prot under us. */ - prot = READ_ONCE(sk->sk_prot); - if (prot->sendpage) - return prot->sendpage(sk, page, offset, size, flags); - return sock_no_sendpage(sock, page, offset, size, flags); -} -EXPORT_SYMBOL(inet_sendpage); - INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, @@ -1067,12 +1050,10 @@ const struct proto_ops inet_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, - .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, @@ -1102,7 +1083,6 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, @@ -1134,7 +1114,6 @@ static const struct proto_ops inet_sockraw_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d56edc2c885f..e03e08745308 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -923,11 +923,10 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -/* In some cases, both sendpage() and sendmsg() could have added - * an skb to the write queue, but failed adding payload on it. - * We need to remove it to consume less memory, but more - * importantly be able to generate EPOLLOUT for Edge Trigger epoll() - * users. +/* In some cases, both sendmsg() could have added an skb to the write queue, + * but failed adding payload on it. We need to remove it to consume less + * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger + * epoll() users. */ void tcp_remove_empty_skb(struct sock *sk) { @@ -975,40 +974,6 @@ int tcp_wmem_schedule(struct sock *sk, int copy) return min(copy, sk->sk_forward_alloc); } -int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (!(sk->sk_route_caps & NETIF_F_SG)) - return sock_no_sendpage_locked(sk, page, offset, size, flags); - - tcp_rate_check_app_limited(sk); /* is sending application-limited? */ - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - return tcp_sendmsg_locked(sk, &msg, size); -} -EXPORT_SYMBOL_GPL(tcp_sendpage_locked); - -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - int ret; - - lock_sock(sk); - ret = tcp_sendpage_locked(sk, page, offset, size, flags); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL(tcp_sendpage); - void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 31d6005cea9b..81f0dff69e0b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -486,7 +486,7 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) long timeo; int flags; - /* Don't let internal sendpage flags through */ + /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; @@ -566,23 +566,6 @@ out_err: return copied ? copied : err; } -static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { - .msg_flags = flags | MSG_SPLICE_PAGES, - }; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - return tcp_bpf_sendmsg(sk, &msg, size); -} - enum { TCP_BPF_IPV4, TCP_BPF_IPV6, @@ -612,7 +595,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; - prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; @@ -647,8 +629,7 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && - ops->sendmsg == tcp_sendmsg && - ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; + ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9213804b034f..fd365de4d5ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3117,7 +3117,6 @@ struct proto tcp_prot = { .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, - .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 48fdcd3cad9c..42a96b3547c9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1340,20 +1340,6 @@ void udp_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(udp_splice_eof); -int udp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return udp_sendmsg(sk, &msg, size); -} - #define UDP_SKB_IS_STATELESS 0x80000000 /* all head states (dst, sk, nf conntrack) except skb extensions are @@ -2933,7 +2919,6 @@ struct proto udp_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .splice_eof = udp_splice_eof, - .sendpage = udp_sendpage, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 4ba7a88a1b1d..e1ff3a375996 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -19,8 +19,6 @@ int udp_getsockopt(struct sock *sk, int level, int optname, int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); -int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 143f93a12f25..39ecdad1b50c 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -56,7 +56,6 @@ struct proto udplite_prot = { .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, - .sendpage = udp_sendpage, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b3451cf47d29..5d593ddc0347 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -696,9 +696,7 @@ const struct proto_ops inet6_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, - .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, @@ -729,7 +727,6 @@ const struct proto_ops inet6_dgram_ops = { .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_skb = udp_read_skb, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index c9caeb5a43ed..ac1cef094c5f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1296,7 +1296,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c17c8ff94b79..40dd92a2f480 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2151,7 +2151,6 @@ struct proto tcpv6_prot = { .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, - .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d0537c1c8cd7..393f01b2a7e6 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -963,24 +963,6 @@ static void kcm_splice_eof(struct socket *sock) release_sock(sk); } -static ssize_t kcm_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) - -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - if (flags & MSG_OOB) - return -EOPNOTSUPP; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return kcm_sendmsg(sock, &msg, size); -} - static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -1769,7 +1751,6 @@ static const struct proto_ops kcm_dgram_ops = { .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, - .sendpage = kcm_sendpage, }; static const struct proto_ops kcm_seqpacket_ops = { @@ -1791,7 +1772,6 @@ static const struct proto_ops kcm_seqpacket_ops = { .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, - .sendpage = kcm_sendpage, .splice_read = kcm_splice_read, }; diff --git a/net/key/af_key.c b/net/key/af_key.c index 31ab12fd720a..ede3c6a60353 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3761,7 +3761,6 @@ static const struct proto_ops pfkey_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, /* Now the operations that really occur. */ .release = pfkey_release, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 2b795c1064f5..f9073bc7281f 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -624,7 +624,6 @@ static const struct proto_ops l2tp_ip_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct inet_protosw l2tp_ip_protosw = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 5137ea1861ce..b1623f9c4f92 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -751,7 +751,6 @@ static const struct proto_ops l2tp_ip6_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 9ffbc667be6c..57c35c960b2c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1232,7 +1232,6 @@ static const struct proto_ops llc_ui_ops = { .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const char llc_proc_err_msg[] __initconst = diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index bb4bd0b6a4f7..f6be58b68c6f 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -485,7 +485,6 @@ static const struct proto_ops mctp_dgram_ops = { .sendmsg = mctp_sendmsg, .recvmsg = mctp_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = mctp_compat_ioctl, #endif diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bd023debedc8..e892673deb73 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3866,7 +3866,6 @@ static const struct proto_ops mptcp_stream_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = inet_sendpage, }; static struct inet_protosw mptcp_protosw = { @@ -3961,7 +3960,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, - .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index cbd9aa7ee24a..39cfb778ebc5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2815,7 +2815,6 @@ static const struct proto_ops netlink_ops = { .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct net_proto_family netlink_family_ops = { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 5a4cb796150f..eb8ccbd58df7 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1364,7 +1364,6 @@ static const struct proto_ops nr_proto_ops = { .sendmsg = nr_sendmsg, .recvmsg = nr_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block nr_dev_notifier = { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a2dbeb264f26..85ff90a03b0c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4621,7 +4621,6 @@ static const struct proto_ops packet_ops_spkt = { .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct proto_ops packet_ops = { @@ -4643,7 +4642,6 @@ static const struct proto_ops packet_ops = { .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, - .sendpage = sock_no_sendpage, }; static const struct net_proto_family packet_family_ops = { diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 967f9b4dc026..1018340d89a7 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -441,7 +441,6 @@ const struct proto_ops phonet_dgram_ops = { .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; const struct proto_ops phonet_stream_ops = { @@ -462,7 +461,6 @@ const struct proto_ops phonet_stream_ops = { .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; EXPORT_SYMBOL(phonet_stream_ops); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 76f0434d3d06..78beb74146e7 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -1244,7 +1244,6 @@ static const struct proto_ops qrtr_proto_ops = { .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto qrtr_proto = { diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 3ff6995244e5..01c4cdfef45d 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -653,7 +653,6 @@ static const struct proto_ops rds_proto_ops = { .sendmsg = rds_sendmsg, .recvmsg = rds_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static void rds_sock_destruct(struct sock *sk) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ca2b17f32670..49dafe9ac72f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1496,7 +1496,6 @@ static const struct proto_ops rose_proto_ops = { .sendmsg = rose_sendmsg, .recvmsg = rose_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block rose_dev_notifier = { diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index da0b3b5157d5..f2cf4aa99db2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -954,7 +954,6 @@ static const struct proto_ops rxrpc_rpc_ops = { .sendmsg = rxrpc_sendmsg, .recvmsg = rxrpc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto rxrpc_proto = { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 664d1f2e9121..274d07bd774f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1133,7 +1133,6 @@ static const struct proto_ops inet_seqpacket_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* Registration with AF_INET family. */ diff --git a/net/socket.c b/net/socket.c index b778fc03c6e0..8c3c8b29995a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3552,54 +3552,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_sendpage - send a &page through a socket (kernel space) - * @sock: socket - * @page: page - * @offset: page offset - * @size: total size in bytes - * @flags: flags (MSG_DONTWAIT, ...) - * - * Returns the total amount sent in bytes or an error. - */ - -int kernel_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) -{ - if (sock->ops->sendpage) { - /* Warn in case the improper page to zero-copy send */ - WARN_ONCE(!sendpage_ok(page), "improper page for zero-copy send"); - return sock->ops->sendpage(sock, page, offset, size, flags); - } - return sock_no_sendpage(sock, page, offset, size, flags); -} -EXPORT_SYMBOL(kernel_sendpage); - -/** - * kernel_sendpage_locked - send a &page through the locked sock (kernel space) - * @sk: sock - * @page: page - * @offset: page offset - * @size: total size in bytes - * @flags: flags (MSG_DONTWAIT, ...) - * - * Returns the total amount sent in bytes or an error. - * Caller must hold @sk. - */ - -int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct socket *sock = sk->sk_socket; - - if (sock->ops->sendpage_locked) - return sock->ops->sendpage_locked(sk, page, offset, size, - flags); - - return sock_no_sendpage_locked(sk, page, offset, size, flags); -} -EXPORT_SYMBOL(kernel_sendpage_locked); - /** * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket diff --git a/net/tipc/socket.c b/net/tipc/socket.c index dd73d71c02a9..ef8e5139a873 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3375,7 +3375,6 @@ static const struct proto_ops msg_ops = { .sendmsg = tipc_sendmsg, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct proto_ops packet_ops = { @@ -3396,7 +3395,6 @@ static const struct proto_ops packet_ops = { .sendmsg = tipc_send_packet, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct proto_ops stream_ops = { @@ -3417,7 +3415,6 @@ static const struct proto_ops stream_ops = { .sendmsg = tipc_sendstream, .recvmsg = tipc_recvstream, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct net_proto_family tipc_family_ops = { diff --git a/net/tls/tls.h b/net/tls/tls.h index d002c3af1966..86cef1c68e03 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -98,10 +98,6 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_sw_splice_eof(struct socket *sock); -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); @@ -117,8 +113,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int tls_tx_records(struct sock *sk, int flags); void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 975299d7213b..840ee06f1708 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -621,23 +621,6 @@ void tls_device_splice_eof(struct socket *sock) mutex_unlock(&tls_ctx->tx_lock); } -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - if (flags & MSG_OOB) - return -EOPNOTSUPP; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_device_sendmsg(sk, &msg, size); -} - struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b9c83dd7de2..d5ed4d47b16e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -958,7 +958,6 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_SW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; ops[TLS_SW ][TLS_BASE].splice_eof = tls_sw_splice_eof; - ops[TLS_SW ][TLS_BASE].sendpage_locked = tls_sw_sendpage_locked; ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; @@ -970,17 +969,14 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; - ops[TLS_HW ][TLS_BASE].sendpage_locked = NULL; ops[TLS_HW ][TLS_SW ] = ops[TLS_BASE][TLS_SW ]; - ops[TLS_HW ][TLS_SW ].sendpage_locked = NULL; ops[TLS_BASE][TLS_HW ] = ops[TLS_BASE][TLS_SW ]; ops[TLS_SW ][TLS_HW ] = ops[TLS_SW ][TLS_SW ]; ops[TLS_HW ][TLS_HW ] = ops[TLS_HW ][TLS_SW ]; - ops[TLS_HW ][TLS_HW ].sendpage_locked = NULL; #endif #ifdef CONFIG_TLS_TOE ops[TLS_HW_RECORD][TLS_HW_RECORD] = *base; @@ -1029,7 +1025,6 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; prot[TLS_SW][TLS_BASE].splice_eof = tls_sw_splice_eof; - prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; @@ -1045,12 +1040,10 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_BASE].splice_eof = tls_device_splice_eof; - prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].splice_eof = tls_device_splice_eof; - prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 319f61590d2c..9b3aa89a4292 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1281,41 +1281,6 @@ unlock: mutex_unlock(&tls_ctx->tx_lock); } -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | - MSG_NO_SHARED_FRAGS)) - return -EOPNOTSUPP; - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_sw_sendmsg_locked(sk, &msg, size); -} - -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) - return -EOPNOTSUPP; - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_sw_sendmsg(sk, &msg, size); -} - static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, bool released) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f9d196439b49..f2f234f0b92c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -758,8 +758,6 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); -static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, - size_t size, int flags); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); @@ -852,7 +850,6 @@ static const struct proto_ops unix_stream_ops = { .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, - .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, @@ -878,7 +875,6 @@ static const struct proto_ops unix_dgram_ops = { .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -902,7 +898,6 @@ static const struct proto_ops unix_seqpacket_ops = { .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -2294,20 +2289,6 @@ out_err: return sent ? : err; } -static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return unix_stream_sendmsg(socket, &msg, size); -} - static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index efb8a0937a13..020cf17ab7e4 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1306,7 +1306,6 @@ static const struct proto_ops vsock_dgram_ops = { .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .read_skb = vsock_read_skb, }; @@ -2234,7 +2233,6 @@ static const struct proto_ops vsock_stream_ops = { .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; @@ -2257,7 +2255,6 @@ static const struct proto_ops vsock_seqpacket_ops = { .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .read_skb = vsock_read_skb, }; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5c7ad301d742..0fb5143bec7a 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1757,7 +1757,6 @@ static const struct proto_ops x25_proto_ops = { .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct packet_type x25_packet_type __read_mostly = { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cc1e7f15fa73..5a8c0dd250af 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1389,7 +1389,6 @@ static const struct proto_ops xsk_proto_ops = { .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) -- cgit v1.2.3 From 9d797ee2dce1e3e243bcc18dad7728df72fd11a4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 26 Jun 2023 13:58:37 -0700 Subject: Revert "af_unix: Call scm_recv() only after scm_set_cred()." This reverts commit 3f5f118bb657f94641ea383c7c1b8c09a5d46ea2. Konrad reported that desktop environment below cannot be reached after commit 3f5f118bb657 ("af_unix: Call scm_recv() only after scm_set_cred().") - postmarketOS (Alpine Linux w/ musl 1.2.4) - busybox 1.36.1 - GNOME 44.1 - networkmanager 1.42.6 - openrc 0.47 Regarding to the warning of SO_PASSPIDFD, I'll post another patch to suppress it by skipping SCM_PIDFD if scm->pid == NULL in scm_pidfd_recv(). Reported-by: Konrad Dybcio Link: https://lore.kernel.org/netdev/8c7f9abd-4f84-7296-2788-1e130d6304a0@kernel.org/ Signed-off-by: Kuniyuki Iwashima Tested-by: Ido Schimmel Tested-by: Gal Pressman Link: https://lore.kernel.org/r/20230626205837.82086-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f2f234f0b92c..3953daa2e1d0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2807,7 +2807,7 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (state->msg && check_creds) + if (state->msg) scm_recv(sock, state->msg, &scm, flags); else scm_destroy(&scm); -- cgit v1.2.3 From a9c49cc2f5b578c4ffa0ee135aa552d06dec0e82 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 27 Jun 2023 10:43:14 -0700 Subject: net: scm: introduce and use scm_recv_unix helper Recently, our friends from bluetooth subsystem reported [1] that after commit 5e2ff6704a27 ("scm: add SO_PASSPIDFD and SCM_PIDFD") scm_recv() helper become unusable in kernel modules (because it uses unexported pidfd_prepare() API). We were aware of this issue and workarounded it in a hard way by commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX bool"). But recently a new functionality was added in the scope of commit 817efd3cad74 ("Bluetooth: hci_sock: Forward credentials to monitor") and after that bluetooth can't be compiled as a kernel module. After some discussion in [1] we decided to split scm_recv() into two helpers, one won't support SCM_PIDFD (used for unix sockets), and another one will be completely the same as it was before commit 5e2ff6704a27 ("scm: add SO_PASSPIDFD and SCM_PIDFD"). Link: https://lore.kernel.org/lkml/CAJqdLrpFcga4n7wxBhsFqPQiN8PKFVr6U10fKcJ9W7AcZn+o6Q@mail.gmail.com/ [1] Fixes: 5e2ff6704a27 ("scm: add SO_PASSPIDFD and SCM_PIDFD") Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230627174314.67688-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/scm.h | 35 +++++++++++++++++++++++++---------- net/unix/af_unix.c | 4 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/scm.h b/include/net/scm.h index d456fc41b8bf..c5bcdf65f55c 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -153,8 +153,8 @@ static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm fd_install(pidfd, pidfd_file); } -static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) +static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) { if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || @@ -162,7 +162,7 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, scm->fp || scm_has_secdata(sock)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); - return; + return false; } if (test_bit(SOCK_PASSCRED, &sock->flags)) { @@ -175,19 +175,34 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); + scm_passec(sock, msg, scm); - scm_destroy_cred(scm); + if (scm->fp) + scm_detach_fds(msg, scm); - scm_passec(sock, msg, scm); + return true; +} - if (!scm->fp) +static inline void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock, msg, scm, flags)) return; - - scm_detach_fds(msg, scm); + + scm_destroy_cred(scm); } +static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock, msg, scm, flags)) + return; + + if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} #endif /* __LINUX_NET_SCM_H */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3953daa2e1d0..123b35ddfd71 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2427,7 +2427,7 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, } err = (flags & MSG_TRUNC) ? skb->len - skip : size; - scm_recv(sock, msg, &scm, flags); + scm_recv_unix(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); @@ -2808,7 +2808,7 @@ unlock: mutex_unlock(&u->iolock); if (state->msg) - scm_recv(sock, state->msg, &scm, flags); + scm_recv_unix(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: -- cgit v1.2.3