From ccce324dabfe2143519daf50ed8b1ef1d0c542f7 Mon Sep 17 00:00:00 2001 From: David Morley Date: Tue, 9 May 2023 18:05:58 +0000 Subject: tcp: make the first N SYN RTO backoffs linear Currently the SYN RTO schedule follows an exponential backoff scheme, which can be unnecessarily conservative in cases where there are link failures. In such cases, it's better to aggressively try to retransmit packets, so it takes routers less time to find a repath with a working link. We chose a default value for this sysctl of 4, to follow the macOS and IOS backoff scheme of 1,1,1,1,1,2,4,8, ... MacOS and IOS have used this backoff schedule for over a decade, since before this 2009 IETF presentation discussed the behavior: https://www.ietf.org/proceedings/75/slides/tcpm-1.pdf This commit makes the SYN RTO schedule start with a number of linear backoffs given by the following sysctl: * tcp_syn_linear_timeouts This changes the SYN RTO scheme to be: init_rto_val for tcp_syn_linear_timeouts, exp backoff starting at init_rto_val For example if init_rto_val = 1 and tcp_syn_linear_timeouts = 2, our backoff scheme would be: 1, 1, 1, 2, 4, 8, 16, ... Signed-off-by: David Morley Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Tested-by: David Morley Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230509180558.2541885-1-morleyd.kernel@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 39bda2b1066e..db24ed8f8509 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3275,6 +3275,7 @@ static int __net_init tcp_sk_init(struct net *net) else net->ipv4.tcp_congestion_control = &tcp_reno; + net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; return 0; } -- cgit v1.2.3 From 9378096e8a656fb5c4099b26b1370c56f056eab9 Mon Sep 17 00:00:00 2001 From: Aditi Ghag Date: Fri, 19 May 2023 22:51:49 +0000 Subject: bpf: tcp: Avoid taking fast sock lock in iterator This is a preparatory commit to replace `lock_sock_fast` with `lock_sock`,and facilitate BPF programs executed from the TCP sockets iterator to be able to destroy TCP sockets using the bpf_sock_destroy kfunc (implemented in follow-up commits). Previously, BPF TCP iterator was acquiring the sock lock with BH disabled. This led to scenarios where the sockets hash table bucket lock can be acquired with BH enabled in some path versus disabled in other. In such situation, kernel issued a warning since it thinks that in the BH enabled path the same bucket lock *might* be acquired again in the softirq context (BH disabled), which will lead to a potential dead lock. Since bpf_sock_destroy also happens in a process context, the potential deadlock warning is likely a false alarm. Here is a snippet of annotated stack trace that motivated this change: ``` Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&h->lhash2[i].lock); local_bh_disable(); lock(&h->lhash2[i].lock); kernel imagined possible scenario: local_bh_disable(); /* Possible softirq */ lock(&h->lhash2[i].lock); *** Potential Deadlock *** process context: lock_acquire+0xcd/0x330 _raw_spin_lock+0x33/0x40 ------> Acquire (bucket) lhash2.lock with BH enabled __inet_hash+0x4b/0x210 inet_csk_listen_start+0xe6/0x100 inet_listen+0x95/0x1d0 __sys_listen+0x69/0xb0 __x64_sys_listen+0x14/0x20 do_syscall_64+0x3c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc bpf_sock_destroy run from iterator: lock_acquire+0xcd/0x330 _raw_spin_lock+0x33/0x40 ------> Acquire (bucket) lhash2.lock with BH disabled inet_unhash+0x9a/0x110 tcp_set_state+0x6a/0x210 tcp_abort+0x10d/0x200 bpf_prog_6793c5ca50c43c0d_iter_tcp6_server+0xa4/0xa9 bpf_iter_run_prog+0x1ff/0x340 ------> lock_sock_fast that acquires sock lock with BH disabled bpf_iter_tcp_seq_show+0xca/0x190 bpf_seq_read+0x177/0x450 ``` Also, Yonghong reported a deadlock for non-listening TCP sockets that this change resolves. Previously, `lock_sock_fast` held the sock spin lock with BH which was again being acquired in `tcp_abort`: ``` watchdog: BUG: soft lockup - CPU#0 stuck for 86s! [test_progs:2331] RIP: 0010:queued_spin_lock_slowpath+0xd8/0x500 Call Trace: _raw_spin_lock+0x84/0x90 tcp_abort+0x13c/0x1f0 bpf_prog_88539c5453a9dd47_iter_tcp6_client+0x82/0x89 bpf_iter_run_prog+0x1aa/0x2c0 ? preempt_count_sub+0x1c/0xd0 ? from_kuid_munged+0x1c8/0x210 bpf_iter_tcp_seq_show+0x14e/0x1b0 bpf_seq_read+0x36c/0x6a0 bpf_iter_tcp_seq_show lock_sock_fast __lock_sock_fast spin_lock_bh(&sk->sk_lock.slock); /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held.* */ ... tcp_abort local_bh_disable(); spin_lock(&((sk)->sk_lock.slock)); // from bh_lock_sock(sk) ``` With the switch to `lock_sock`, it calls `spin_unlock_bh` before returning: ``` lock_sock lock_sock_nested spin_lock_bh(&sk->sk_lock.slock); : spin_unlock_bh(&sk->sk_lock.slock); ``` Acked-by: Yonghong Song Acked-by: Stanislav Fomichev Signed-off-by: Aditi Ghag Link: https://lore.kernel.org/r/20230519225157.760788-2-aditi.ghag@isovalent.com Signed-off-by: Martin KaFai Lau --- net/ipv4/tcp_ipv4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index db24ed8f8509..6e945199709b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2962,7 +2962,6 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; - bool slow; uid_t uid; int ret; @@ -2970,7 +2969,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) return 0; if (sk_fullsock(sk)) - slow = lock_sock_fast(sk); + lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; @@ -2994,7 +2993,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) unlock: if (sk_fullsock(sk)) - unlock_sock_fast(sk, slow); + release_sock(sk); return ret; } -- cgit v1.2.3 From 4ddbcb886268af8d12a23e6640b39d1d9c652b1b Mon Sep 17 00:00:00 2001 From: Aditi Ghag Date: Fri, 19 May 2023 22:51:55 +0000 Subject: bpf: Add bpf_sock_destroy kfunc The socket destroy kfunc is used to forcefully terminate sockets from certain BPF contexts. We plan to use the capability in Cilium load-balancing to terminate client sockets that continue to connect to deleted backends. The other use case is on-the-fly policy enforcement where existing socket connections prevented by policies need to be forcefully terminated. The kfunc also allows terminating sockets that may or may not be actively sending traffic. The kfunc can currently be called only from BPF TCP and UDP iterators where users can filter, and terminate selected sockets. More specifically, it can only be called from BPF contexts that ensure socket locking in order to allow synchronous execution of protocol specific `diag_destroy` handlers. The previous commit that batches UDP sockets during iteration facilitated a synchronous invocation of the UDP destroy callback from BPF context by skipping socket locks in `udp_abort`. TCP iterator already supported batching of sockets being iterated. To that end, `tracing_iter_filter` callback filter is added so that verifier can restrict the kfunc to programs with `BPF_TRACE_ITER` attach type, and reject other programs. The kfunc takes `sock_common` type argument, even though it expects, and casts them to a `sock` pointer. This enables the verifier to allow the sock_destroy kfunc to be called for TCP with `sock_common` and UDP with `sock` structs. Furthermore, as `sock_common` only has a subset of certain fields of `sock`, casting pointer to the latter type might not always be safe for certain sockets like request sockets, but these have a special handling in the diag_destroy handlers. Additionally, the kfunc is defined with `KF_TRUSTED_ARGS` flag to avoid the cases where a `PTR_TO_BTF_ID` sk is obtained by following another pointer. eg. getting a sk pointer (may be even NULL) by following another sk pointer. The pointer socket argument passed in TCP and UDP iterators is tagged as `PTR_TRUSTED` in {tcp,udp}_reg_info. The TRUSTED arg changes are contributed by Martin KaFai Lau . Signed-off-by: Aditi Ghag Link: https://lore.kernel.org/r/20230519225157.760788-8-aditi.ghag@isovalent.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 9 +++++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 8 ++++--- 4 files changed, 75 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/core/filter.c b/net/core/filter.c index 451b0ec7f242..968139f4a1ac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11723,3 +11723,66 @@ static int __init bpf_kfunc_init(void) return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); } late_initcall(bpf_kfunc_init); + +/* Disables missing prototype warnings */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. + * + * The function expects a non-NULL pointer to a socket, and invokes the + * protocol specific socket destroy handlers. + * + * The helper can only be called from BPF contexts that have acquired the socket + * locks. + * + * Parameters: + * @sock: Pointer to socket to be destroyed + * + * Return: + * On error, may return EPROTONOSUPPORT, EINVAL. + * EPROTONOSUPPORT if protocol specific destroy handler is not supported. + * 0 otherwise + */ +__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) +{ + struct sock *sk = (struct sock *)sock; + + /* The locking semantics that allow for synchronous execution of the + * destroy handlers are only supported for TCP and UDP. + * Supporting protocols will need to acquire sock lock in the BPF context + * prior to invoking this kfunc. + */ + if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && + sk->sk_protocol != IPPROTO_UDP)) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, ECONNABORTED); +} + +__diag_pop() + +BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_sk_iter_kfunc_ids) + +static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && + prog->expected_attach_type != BPF_TRACE_ITER) + return -EACCES; + return 0; +} + +static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_sk_iter_kfunc_ids, + .filter = tracing_iter_filter, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d6392c16b7a..87a04c3d9425 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4682,8 +4682,10 @@ int tcp_abort(struct sock *sk, int err) return 0; } - /* Don't race with userspace socket closes such as tcp_close. */ - lock_sock(sk); + /* BPF context ensures sock locking. */ + if (!has_current_bpf_ctx()) + /* Don't race with userspace socket closes such as tcp_close. */ + lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -4707,7 +4709,8 @@ int tcp_abort(struct sock *sk, int err) bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); - release_sock(sk); + if (!has_current_bpf_ctx()) + release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6e945199709b..6c2a5f1c63c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3355,7 +3355,7 @@ static struct bpf_iter_reg tcp_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0c999aa5ab30..6893fb867529 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2930,7 +2930,8 @@ EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { - lock_sock(sk); + if (!has_current_bpf_ctx()) + lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() @@ -2943,7 +2944,8 @@ int udp_abort(struct sock *sk, int err) __udp_disconnect(sk, 0); out: - release_sock(sk); + if (!has_current_bpf_ctx()) + release_sock(sk); return 0; } @@ -3646,7 +3648,7 @@ static struct bpf_iter_reg udp_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; -- cgit v1.2.3 From c0a8966e2bc7d31f77a7246947ebc09c1ff06066 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 23 May 2023 18:14:52 +0200 Subject: net: ipv4: use consistent txhash in TIME_WAIT and SYN_RECV When using IPv4/TCP, skb->hash comes from sk->sk_txhash except in TIME_WAIT and SYN_RECV where it's not set in the reply skb from ip_send_unicast_reply. Those packets will have a mismatched hash with others from the same flow as their hashes will be 0. IPv6 does not have the same issue as the hash is set from the socket txhash in those cases. This commits sets the hash in the reply skb from ip_send_unicast_reply, which makes the IPv4 code behaving like IPv6. Signed-off-by: Antoine Tenart Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 +- net/ipv4/ip_output.c | 4 +++- net/ipv4/tcp_ipv4.c | 14 +++++++++----- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/ip.h b/include/net/ip.h index 8a3860a916dc..2982dd13cf16 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -286,7 +286,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time); + unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 553c740a6bfb..244fb9365d87 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1570,7 +1570,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, - unsigned int len, u64 transmit_time) + unsigned int len, u64 transmit_time, u32 txhash) { struct ip_options_data replyopts; struct ipcm_cookie ipc; @@ -1633,6 +1633,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; nskb->mono_delivery_time = !!transmit_time; + if (txhash) + skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index af043f063a73..a50bd782f91f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -692,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) u64 transmit_time = 0; struct sock *ctl_sk; struct net *net; + u32 txhash = 0; /* Never send a reset in response to a reset. */ if (th->rst) @@ -829,6 +830,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); + txhash = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_txhash : sk->sk_txhash; } else { ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; @@ -837,7 +840,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); @@ -859,7 +862,7 @@ static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, - int reply_flags, u8 tos) + int reply_flags, u8 tos, u32 txhash) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -935,7 +938,7 @@ static void tcp_v4_send_ack(const struct sock *sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, - transmit_time); + transmit_time, txhash); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -955,7 +958,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos + tw->tw_tos, + tw->tw_txhash ); inet_twsk_put(tw); @@ -988,7 +992,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos); + ip_hdr(skb)->tos, tcp_rsk(req)->txhash); } /* -- cgit v1.2.3 From 1d7e4538a5463faa0b0e26a7a7b6bd68c7dfdd78 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:13 +0100 Subject: ipv4, ipv6: Use splice_eof() to flush Allow splice to undo the effects of MSG_MORE after prematurely ending a splice/sendfile due to getting an EOF condition (->splice_read() returned 0) after splice had called sendmsg() with MSG_MORE set when the user didn't set MSG_MORE. For UDP, a pending packet will not be emitted if the socket is closed before it is flushed; with this change, it be flushed by ->splice_eof(). For TCP, it's not clear that MSG_MORE is actually effective. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ Signed-off-by: David Howells cc: Kuniyuki Iwashima cc: Willem de Bruijn cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/net/inet_common.h | 1 + include/net/tcp.h | 1 + include/net/udp.h | 1 + net/ipv4/af_inet.c | 18 ++++++++++++++++++ net/ipv4/tcp.c | 16 ++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 16 ++++++++++++++++ net/ipv6/af_inet6.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 15 +++++++++++++++ 10 files changed, 71 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 77f4b0ef5b92..a75333342c4e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +void inet_splice_eof(struct socket *sock); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/include/net/tcp.h b/include/net/tcp.h index 68990a8f556a..49611af31bb7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); +void tcp_splice_eof(struct socket *sock); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, diff --git a/include/net/udp.h b/include/net/udp.h index 5cad44318d71..4ed0b47c5582 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b5735b3551cf..fd233c4195ac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(inet_sendmsg); +void inet_splice_eof(struct socket *sock) +{ + const struct proto *prot; + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (prot->splice_eof) + prot->splice_eof(sock); +} +EXPORT_SYMBOL_GPL(inet_splice_eof); + ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, @@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = { .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT @@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53b7751b68e1..09f03221a6f1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(tcp_sendmsg); +void tcp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; + + if (!tcp_write_queue_tail(sk)) + return; + + lock_sock(sk); + mss_now = tcp_send_mss(sk, &size_goal, 0); + tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_splice_eof); + /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 53e9ce2f05bb..84a5d557dc1a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3116,6 +3116,7 @@ struct proto tcp_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fd3dae081f3a..df5e407286d7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1324,6 +1324,21 @@ do_confirm: } EXPORT_SYMBOL(udp_sendmsg); +void udp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!up->pending || READ_ONCE(up->corkflag)) + return; + + lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_push_pending_frames(sk); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(udp_splice_eof); + int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -2918,6 +2933,7 @@ struct proto udp_prot = { .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, + .splice_eof = udp_splice_eof, .sendpage = udp_sendpage, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2bbf13216a3d..564942bee067 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d657713d1c71..c17c8ff94b79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e5a337e6b970..317b01c9bc39 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1653,6 +1653,20 @@ do_confirm: } EXPORT_SYMBOL(udpv6_sendmsg); +static void udpv6_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!up->pending || READ_ONCE(up->corkflag)) + return; + + lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_v6_push_pending_frames(sk); + release_sock(sk); +} + void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -1764,6 +1778,7 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, + .splice_eof = udpv6_splice_eof, .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, -- cgit v1.2.3 From b650d953cd391595e536153ce30b4aab385643ac Mon Sep 17 00:00:00 2001 From: "mfreemon@cloudflare.com" Date: Sun, 11 Jun 2023 22:05:24 -0500 Subject: tcp: enforce receive buffer memory limits by allowing the tcp window to shrink Under certain circumstances, the tcp receive buffer memory limit set by autotuning (sk_rcvbuf) is increased due to incoming data packets as a result of the window not closing when it should be. This can result in the receive buffer growing all the way up to tcp_rmem[2], even for tcp sessions with a low BDP. To reproduce: Connect a TCP session with the receiver doing nothing and the sender sending small packets (an infinite loop of socket send() with 4 bytes of payload with a sleep of 1 ms in between each send()). This will cause the tcp receive buffer to grow all the way up to tcp_rmem[2]. As a result, a host can have individual tcp sessions with receive buffers of size tcp_rmem[2], and the host itself can reach tcp_mem limits, causing the host to go into tcp memory pressure mode. The fundamental issue is the relationship between the granularity of the window scaling factor and the number of byte ACKed back to the sender. This problem has previously been identified in RFC 7323, appendix F [1]. The Linux kernel currently adheres to never shrinking the window. In addition to the overallocation of memory mentioned above, the current behavior is functionally incorrect, because once tcp_rmem[2] is reached when no remediations remain (i.e. tcp collapse fails to free up any more memory and there are no packets to prune from the out-of-order queue), the receiver will drop in-window packets resulting in retransmissions and an eventual timeout of the tcp session. A receive buffer full condition should instead result in a zero window and an indefinite wait. In practice, this problem is largely hidden for most flows. It is not applicable to mice flows. Elephant flows can send data fast enough to "overrun" the sk_rcvbuf limit (in a single ACK), triggering a zero window. But this problem does show up for other types of flows. Examples are websockets and other type of flows that send small amounts of data spaced apart slightly in time. In these cases, we directly encounter the problem described in [1]. RFC 7323, section 2.4 [2], says there are instances when a retracted window can be offered, and that TCP implementations MUST ensure that they handle a shrinking window, as specified in RFC 1122, section 4.2.2.16 [3]. All prior RFCs on the topic of tcp window management have made clear that sender must accept a shrunk window from the receiver, including RFC 793 [4] and RFC 1323 [5]. This patch implements the functionality to shrink the tcp window when necessary to keep the right edge within the memory limit by autotuning (sk_rcvbuf). This new functionality is enabled with the new sysctl: net.ipv4.tcp_shrink_window Additional information can be found at: https://blog.cloudflare.com/unbounded-memory-usage-by-tcp-for-receive-buffers-and-how-we-fixed-it/ [1] https://www.rfc-editor.org/rfc/rfc7323#appendix-F [2] https://www.rfc-editor.org/rfc/rfc7323#section-2.4 [3] https://www.rfc-editor.org/rfc/rfc1122#page-91 [4] https://www.rfc-editor.org/rfc/rfc793 [5] https://www.rfc-editor.org/rfc/rfc1323 Signed-off-by: Mike Freemon Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 15 +++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_output.c | 60 +++++++++++++++++++++++++++++----- 5 files changed, 78 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 366e2a5097d9..4a010a7cde7f 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -981,6 +981,21 @@ tcp_tw_reuse - INTEGER tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. +tcp_shrink_window - BOOLEAN + This changes how the TCP receive window is calculated. + + RFC 7323, section 2.4, says there are instances when a retracted + window can be offered, and that TCP implementations MUST ensure + that they handle a shrinking window, as specified in RFC 1122. + + - 0 - Disabled. The window is never shrunk. + - 1 - Enabled. The window is shrunk when necessary to remain within + the memory limit set by autotuning (sk_rcvbuf). + This only occurs if a non-zero receive window + scaling factor is also in effect. + + Default: 0 + tcp_wmem - vector of 3 INTEGERs: min, default, max min: Amount of memory reserved for send buffers for TCP sockets. Each TCP socket has rights to use it due to fact of its birth. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a4efb7a2796c..f00374718159 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,6 +65,7 @@ struct netns_ipv4 { #endif bool fib_has_custom_local_routes; bool fib_offload_disabled; + u8 sysctl_tcp_shrink_window; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_t fib_num_tclassid_users; #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 356afe54951c..2afb0870648b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1480,6 +1480,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_syn_linear_timeouts_max, }, + { + .procname = "tcp_shrink_window", + .data = &init_net.ipv4.sysctl_tcp_shrink_window, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84a5d557dc1a..9213804b034f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3281,6 +3281,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_congestion_control = &tcp_reno; net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; + net->ipv4.sysctl_tcp_shrink_window = 0; + return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 660eac4bf2a7..2cb39b6dad02 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk) u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); + struct net *net = sock_net(sk); - /* Never shrink the offered window */ if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else @@ -270,11 +270,14 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - if (new_win == 0) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPWANTZEROWINDOWADV); - new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { + /* Never shrink the offered window */ + if (new_win == 0) + NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } } + tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -282,7 +285,7 @@ static u16 tcp_select_window(struct sock *sk) * scaled window. */ if (!tp->rx_opt.rcv_wscale && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) + READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -294,10 +297,9 @@ static u16 tcp_select_window(struct sock *sk) if (new_win == 0) { tp->pred_flags = 0; if (old_win) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTOZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; @@ -2987,6 +2989,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct @@ -3008,6 +3011,15 @@ u32 __tcp_select_window(struct sock *sk) if (mss <= 0) return 0; } + + /* Only allow window shrink if the sysctl is enabled and we have + * a non-zero scaling factor in effect. + */ + if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) + goto shrink_window_allowed; + + /* do not allow window to shrink */ + if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; @@ -3062,6 +3074,36 @@ u32 __tcp_select_window(struct sock *sk) } return window; + +shrink_window_allowed: + /* new window should always be an exact multiple of scaling factor */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + if (tcp_under_memory_pressure(sk)) + tcp_adjust_rcv_ssthresh(sk); + + /* if free space is too low, return a zero window */ + if (free_space < (allowed_space >> 4) || free_space < mss || + free_space < (1 << tp->rx_opt.rcv_wscale)) + return 0; + } + + if (free_space > tp->rcv_ssthresh) { + free_space = tp->rcv_ssthresh; + /* new window should always be an exact multiple of scaling factor + * + * For this case, we ALIGN "up" (increase free_space) because + * we know free_space is not zero here, it has been reduced from + * the memory-based limit, and rcv_ssthresh is not a hard limit + * (unlike sk_rcvbuf). + */ + free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); + } + + return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, -- cgit v1.2.3 From dc97391e661009eab46783030d2404c9b6e6f2e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2023 23:55:12 +0100 Subject: sock: Remove ->sendpage*() in favour of sendmsg(MSG_SPLICE_PAGES) Remove ->sendpage() and ->sendpage_locked(). sendmsg() with MSG_SPLICE_PAGES should be used instead. This allows multiple pages and multipage folios to be passed through. Signed-off-by: David Howells Acked-by: Marc Kleine-Budde # for net/can cc: Jens Axboe cc: Matthew Wilcox cc: linux-afs@lists.infradead.org cc: mptcp@lists.linux.dev cc: rds-devel@oss.oracle.com cc: tipc-discussion@lists.sourceforge.net cc: virtualization@lists.linux-foundation.org Link: https://lore.kernel.org/r/20230623225513.2732256-16-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/bpf/map_sockmap.rst | 10 ++--- Documentation/filesystems/locking.rst | 2 - Documentation/filesystems/vfs.rst | 1 - Documentation/networking/scaling.rst | 4 +- crypto/af_alg.c | 28 ------------- crypto/algif_aead.c | 22 ++-------- crypto/algif_rng.c | 2 - crypto/algif_skcipher.c | 14 ------- .../ethernet/chelsio/inline_crypto/chtls/chtls.h | 2 - .../chelsio/inline_crypto/chtls/chtls_io.c | 14 ------- .../chelsio/inline_crypto/chtls/chtls_main.c | 1 - fs/nfsd/vfs.c | 2 +- include/crypto/if_alg.h | 2 - include/linux/net.h | 8 ---- include/net/inet_common.h | 2 - include/net/sock.h | 6 --- include/net/tcp.h | 4 -- net/appletalk/ddp.c | 1 - net/atm/pvc.c | 1 - net/atm/svc.c | 1 - net/ax25/af_ax25.c | 1 - net/caif/caif_socket.c | 2 - net/can/bcm.c | 1 - net/can/isotp.c | 1 - net/can/j1939/socket.c | 1 - net/can/raw.c | 1 - net/core/sock.c | 35 +--------------- net/dccp/ipv4.c | 1 - net/dccp/ipv6.c | 1 - net/ieee802154/socket.c | 2 - net/ipv4/af_inet.c | 21 ---------- net/ipv4/tcp.c | 43 ++----------------- net/ipv4/tcp_bpf.c | 23 +---------- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/udp.c | 15 ------- net/ipv4/udp_impl.h | 2 - net/ipv4/udplite.c | 1 - net/ipv6/af_inet6.c | 3 -- net/ipv6/raw.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/kcm/kcmsock.c | 20 --------- net/key/af_key.c | 1 - net/l2tp/l2tp_ip.c | 1 - net/l2tp/l2tp_ip6.c | 1 - net/llc/af_llc.c | 1 - net/mctp/af_mctp.c | 1 - net/mptcp/protocol.c | 2 - net/netlink/af_netlink.c | 1 - net/netrom/af_netrom.c | 1 - net/packet/af_packet.c | 2 - net/phonet/socket.c | 2 - net/qrtr/af_qrtr.c | 1 - net/rds/af_rds.c | 1 - net/rose/af_rose.c | 1 - net/rxrpc/af_rxrpc.c | 1 - net/sctp/protocol.c | 1 - net/socket.c | 48 ---------------------- net/tipc/socket.c | 3 -- net/tls/tls.h | 6 --- net/tls/tls_device.c | 17 -------- net/tls/tls_main.c | 7 ---- net/tls/tls_sw.c | 35 ---------------- net/unix/af_unix.c | 19 --------- net/vmw_vsock/af_vsock.c | 3 -- net/x25/af_x25.c | 1 - net/xdp/xsk.c | 1 - 66 files changed, 20 insertions(+), 442 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/bpf/map_sockmap.rst b/Documentation/bpf/map_sockmap.rst index cc92047c6630..2d630686a00b 100644 --- a/Documentation/bpf/map_sockmap.rst +++ b/Documentation/bpf/map_sockmap.rst @@ -240,11 +240,11 @@ offsets into ``msg``, respectively. If a program of type ``BPF_PROG_TYPE_SK_MSG`` is run on a ``msg`` it can only parse data that the (``data``, ``data_end``) pointers have already consumed. For ``sendmsg()`` hooks this is likely the first scatterlist element. But for -calls relying on the ``sendpage`` handler (e.g., ``sendfile()``) this will be -the range (**0**, **0**) because the data is shared with user space and by -default the objective is to avoid allowing user space to modify data while (or -after) BPF verdict is being decided. This helper can be used to pull in data -and to set the start and end pointers to given values. Data will be copied if +calls relying on MSG_SPLICE_PAGES (e.g., ``sendfile()``) this will be the +range (**0**, **0**) because the data is shared with user space and by default +the objective is to avoid allowing user space to modify data while (or after) +BPF verdict is being decided. This helper can be used to pull in data and to +set the start and end pointers to given values. Data will be copied if necessary (i.e., if data was not linear and if start and end pointers do not point to the same chunk). diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa1a233b0fa8..ed148919e11a 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -521,8 +521,6 @@ prototypes:: int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, - loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 769be5230210..cb2a97e49872 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1086,7 +1086,6 @@ This describes how the VFS can manipulate an open file. As of kernel int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 3d435caa3ef2..92c9fb46d6a2 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -269,8 +269,8 @@ a single application thread handles flows with many different flow hashes. rps_sock_flow_table is a global flow table that contains the *desired* CPU for flows: the CPU that is currently processing the flow in userspace. Each table value is a CPU index that is updated during calls to recvmsg -and sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage() -and tcp_splice_read()). +and sendmsg (specifically, inet_recvmsg(), inet_sendmsg() and +tcp_splice_read()). When the scheduler moves a thread to a new CPU while it has outstanding receive packets on the old CPU, packets may arrive out of order. To diff --git a/crypto/af_alg.c b/crypto/af_alg.c index cdb1dcc5dd1a..6218c773d71c 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -482,7 +482,6 @@ static const struct proto_ops alg_proto_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, @@ -1106,33 +1105,6 @@ unlock: } EXPORT_SYMBOL_GPL(af_alg_sendmsg); -/** - * af_alg_sendpage - sendpage system call handler - * @sock: socket of connection to user space to write to - * @page: data to send - * @offset: offset into page to begin sending - * @size: length of data - * @flags: message send/receive flags - * - * This is a generic implementation of sendpage to fill ctx->tsgl_list. - */ -ssize_t af_alg_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { - .msg_flags = flags | MSG_SPLICE_PAGES, - }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return sock_sendmsg(sock, &msg); -} -EXPORT_SYMBOL_GPL(af_alg_sendpage); - /** * af_alg_free_resources - release resources required for crypto request * @areq: Request holding the TX and RX SGL diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 35bfa283748d..7d58cbbce4af 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -9,10 +9,10 @@ * The following concept of the memory management is used: * * The kernel maintains two SGLs, the TX SGL and the RX SGL. The TX SGL is - * filled by user space with the data submitted via sendpage. Filling up - * the TX SGL does not cause a crypto operation -- the data will only be - * tracked by the kernel. Upon receipt of one recvmsg call, the caller must - * provide a buffer which is tracked with the RX SGL. + * filled by user space with the data submitted via sendmsg (maybe with + * MSG_SPLICE_PAGES). Filling up the TX SGL does not cause a crypto operation + * -- the data will only be tracked by the kernel. Upon receipt of one recvmsg + * call, the caller must provide a buffer which is tracked with the RX SGL. * * During the processing of the recvmsg operation, the cipher request is * allocated and prepared. As part of the recvmsg operation, the processed @@ -370,7 +370,6 @@ static struct proto_ops algif_aead_ops = { .release = af_alg_release, .sendmsg = aead_sendmsg, - .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, .poll = af_alg_poll, }; @@ -422,18 +421,6 @@ static int aead_sendmsg_nokey(struct socket *sock, struct msghdr *msg, return aead_sendmsg(sock, msg, size); } -static ssize_t aead_sendpage_nokey(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - int err; - - err = aead_check_key(sock); - if (err) - return err; - - return af_alg_sendpage(sock, page, offset, size, flags); -} - static int aead_recvmsg_nokey(struct socket *sock, struct msghdr *msg, size_t ignored, int flags) { @@ -461,7 +448,6 @@ static struct proto_ops algif_aead_ops_nokey = { .release = af_alg_release, .sendmsg = aead_sendmsg_nokey, - .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, .poll = af_alg_poll, }; diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c index 407408c43730..10c41adac3b1 100644 --- a/crypto/algif_rng.c +++ b/crypto/algif_rng.c @@ -174,7 +174,6 @@ static struct proto_ops algif_rng_ops = { .bind = sock_no_bind, .accept = sock_no_accept, .sendmsg = sock_no_sendmsg, - .sendpage = sock_no_sendpage, .release = af_alg_release, .recvmsg = rng_recvmsg, @@ -192,7 +191,6 @@ static struct proto_ops __maybe_unused algif_rng_test_ops = { .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, - .sendpage = sock_no_sendpage, .release = af_alg_release, .recvmsg = rng_test_recvmsg, diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index b1f321b9f846..9ada9b741af8 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -194,7 +194,6 @@ static struct proto_ops algif_skcipher_ops = { .release = af_alg_release, .sendmsg = skcipher_sendmsg, - .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, .poll = af_alg_poll, }; @@ -246,18 +245,6 @@ static int skcipher_sendmsg_nokey(struct socket *sock, struct msghdr *msg, return skcipher_sendmsg(sock, msg, size); } -static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - int err; - - err = skcipher_check_key(sock); - if (err) - return err; - - return af_alg_sendpage(sock, page, offset, size, flags); -} - static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg, size_t ignored, int flags) { @@ -285,7 +272,6 @@ static struct proto_ops algif_skcipher_ops_nokey = { .release = af_alg_release, .sendmsg = skcipher_sendmsg_nokey, - .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, .poll = af_alg_poll, }; diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index da4818d2c856..68562a82d036 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -569,8 +569,6 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void chtls_splice_eof(struct socket *sock); -int chtls_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int send_tx_flowc_wr(struct sock *sk, int compl, u32 snd_nxt, u32 rcv_nxt); void chtls_tcp_push(struct sock *sk, int flags); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index e08ac960c967..5fc64e47568a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -1246,20 +1246,6 @@ void chtls_splice_eof(struct socket *sock) release_sock(sk); } -int chtls_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - struct bio_vec bvec; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return chtls_sendmsg(sk, &msg, size); -} - static void chtls_select_window(struct sock *sk) { struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 6b6787eafd2f..455a54708be4 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -607,7 +607,6 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.shutdown = chtls_shutdown; chtls_cpl_prot.sendmsg = chtls_sendmsg; chtls_cpl_prot.splice_eof = chtls_splice_eof; - chtls_cpl_prot.sendpage = chtls_sendpage; chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index db67f8e19344..8879e207ff5a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -936,7 +936,7 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, /* * Grab and keep cached pages associated with a file in the svc_rqst - * so that they can be passed to the network sendmsg/sendpage routines + * so that they can be passed to the network sendmsg routines * directly. They will be released after the sending has completed. * * Return values: Number of bytes consumed, or -EIO if there are no diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 34224e77f5a2..ef8ce86b1f78 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -229,8 +229,6 @@ void af_alg_wmem_wakeup(struct sock *sk); int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min); int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unsigned int ivsize); -ssize_t af_alg_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(void *data, int err); __poll_t af_alg_poll(struct file *file, struct socket *sock, diff --git a/include/linux/net.h b/include/linux/net.h index 23324e9a2b3d..41c608c1b02c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -207,8 +207,6 @@ struct proto_ops { size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); - ssize_t (*sendpage) (struct socket *sock, struct page *page, - int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void (*splice_eof)(struct socket *sock); @@ -222,8 +220,6 @@ struct proto_ops { sk_read_actor_t recv_actor); /* This is different from read_sock(), it reads an entire skb at a time. */ int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); - int (*sendpage_locked)(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); @@ -341,10 +337,6 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); -int kernel_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); -int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); /* Routine returns the IP overhead imposed by a (caller-protected) socket. */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index a75333342c4e..b86b8e21de7f 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -36,8 +36,6 @@ void __inet_accept(struct socket *sock, struct socket *newsock, int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); void inet_splice_eof(struct socket *sock); -ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); diff --git a/include/net/sock.h b/include/net/sock.h index 62a1b99da349..121284f455a8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1277,8 +1277,6 @@ struct proto { size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); - int (*sendpage)(struct sock *sk, struct page *page, - int offset, size_t size, int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); @@ -1919,10 +1917,6 @@ int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags); -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); /* * Functions to fill in entries in struct proto_ops when a protocol diff --git a/include/net/tcp.h b/include/net/tcp.h index 31b534370787..226bce6d1e8c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -329,10 +329,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); -int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index a06f4d4a6f47..8978fb6212ff 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1929,7 +1929,6 @@ static const struct proto_ops atalk_dgram_ops = { .sendmsg = atalk_sendmsg, .recvmsg = atalk_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block ddp_notifier = { diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 53e7d3f39e26..66d9a9bd5896 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -126,7 +126,6 @@ static const struct proto_ops pvc_proto_ops = { .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; diff --git a/net/atm/svc.c b/net/atm/svc.c index d83556d8beb9..36a814f1fbd1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -654,7 +654,6 @@ static const struct proto_ops svc_proto_ops = { .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d8da400cb4de..5db805d5f74d 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -2022,7 +2022,6 @@ static const struct proto_ops ax25_proto_ops = { .sendmsg = ax25_sendmsg, .recvmsg = ax25_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4eebcc66c19a..9c82698da4f5 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -976,7 +976,6 @@ static const struct proto_ops caif_seqpacket_ops = { .sendmsg = caif_seqpkt_sendmsg, .recvmsg = caif_seqpkt_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct proto_ops caif_stream_ops = { @@ -996,7 +995,6 @@ static const struct proto_ops caif_stream_ops = { .sendmsg = caif_stream_sendmsg, .recvmsg = caif_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* This function is called when a socket is finally destroyed. */ diff --git a/net/can/bcm.c b/net/can/bcm.c index a962ec2b8ba5..9ba35685b043 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1703,7 +1703,6 @@ static const struct proto_ops bcm_ops = { .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto bcm_proto __read_mostly = { diff --git a/net/can/isotp.c b/net/can/isotp.c index 84f9aba02901..1f25b45868cf 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1699,7 +1699,6 @@ static const struct proto_ops isotp_ops = { .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto isotp_proto __read_mostly = { diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 35970c25496a..feaec4ad6d16 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1306,7 +1306,6 @@ static const struct proto_ops j1939_ops = { .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto j1939_proto __read_mostly = { diff --git a/net/can/raw.c b/net/can/raw.c index f64469b98260..15c79b079184 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -962,7 +962,6 @@ static const struct proto_ops raw_ops = { .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto raw_proto __read_mostly = { diff --git a/net/core/sock.c b/net/core/sock.c index 5f1747c12004..de719094b804 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3261,36 +3261,6 @@ void __receive_sock(struct file *file) } } -ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg(sock, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage); - -ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - ssize_t res; - struct msghdr msg = {.msg_flags = flags}; - struct kvec iov; - char *kaddr = kmap(page); - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); - kunmap(page); - return res; -} -EXPORT_SYMBOL(sock_no_sendpage_locked); - /* * Default Socket Callbacks */ @@ -4046,7 +4016,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " - "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), @@ -4067,7 +4037,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto) proto_method_implemented(proto->getsockopt), proto_method_implemented(proto->sendmsg), proto_method_implemented(proto->recvmsg), - proto_method_implemented(proto->sendpage), proto_method_implemented(proto->bind), proto_method_implemented(proto->backlog_rcv), proto_method_implemented(proto->hash), @@ -4088,7 +4057,7 @@ static int proto_seq_show(struct seq_file *seq, void *v) "maxhdr", "slab", "module", - "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); else proto_seq_printf(seq, list_entry(v, struct proto, node)); return 0; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3ab68415d121..fa8079303cb0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1010,7 +1010,6 @@ static const struct proto_ops inet_dccp_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct inet_protosw dccp_v4_protosw = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 93c98990d726..7249ef218178 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1087,7 +1087,6 @@ static const struct proto_ops inet6_dccp_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 9c124705120d..00302e8b9615 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -426,7 +426,6 @@ static const struct proto_ops ieee802154_raw_ops = { .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* DGRAM Sockets (802.15.4 dataframes) */ @@ -989,7 +988,6 @@ static const struct proto_ops ieee802154_dgram_ops = { .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static void ieee802154_sock_destruct(struct sock *sk) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 38e649fb4474..9b2ca2fcc5a1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -847,23 +847,6 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); -ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) -{ - struct sock *sk = sock->sk; - const struct proto *prot; - - if (unlikely(inet_send_prepare(sk))) - return -EAGAIN; - - /* IPV6_ADDRFORM can change sk->sk_prot under us. */ - prot = READ_ONCE(sk->sk_prot); - if (prot->sendpage) - return prot->sendpage(sk, page, offset, size, flags); - return sock_no_sendpage(sock, page, offset, size, flags); -} -EXPORT_SYMBOL(inet_sendpage); - INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, @@ -1067,12 +1050,10 @@ const struct proto_ops inet_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, - .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, @@ -1102,7 +1083,6 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, @@ -1134,7 +1114,6 @@ static const struct proto_ops inet_sockraw_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d56edc2c885f..e03e08745308 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -923,11 +923,10 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -/* In some cases, both sendpage() and sendmsg() could have added - * an skb to the write queue, but failed adding payload on it. - * We need to remove it to consume less memory, but more - * importantly be able to generate EPOLLOUT for Edge Trigger epoll() - * users. +/* In some cases, both sendmsg() could have added an skb to the write queue, + * but failed adding payload on it. We need to remove it to consume less + * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger + * epoll() users. */ void tcp_remove_empty_skb(struct sock *sk) { @@ -975,40 +974,6 @@ int tcp_wmem_schedule(struct sock *sk, int copy) return min(copy, sk->sk_forward_alloc); } -int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (!(sk->sk_route_caps & NETIF_F_SG)) - return sock_no_sendpage_locked(sk, page, offset, size, flags); - - tcp_rate_check_app_limited(sk); /* is sending application-limited? */ - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - return tcp_sendmsg_locked(sk, &msg, size); -} -EXPORT_SYMBOL_GPL(tcp_sendpage_locked); - -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - int ret; - - lock_sock(sk); - ret = tcp_sendpage_locked(sk, page, offset, size, flags); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL(tcp_sendpage); - void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 31d6005cea9b..81f0dff69e0b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -486,7 +486,7 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) long timeo; int flags; - /* Don't let internal sendpage flags through */ + /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; @@ -566,23 +566,6 @@ out_err: return copied ? copied : err; } -static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { - .msg_flags = flags | MSG_SPLICE_PAGES, - }; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - return tcp_bpf_sendmsg(sk, &msg, size); -} - enum { TCP_BPF_IPV4, TCP_BPF_IPV6, @@ -612,7 +595,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; - prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; @@ -647,8 +629,7 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && - ops->sendmsg == tcp_sendmsg && - ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; + ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9213804b034f..fd365de4d5ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3117,7 +3117,6 @@ struct proto tcp_prot = { .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, - .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 48fdcd3cad9c..42a96b3547c9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1340,20 +1340,6 @@ void udp_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(udp_splice_eof); -int udp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return udp_sendmsg(sk, &msg, size); -} - #define UDP_SKB_IS_STATELESS 0x80000000 /* all head states (dst, sk, nf conntrack) except skb extensions are @@ -2933,7 +2919,6 @@ struct proto udp_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .splice_eof = udp_splice_eof, - .sendpage = udp_sendpage, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 4ba7a88a1b1d..e1ff3a375996 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -19,8 +19,6 @@ int udp_getsockopt(struct sock *sk, int level, int optname, int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); -int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, - int flags); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 143f93a12f25..39ecdad1b50c 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -56,7 +56,6 @@ struct proto udplite_prot = { .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, - .sendpage = udp_sendpage, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b3451cf47d29..5d593ddc0347 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -696,9 +696,7 @@ const struct proto_ops inet6_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, - .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, @@ -729,7 +727,6 @@ const struct proto_ops inet6_dgram_ops = { .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_skb = udp_read_skb, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index c9caeb5a43ed..ac1cef094c5f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1296,7 +1296,6 @@ const struct proto_ops inet6_sockraw_ops = { .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c17c8ff94b79..40dd92a2f480 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2151,7 +2151,6 @@ struct proto tcpv6_prot = { .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, - .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d0537c1c8cd7..393f01b2a7e6 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -963,24 +963,6 @@ static void kcm_splice_eof(struct socket *sock) release_sock(sk); } -static ssize_t kcm_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) - -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - if (flags & MSG_OOB) - return -EOPNOTSUPP; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return kcm_sendmsg(sock, &msg, size); -} - static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -1769,7 +1751,6 @@ static const struct proto_ops kcm_dgram_ops = { .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, - .sendpage = kcm_sendpage, }; static const struct proto_ops kcm_seqpacket_ops = { @@ -1791,7 +1772,6 @@ static const struct proto_ops kcm_seqpacket_ops = { .recvmsg = kcm_recvmsg, .mmap = sock_no_mmap, .splice_eof = kcm_splice_eof, - .sendpage = kcm_sendpage, .splice_read = kcm_splice_read, }; diff --git a/net/key/af_key.c b/net/key/af_key.c index 31ab12fd720a..ede3c6a60353 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3761,7 +3761,6 @@ static const struct proto_ops pfkey_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, /* Now the operations that really occur. */ .release = pfkey_release, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 2b795c1064f5..f9073bc7281f 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -624,7 +624,6 @@ static const struct proto_ops l2tp_ip_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct inet_protosw l2tp_ip_protosw = { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 5137ea1861ce..b1623f9c4f92 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -751,7 +751,6 @@ static const struct proto_ops l2tp_ip6_ops = { .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 9ffbc667be6c..57c35c960b2c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1232,7 +1232,6 @@ static const struct proto_ops llc_ui_ops = { .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const char llc_proc_err_msg[] __initconst = diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index bb4bd0b6a4f7..f6be58b68c6f 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -485,7 +485,6 @@ static const struct proto_ops mctp_dgram_ops = { .sendmsg = mctp_sendmsg, .recvmsg = mctp_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = mctp_compat_ioctl, #endif diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bd023debedc8..e892673deb73 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3866,7 +3866,6 @@ static const struct proto_ops mptcp_stream_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = inet_sendpage, }; static struct inet_protosw mptcp_protosw = { @@ -3961,7 +3960,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, - .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index cbd9aa7ee24a..39cfb778ebc5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2815,7 +2815,6 @@ static const struct proto_ops netlink_ops = { .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct net_proto_family netlink_family_ops = { diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 5a4cb796150f..eb8ccbd58df7 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1364,7 +1364,6 @@ static const struct proto_ops nr_proto_ops = { .sendmsg = nr_sendmsg, .recvmsg = nr_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block nr_dev_notifier = { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a2dbeb264f26..85ff90a03b0c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4621,7 +4621,6 @@ static const struct proto_ops packet_ops_spkt = { .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static const struct proto_ops packet_ops = { @@ -4643,7 +4642,6 @@ static const struct proto_ops packet_ops = { .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, - .sendpage = sock_no_sendpage, }; static const struct net_proto_family packet_family_ops = { diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 967f9b4dc026..1018340d89a7 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -441,7 +441,6 @@ const struct proto_ops phonet_dgram_ops = { .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; const struct proto_ops phonet_stream_ops = { @@ -462,7 +461,6 @@ const struct proto_ops phonet_stream_ops = { .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; EXPORT_SYMBOL(phonet_stream_ops); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 76f0434d3d06..78beb74146e7 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -1244,7 +1244,6 @@ static const struct proto_ops qrtr_proto_ops = { .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto qrtr_proto = { diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 3ff6995244e5..01c4cdfef45d 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -653,7 +653,6 @@ static const struct proto_ops rds_proto_ops = { .sendmsg = rds_sendmsg, .recvmsg = rds_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static void rds_sock_destruct(struct sock *sk) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ca2b17f32670..49dafe9ac72f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1496,7 +1496,6 @@ static const struct proto_ops rose_proto_ops = { .sendmsg = rose_sendmsg, .recvmsg = rose_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct notifier_block rose_dev_notifier = { diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index da0b3b5157d5..f2cf4aa99db2 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -954,7 +954,6 @@ static const struct proto_ops rxrpc_rpc_ops = { .sendmsg = rxrpc_sendmsg, .recvmsg = rxrpc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct proto rxrpc_proto = { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 664d1f2e9121..274d07bd774f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1133,7 +1133,6 @@ static const struct proto_ops inet_seqpacket_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; /* Registration with AF_INET family. */ diff --git a/net/socket.c b/net/socket.c index b778fc03c6e0..8c3c8b29995a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3552,54 +3552,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) } EXPORT_SYMBOL(kernel_getpeername); -/** - * kernel_sendpage - send a &page through a socket (kernel space) - * @sock: socket - * @page: page - * @offset: page offset - * @size: total size in bytes - * @flags: flags (MSG_DONTWAIT, ...) - * - * Returns the total amount sent in bytes or an error. - */ - -int kernel_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) -{ - if (sock->ops->sendpage) { - /* Warn in case the improper page to zero-copy send */ - WARN_ONCE(!sendpage_ok(page), "improper page for zero-copy send"); - return sock->ops->sendpage(sock, page, offset, size, flags); - } - return sock_no_sendpage(sock, page, offset, size, flags); -} -EXPORT_SYMBOL(kernel_sendpage); - -/** - * kernel_sendpage_locked - send a &page through the locked sock (kernel space) - * @sk: sock - * @page: page - * @offset: page offset - * @size: total size in bytes - * @flags: flags (MSG_DONTWAIT, ...) - * - * Returns the total amount sent in bytes or an error. - * Caller must hold @sk. - */ - -int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct socket *sock = sk->sk_socket; - - if (sock->ops->sendpage_locked) - return sock->ops->sendpage_locked(sk, page, offset, size, - flags); - - return sock_no_sendpage_locked(sk, page, offset, size, flags); -} -EXPORT_SYMBOL(kernel_sendpage_locked); - /** * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket diff --git a/net/tipc/socket.c b/net/tipc/socket.c index dd73d71c02a9..ef8e5139a873 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3375,7 +3375,6 @@ static const struct proto_ops msg_ops = { .sendmsg = tipc_sendmsg, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct proto_ops packet_ops = { @@ -3396,7 +3395,6 @@ static const struct proto_ops packet_ops = { .sendmsg = tipc_send_packet, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct proto_ops stream_ops = { @@ -3417,7 +3415,6 @@ static const struct proto_ops stream_ops = { .sendmsg = tipc_sendstream, .recvmsg = tipc_recvstream, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage }; static const struct net_proto_family tipc_family_ops = { diff --git a/net/tls/tls.h b/net/tls/tls.h index d002c3af1966..86cef1c68e03 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -98,10 +98,6 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_sw_splice_eof(struct socket *sock); -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); @@ -117,8 +113,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); int tls_tx_records(struct sock *sk, int flags); void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 975299d7213b..840ee06f1708 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -621,23 +621,6 @@ void tls_device_splice_eof(struct socket *sock) mutex_unlock(&tls_ctx->tx_lock); } -int tls_device_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - if (flags & MSG_OOB) - return -EOPNOTSUPP; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_device_sendmsg(sk, &msg, size); -} - struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b9c83dd7de2..d5ed4d47b16e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -958,7 +958,6 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_SW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; ops[TLS_SW ][TLS_BASE].splice_eof = tls_sw_splice_eof; - ops[TLS_SW ][TLS_BASE].sendpage_locked = tls_sw_sendpage_locked; ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; @@ -970,17 +969,14 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; - ops[TLS_HW ][TLS_BASE].sendpage_locked = NULL; ops[TLS_HW ][TLS_SW ] = ops[TLS_BASE][TLS_SW ]; - ops[TLS_HW ][TLS_SW ].sendpage_locked = NULL; ops[TLS_BASE][TLS_HW ] = ops[TLS_BASE][TLS_SW ]; ops[TLS_SW ][TLS_HW ] = ops[TLS_SW ][TLS_SW ]; ops[TLS_HW ][TLS_HW ] = ops[TLS_HW ][TLS_SW ]; - ops[TLS_HW ][TLS_HW ].sendpage_locked = NULL; #endif #ifdef CONFIG_TLS_TOE ops[TLS_HW_RECORD][TLS_HW_RECORD] = *base; @@ -1029,7 +1025,6 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; prot[TLS_SW][TLS_BASE].splice_eof = tls_sw_splice_eof; - prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; @@ -1045,12 +1040,10 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_BASE].splice_eof = tls_device_splice_eof; - prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].splice_eof = tls_device_splice_eof; - prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 319f61590d2c..9b3aa89a4292 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1281,41 +1281,6 @@ unlock: mutex_unlock(&tls_ctx->tx_lock); } -int tls_sw_sendpage_locked(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY | - MSG_NO_SHARED_FRAGS)) - return -EOPNOTSUPP; - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_sw_sendmsg_locked(sk, &msg, size); -} - -int tls_sw_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, }; - - if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) - return -EOPNOTSUPP; - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return tls_sw_sendmsg(sk, &msg, size); -} - static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, bool released) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f9d196439b49..f2f234f0b92c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -758,8 +758,6 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); -static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, - size_t size, int flags); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); @@ -852,7 +850,6 @@ static const struct proto_ops unix_stream_ops = { .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, - .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, @@ -878,7 +875,6 @@ static const struct proto_ops unix_dgram_ops = { .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -902,7 +898,6 @@ static const struct proto_ops unix_seqpacket_ops = { .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_peek_off = unix_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; @@ -2294,20 +2289,6 @@ out_err: return sent ? : err; } -static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, - int offset, size_t size, int flags) -{ - struct bio_vec bvec; - struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; - - if (flags & MSG_SENDPAGE_NOTLAST) - msg.msg_flags |= MSG_MORE; - - bvec_set_page(&bvec, page, size, offset); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); - return unix_stream_sendmsg(socket, &msg, size); -} - static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index efb8a0937a13..020cf17ab7e4 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1306,7 +1306,6 @@ static const struct proto_ops vsock_dgram_ops = { .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .read_skb = vsock_read_skb, }; @@ -2234,7 +2233,6 @@ static const struct proto_ops vsock_stream_ops = { .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; @@ -2257,7 +2255,6 @@ static const struct proto_ops vsock_seqpacket_ops = { .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, .read_skb = vsock_read_skb, }; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5c7ad301d742..0fb5143bec7a 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1757,7 +1757,6 @@ static const struct proto_ops x25_proto_ops = { .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, }; static struct packet_type x25_packet_type __read_mostly = { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cc1e7f15fa73..5a8c0dd250af 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1389,7 +1389,6 @@ static const struct proto_ops xsk_proto_ops = { .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) -- cgit v1.2.3