From b841b901c452d92610f739a36e54978453528876 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:10 +0100 Subject: net: Declare MSG_SPLICE_PAGES internal sendmsg() flag Declare MSG_SPLICE_PAGES, an internal sendmsg() flag, that hints to a network protocol that it should splice pages from the source iterator rather than copying the data if it can. This flag is added to a list that is cleared by sendmsg syscalls on entry. This is intended as a replacement for the ->sendpage() op, allowing a way to splice in several multipage folios in one go. Signed-off-by: David Howells Reviewed-by: Willem de Bruijn cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 13c3a237b9c9..bd1cc3238851 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -327,6 +327,7 @@ struct ucred { */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through @@ -337,6 +338,8 @@ struct ucred { #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif +/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ +#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 -- cgit v1.2.3 From 96449f90240713bd9bd653d6b15266a1044cfa7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:11 +0100 Subject: net: Pass max frags into skb_append_pagefrags() Pass the maximum number of fragments into skb_append_pagefrags() rather than using MAX_SKB_FRAGS so that it can be used from code that wants to specify sysctl_max_skb_frags. Signed-off-by: David Howells cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/unix/af_unix.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8cff3d817131..15011408c47c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1383,7 +1383,7 @@ static inline int skb_pad(struct sk_buff *skb, int pad) #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size); + int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6724a84ebb09..7f53dcb26ad3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4188,13 +4188,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size) + int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - } else if (i < MAX_SKB_FRAGS) { + } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 61892268e8a6..52fc840898d8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1450,7 +1450,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (len > size) len = size; - if (skb_append_pagefrags(skb, page, offset, len)) { + if (skb_append_pagefrags(skb, page, offset, len, + MAX_SKB_FRAGS)) { err = -EMSGSIZE; goto error; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cc695c9f09ec..dd55506b4632 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2349,7 +2349,7 @@ alloc_skb: newskb = NULL; } - if (skb_append_pagefrags(skb, page, offset, size)) { + if (skb_append_pagefrags(skb, page, offset, size, MAX_SKB_FRAGS)) { tail = skb; goto alloc_skb; } -- cgit v1.2.3 From 2e910b95329c2dc7feffbec00907f9e02d1a850a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:12 +0100 Subject: net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES Add a function to handle MSG_SPLICE_PAGES being passed internally to sendmsg(). Pages are spliced into the given socket buffer if possible and copied in if not (e.g. they're slab pages or have a zero refcount). Signed-off-by: David Howells cc: David Ahern cc: Al Viro cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 ++ net/core/skbuff.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 15011408c47c..1b2ebf6113e0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5097,5 +5097,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) #endif } +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp); + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7f53dcb26ad3..f4a5b51aed22 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6892,3 +6892,91 @@ nodefer: __kfree_skb(skb); if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, + size_t offset, size_t len) +{ + const char *kaddr; + __wsum csum; + + kaddr = kmap_local_page(page); + csum = csum_partial(kaddr + offset, len, 0); + kunmap_local(kaddr); + skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * @gfp: Allocation flags + * + * This is a common helper function for supporting MSG_SPLICE_PAGES. It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. + */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp) +{ + size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr; + size_t off, len; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); -- cgit v1.2.3