From 559ae55cfc334c91c62d2ea054a3345611363ee0 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 9 May 2023 09:12:07 +0200 Subject: net: skbuff: remove special handling for SLOB Commit c9929f0e344a ("mm/slob: remove CONFIG_SLOB") removes CONFIG_SLOB. Now, we can also remove special handling for socket buffers with the SLOB allocator. The code with HAVE_SKB_SMALL_HEAD_CACHE=1 is now the default behavior for all allocators. Remove an unnecessary distinction between SLOB and SLAB/SLUB allocator after the SLOB allocator is gone. Signed-off-by: Lukas Bulwahn Reviewed-by: Leon Romanovsky Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230509071207.28942-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26a586007d8b..a82b33b1555e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -92,15 +92,7 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init; static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -/* skb_small_head_cache and related code is only supported - * for CONFIG_SLAB and CONFIG_SLUB. - * As soon as SLOB is removed from the kernel, we can clean up this. - */ -#if !defined(CONFIG_SLOB) -# define HAVE_SKB_SMALL_HEAD_CACHE 1 -#endif -#ifdef HAVE_SKB_SMALL_HEAD_CACHE static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) @@ -117,7 +109,6 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -#endif /* HAVE_SKB_SMALL_HEAD_CACHE */ int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -562,7 +553,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, void *obj; obj_size = SKB_HEAD_ALIGN(*size); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { obj = kmem_cache_alloc_node(skb_small_head_cache, @@ -576,7 +566,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); goto out; } -#endif *size = obj_size = kmalloc_size_roundup(obj_size); /* * Try a regular allocation, when that fails and we're not entitled @@ -898,11 +887,9 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) static void skb_kfree_head(void *head, unsigned int end_offset) { -#ifdef HAVE_SKB_SMALL_HEAD_CACHE if (end_offset == SKB_SMALL_HEAD_HEADROOM) kmem_cache_free(skb_small_head_cache, head); else -#endif kfree(head); } @@ -2160,7 +2147,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) if (likely(skb_end_offset(skb) == saved_end_offset)) return 0; -#ifdef HAVE_SKB_SMALL_HEAD_CACHE /* We can not change skb->end if the original or new value * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). */ @@ -2174,7 +2160,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) WARN_ON_ONCE(1); return 0; } -#endif shinfo = skb_shinfo(skb); @@ -4768,7 +4753,6 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -#ifdef HAVE_SKB_SMALL_HEAD_CACHE /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. @@ -4780,7 +4764,6 @@ void __init skb_init(void) 0, SKB_SMALL_HEAD_HEADROOM, NULL); -#endif skb_extensions_init(); } -- cgit v1.2.3 From b51f4113ebb02011f0ca86abc3134b28d2071b6a Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Thu, 11 May 2023 09:12:12 +0800 Subject: net: introduce and use skb_frag_fill_page_desc() Most users use __skb_frag_set_page()/skb_frag_off_set()/ skb_frag_size_set() to fill the page desc for a skb frag. Introduce skb_frag_fill_page_desc() to do that. net/bpf/test_run.c does not call skb_frag_off_set() to set the offset, "copy_from_user(page_address(page), ...)" and 'shinfo' being part of the 'data' kzalloced in bpf_test_init() suggest that it is assuming offset to be initialized as zero, so call skb_frag_fill_page_desc() with offset being zero for this case. Also, skb_frag_set_page() is not used anymore, so remove it. Signed-off-by: Yunsheng Lin Reviewed-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 6 ++--- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++-- drivers/net/ethernet/chelsio/cxgb3/sge.c | 5 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 32 +++++++++++++----------- drivers/net/ethernet/freescale/enetc/enetc.c | 5 ++-- drivers/net/ethernet/fungible/funeth/funeth_rx.c | 5 ++-- drivers/net/ethernet/marvell/mvneta.c | 5 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 +-- drivers/net/ethernet/sun/cassini.c | 8 ++---- drivers/net/virtio_net.c | 4 +-- drivers/net/vmxnet3/vmxnet3_drv.c | 4 +-- drivers/net/xen-netback/netback.c | 4 +-- include/linux/skbuff.h | 27 ++++++++------------ net/bpf/test_run.c | 3 +-- net/core/gro.c | 4 +-- net/core/pktgen.c | 13 ++++++---- net/core/skbuff.c | 7 +++--- net/tls/tls_device.c | 10 +++----- net/xfrm/xfrm_ipcomp.c | 5 +--- 19 files changed, 64 insertions(+), 92 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 7f933175cbda..4de22eed099a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -532,10 +532,10 @@ static bool aq_add_rx_fragment(struct device *dev, buff_->rxdata.pg_off, buff_->len, DMA_FROM_DEVICE); - skb_frag_off_set(frag, buff_->rxdata.pg_off); - skb_frag_size_set(frag, buff_->len); sinfo->xdp_frags_size += buff_->len; - __skb_frag_set_page(frag, buff_->rxdata.page); + skb_frag_fill_page_desc(frag, buff_->rxdata.page, + buff_->rxdata.pg_off, + buff_->len); buff_->is_cleaned = 1; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dcd9367f05af..efaff5018af8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1085,9 +1085,8 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; cons_rx_buf = &rxr->rx_agg_ring[cons]; - skb_frag_off_set(frag, cons_rx_buf->offset); - skb_frag_size_set(frag, frag_len); - __skb_frag_set_page(frag, cons_rx_buf->page); + skb_frag_fill_page_desc(frag, cons_rx_buf->page, + cons_rx_buf->offset, frag_len); shinfo->nr_frags = i + 1; __clear_bit(cons, rxr->rx_agg_bmap); diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index efa7f401529e..2e9a74fe0970 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -2184,9 +2184,8 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, len -= offset; rx_frag += nr_frags; - __skb_frag_set_page(rx_frag, sd->pg_chunk.page); - skb_frag_off_set(rx_frag, sd->pg_chunk.offset + offset); - skb_frag_size_set(rx_frag, len); + skb_frag_fill_page_desc(rx_frag, sd->pg_chunk.page, + sd->pg_chunk.offset + offset, len); skb->len += len; skb->data_len += len; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7e408bcc88de..3164ed205cf7 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2343,11 +2343,10 @@ static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, hdr_len = ETH_HLEN; memcpy(skb->data, start, hdr_len); skb_shinfo(skb)->nr_frags = 1; - skb_frag_set_page(skb, 0, page_info->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[0], - page_info->page_offset + hdr_len); - skb_frag_size_set(&skb_shinfo(skb)->frags[0], - curr_frag_len - hdr_len); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[0], + page_info->page, + page_info->page_offset + hdr_len, + curr_frag_len - hdr_len); skb->data_len = curr_frag_len - hdr_len; skb->truesize += rx_frag_size; skb->tail += hdr_len; @@ -2369,16 +2368,17 @@ static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, if (page_info->page_offset == 0) { /* Fresh page */ j++; - skb_frag_set_page(skb, j, page_info->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[j], - page_info->page_offset); - skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[j], + page_info->page, + page_info->page_offset, + curr_frag_len); skb_shinfo(skb)->nr_frags++; } else { put_page(page_info->page); + skb_frag_size_add(&skb_shinfo(skb)->frags[j], + curr_frag_len); } - skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); skb->len += curr_frag_len; skb->data_len += curr_frag_len; skb->truesize += rx_frag_size; @@ -2451,14 +2451,16 @@ static void be_rx_compl_process_gro(struct be_rx_obj *rxo, if (i == 0 || page_info->page_offset == 0) { /* First frag or Fresh page */ j++; - skb_frag_set_page(skb, j, page_info->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[j], - page_info->page_offset); - skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[j], + page_info->page, + page_info->page_offset, + curr_frag_len); } else { put_page(page_info->page); + skb_frag_size_add(&skb_shinfo(skb)->frags[j], + curr_frag_len); } - skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); + skb->truesize += rx_frag_size; remaining -= curr_frag_len; memset(page_info, 0, sizeof(*page_info)); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 3c4fa26f0f9b..63854294ac33 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1445,9 +1445,8 @@ static void enetc_add_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, xdp_buff_set_frag_pfmemalloc(xdp_buff); frag = &shinfo->frags[shinfo->nr_frags]; - skb_frag_off_set(frag, rx_swbd->page_offset); - skb_frag_size_set(frag, size); - __skb_frag_set_page(frag, rx_swbd->page); + skb_frag_fill_page_desc(frag, rx_swbd->page, rx_swbd->page_offset, + size); shinfo->nr_frags++; } diff --git a/drivers/net/ethernet/fungible/funeth/funeth_rx.c b/drivers/net/ethernet/fungible/funeth/funeth_rx.c index 29a6c2ede43a..7e2584895de3 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_rx.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c @@ -323,9 +323,8 @@ static int fun_gather_pkt(struct funeth_rxq *q, unsigned int tot_len, if (ref_ok) ref_ok |= buf->node; - __skb_frag_set_page(frags, buf->page); - skb_frag_off_set(frags, q->buf_offset); - skb_frag_size_set(frags++, frag_len); + skb_frag_fill_page_desc(frags++, buf->page, q->buf_offset, + frag_len); tot_len -= frag_len; if (!tot_len) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 6c6b66d3ea6e..e2abc00d0472 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2376,9 +2376,8 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp, if (data_len > 0 && sinfo->nr_frags < MAX_SKB_FRAGS) { skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags++]; - skb_frag_off_set(frag, pp->rx_offset_correction); - skb_frag_size_set(frag, data_len); - __skb_frag_set_page(frag, page); + skb_frag_fill_page_desc(frag, page, + pp->rx_offset_correction, data_len); if (!xdp_buff_has_frags(xdp)) { sinfo->xdp_frags_size = *size; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 69634829558e..704b022cd1f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -491,9 +491,7 @@ mlx5e_add_skb_shared_info_frag(struct mlx5e_rq *rq, struct skb_shared_info *sinf } frag = &sinfo->frags[sinfo->nr_frags++]; - __skb_frag_set_page(frag, frag_page->page); - skb_frag_off_set(frag, frag_offset); - skb_frag_size_set(frag, len); + skb_frag_fill_page_desc(frag, frag_page->page, frag_offset, len); if (page_is_pfmemalloc(frag_page->page)) xdp_buff_set_frag_pfmemalloc(xdp); diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index 4ef05bad4613..2d52f54ebb45 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -1998,10 +1998,8 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc, skb->truesize += hlen - swivel; skb->len += hlen - swivel; - __skb_frag_set_page(frag, page->buffer); + skb_frag_fill_page_desc(frag, page->buffer, off, hlen - swivel); __skb_frag_ref(frag); - skb_frag_off_set(frag, off); - skb_frag_size_set(frag, hlen - swivel); /* any more data? */ if ((words[0] & RX_COMP1_SPLIT_PKT) && ((dlen -= hlen) > 0)) { @@ -2024,10 +2022,8 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc, skb->len += hlen; frag++; - __skb_frag_set_page(frag, page->buffer); + skb_frag_fill_page_desc(frag, page->buffer, 0, hlen); __skb_frag_ref(frag); - skb_frag_off_set(frag, 0); - skb_frag_size_set(frag, hlen); RX_USED_ADD(page, hlen + cp->crc_size); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 97241006d64a..29eccc8ff41f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1272,9 +1272,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, } frag = &shinfo->frags[shinfo->nr_frags++]; - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, offset); - skb_frag_size_set(frag, len); + skb_frag_fill_page_desc(frag, page, offset, len); if (page_is_pfmemalloc(page)) xdp_buff_set_frag_pfmemalloc(xdp); diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index f2b76ee866a4..7fa74b8b2100 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -686,9 +686,7 @@ vmxnet3_append_frag(struct sk_buff *skb, struct Vmxnet3_RxCompDesc *rcd, BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); - __skb_frag_set_page(frag, rbi->page); - skb_frag_off_set(frag, 0); - skb_frag_size_set(frag, rcd->len); + skb_frag_fill_page_desc(frag, rbi->page, 0, rcd->len); skb->data_len += rcd->len; skb->truesize += PAGE_SIZE; skb_shinfo(skb)->nr_frags++; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index c1501f41e2d8..3d79b35eb577 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1128,9 +1128,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s BUG(); offset += len; - __skb_frag_set_page(&frags[i], page); - skb_frag_off_set(&frags[i], 0); - skb_frag_size_set(&frags[i], len); + skb_frag_fill_page_desc(&frags[i], page, 0, len); } /* Release all the original (foreign) frags. */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 738776ab8838..30be21c7d05f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2411,6 +2411,15 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb) return skb_headlen(skb) + __skb_pagelen(skb); } +static inline void skb_frag_fill_page_desc(skb_frag_t *frag, + struct page *page, + int off, int size) +{ + frag->bv_page = page; + frag->bv_offset = off; + skb_frag_size_set(frag, size); +} + static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) @@ -2422,9 +2431,7 @@ static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ - frag->bv_page = page; - frag->bv_offset = off; - skb_frag_size_set(frag, size); + skb_frag_fill_page_desc(frag, page, off, size); } /** @@ -3496,20 +3503,6 @@ static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page) frag->bv_page = page; } -/** - * skb_frag_set_page - sets the page contained in a paged fragment of an skb - * @skb: the buffer - * @f: the fragment offset - * @page: the page to set - * - * Sets the @f'th fragment of @skb to contain @page. - */ -static inline void skb_frag_set_page(struct sk_buff *skb, int f, - struct page *page) -{ - __skb_frag_set_page(&skb_shinfo(skb)->frags[f], page); -} - bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e79e3a415ca9..98143b86a9dd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1415,11 +1415,10 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, } frag = &sinfo->frags[sinfo->nr_frags++]; - __skb_frag_set_page(frag, page); data_len = min_t(u32, kattr->test.data_size_in - size, PAGE_SIZE); - skb_frag_size_set(frag, data_len); + skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, data_len)) { diff --git a/net/core/gro.c b/net/core/gro.c index 2d84165cb4f1..6783a47a6136 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -239,9 +239,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); + skb_frag_fill_page_desc(frag, page, first_offset, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 760238196db1..f56b8d697014 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2785,14 +2785,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, break; } get_page(pkt_dev->page); - skb_frag_set_page(skb, i, pkt_dev->page); - skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); + /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_frag_size_set(&skb_shinfo(skb)->frags[i], - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, + (datalen < PAGE_SIZE ? + datalen : PAGE_SIZE)); else - skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], + pkt_dev->page, 0, frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 01b48e68aca0..6724a84ebb09 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4234,10 +4234,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) struct page *page; page = virt_to_head_page(frag_skb->head); - __skb_frag_set_page(&head_frag, page); - skb_frag_off_set(&head_frag, frag_skb->data - - (unsigned char *)page_address(page)); - skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); + skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - + (unsigned char *)page_address(page), + skb_headlen(frag_skb)); return head_frag; } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7cc4f9faac2..daeff54bdbfa 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -268,9 +268,8 @@ static void tls_append_frag(struct tls_record_info *record, skb_frag_size_add(frag, size); } else { ++frag; - __skb_frag_set_page(frag, pfrag->page); - skb_frag_off_set(frag, pfrag->offset); - skb_frag_size_set(frag, size); + skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, + size); ++record->num_frags; get_page(pfrag->page); } @@ -357,9 +356,8 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, return -ENOMEM; frag = &record->frags[0]; - __skb_frag_set_page(frag, pfrag->page); - skb_frag_off_set(frag, pfrag->offset); - skb_frag_size_set(frag, prepend_size); + skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, + prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 80143360bf09..9c0fa0e1786a 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -74,14 +74,11 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) if (!page) return -ENOMEM; - __skb_frag_set_page(frag, page); - len = PAGE_SIZE; if (dlen < len) len = dlen; - skb_frag_off_set(frag, 0); - skb_frag_size_set(frag, len); + skb_frag_fill_page_desc(frag, page, 0, len); memcpy(skb_frag_address(frag), scratch, len); skb->truesize += len; -- cgit v1.2.3 From 96449f90240713bd9bd653d6b15266a1044cfa7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:11 +0100 Subject: net: Pass max frags into skb_append_pagefrags() Pass the maximum number of fragments into skb_append_pagefrags() rather than using MAX_SKB_FRAGS so that it can be used from code that wants to specify sysctl_max_skb_frags. Signed-off-by: David Howells cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/unix/af_unix.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8cff3d817131..15011408c47c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1383,7 +1383,7 @@ static inline int skb_pad(struct sk_buff *skb, int pad) #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size); + int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6724a84ebb09..7f53dcb26ad3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4188,13 +4188,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); int skb_append_pagefrags(struct sk_buff *skb, struct page *page, - int offset, size_t size) + int offset, size_t size, size_t max_frags) { int i = skb_shinfo(skb)->nr_frags; if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); - } else if (i < MAX_SKB_FRAGS) { + } else if (i < max_frags) { skb_zcopy_downgrade_managed(skb); get_page(page); skb_fill_page_desc_noacc(skb, i, page, offset, size); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 61892268e8a6..52fc840898d8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1450,7 +1450,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (len > size) len = size; - if (skb_append_pagefrags(skb, page, offset, len)) { + if (skb_append_pagefrags(skb, page, offset, len, + MAX_SKB_FRAGS)) { err = -EMSGSIZE; goto error; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cc695c9f09ec..dd55506b4632 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2349,7 +2349,7 @@ alloc_skb: newskb = NULL; } - if (skb_append_pagefrags(skb, page, offset, size)) { + if (skb_append_pagefrags(skb, page, offset, size, MAX_SKB_FRAGS)) { tail = skb; goto alloc_skb; } -- cgit v1.2.3 From 2e910b95329c2dc7feffbec00907f9e02d1a850a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:12 +0100 Subject: net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES Add a function to handle MSG_SPLICE_PAGES being passed internally to sendmsg(). Pages are spliced into the given socket buffer if possible and copied in if not (e.g. they're slab pages or have a zero refcount). Signed-off-by: David Howells cc: David Ahern cc: Al Viro cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 ++ net/core/skbuff.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'net/core/skbuff.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 15011408c47c..1b2ebf6113e0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5097,5 +5097,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) #endif } +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp); + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7f53dcb26ad3..f4a5b51aed22 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6892,3 +6892,91 @@ nodefer: __kfree_skb(skb); if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } + +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, + size_t offset, size_t len) +{ + const char *kaddr; + __wsum csum; + + kaddr = kmap_local_page(page); + csum = csum_partial(kaddr + offset, len, 0); + kunmap_local(kaddr); + skb->csum = csum_block_add(skb->csum, csum, skb->len); +} + +/** + * skb_splice_from_iter - Splice (or copy) pages to skbuff + * @skb: The buffer to add pages to + * @iter: Iterator representing the pages to be added + * @maxsize: Maximum amount of pages to be added + * @gfp: Allocation flags + * + * This is a common helper function for supporting MSG_SPLICE_PAGES. It + * extracts pages from an iterator and adds them to the socket buffer if + * possible, copying them to fragments if not possible (such as if they're slab + * pages). + * + * Returns the amount of data spliced/copied or -EMSGSIZE if there's + * insufficient space in the buffer to transfer anything. + */ +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, + ssize_t maxsize, gfp_t gfp) +{ + size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + struct page *pages[8], **ppages = pages; + ssize_t spliced = 0, ret = 0; + unsigned int i; + + while (iter->count > 0) { + ssize_t space, nr; + size_t off, len; + + ret = -EMSGSIZE; + space = frag_limit - skb_shinfo(skb)->nr_frags; + if (space < 0) + break; + + /* We might be able to coalesce without increasing nr_frags */ + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); + + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); + if (len <= 0) { + ret = len ?: -EIO; + break; + } + + i = 0; + do { + struct page *page = pages[i++]; + size_t part = min_t(size_t, PAGE_SIZE - off, len); + + ret = -EIO; + if (WARN_ON_ONCE(!sendpage_ok(page))) + goto out; + + ret = skb_append_pagefrags(skb, page, off, part, + frag_limit); + if (ret < 0) { + iov_iter_revert(iter, len); + goto out; + } + + if (skb->ip_summed == CHECKSUM_NONE) + skb_splice_csum_page(skb, page, off, part); + + off = 0; + spliced += part; + maxsize -= part; + len -= part; + } while (len > 0); + + if (maxsize <= 0) + break; + } + +out: + skb_len_add(skb, spliced); + return spliced ?: ret; +} +EXPORT_SYMBOL(skb_splice_from_iter); -- cgit v1.2.3 From 8a02fb71d7192ff1a9a47c9d937624966c6e09af Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 22 May 2023 17:30:20 +0200 Subject: net: fix skb leak in __skb_tstamp_tx() Commit 50749f2dd685 ("tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp.") added a call to skb_orphan_frags_rx() to fix leaks with zerocopy skbs. But it ended up adding a leak of its own. When skb_orphan_frags_rx() fails, the function just returns, leaking the skb it just cloned. Free it before returning. This bug was discovered and resolved using Coverity Static Analysis Security Testing (SAST) by Synopsys, Inc. Fixes: 50749f2dd685 ("tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp.") Signed-off-by: Pratyush Yadav Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230522153020.32422-1-ptyadav@amazon.de Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 515ec5cdc79c..cea28d30abb5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5224,8 +5224,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, } else { skb = skb_clone(orig_skb, GFP_ATOMIC); - if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); return; + } } if (!skb) return; -- cgit v1.2.3 From ef1bc119ceb52d22a83f72b8dfce1dd64a3cca05 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 May 2023 16:39:15 +0300 Subject: net: fix signedness bug in skb_splice_from_iter() The "len" variable needs to be signed for the error handling to work correctly. Fixes: 2e910b95329c ("net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES") Signed-off-by: Dan Carpenter Reviewed-by: David Howells Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/366861a7-87c8-4bbf-9101-69dd41021d07@kili.mountain Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 653abd8a6975..7c4338221b17 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6931,8 +6931,8 @@ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, unsigned int i; while (iter->count > 0) { - ssize_t space, nr; - size_t off, len; + ssize_t space, nr, len; + size_t off; ret = -EMSGSIZE; space = frag_limit - skb_shinfo(skb)->nr_frags; -- cgit v1.2.3 From d457a0e329b0bfd3a1450e0b1a18cd2b47a25a08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Jun 2023 19:17:37 +0000 Subject: net: move gso declarations and functions to their own files Move declarations into include/net/gso.h and code into net/core/gso.c Signed-off-by: Eric Dumazet Cc: Stanislav Fomichev Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230608191738.3947077-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 1 + drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 1 + drivers/net/ethernet/sfc/siena/tx_common.c | 1 + drivers/net/ethernet/sfc/tx_common.c | 1 + drivers/net/tap.c | 1 + drivers/net/usb/r8152.c | 1 + drivers/net/wireguard/device.c | 1 + drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 1 + include/linux/netdevice.h | 26 +-- include/linux/skbuff.h | 71 ------ include/net/gro.h | 1 + include/net/gso.h | 109 +++++++++ include/net/udp.h | 1 + net/core/Makefile | 2 +- net/core/dev.c | 70 +----- net/core/gro.c | 59 +---- net/core/gso.c | 273 +++++++++++++++++++++++ net/core/skbuff.c | 142 +----------- net/ipv4/af_inet.c | 1 + net/ipv4/esp4_offload.c | 1 + net/ipv4/gre_offload.c | 1 + net/ipv4/ip_output.c | 1 + net/ipv4/tcp_offload.c | 1 + net/ipv4/udp.c | 1 + net/ipv4/udp_offload.c | 1 + net/ipv6/esp6_offload.c | 1 + net/ipv6/ip6_offload.c | 1 + net/ipv6/ip6_output.c | 1 + net/ipv6/udp_offload.c | 1 + net/mac80211/tx.c | 1 + net/mpls/af_mpls.c | 1 + net/mpls/mpls_gso.c | 1 + net/netfilter/nf_flow_table_ip.c | 1 + net/netfilter/nfnetlink_queue.c | 1 + net/nsh/nsh.c | 1 + net/openvswitch/actions.c | 1 + net/openvswitch/datapath.c | 1 + net/sched/act_police.c | 1 + net/sched/sch_cake.c | 1 + net/sched/sch_netem.c | 1 + net/sched/sch_taprio.c | 1 + net/sched/sch_tbf.c | 1 + net/sctp/offload.c | 1 + net/xfrm/xfrm_device.c | 1 + net/xfrm/xfrm_interface_core.c | 1 + net/xfrm/xfrm_output.c | 1 + 46 files changed, 425 insertions(+), 365 deletions(-) create mode 100644 include/net/gso.h create mode 100644 net/core/gso.c (limited to 'net/core/skbuff.c') diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 58747292521d..5e68a6a4b2af 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -57,6 +57,7 @@ #include #include +#include #include #include diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index c5687d94ea88..7b7e1c5b00f4 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/sfc/siena/tx_common.c b/drivers/net/ethernet/sfc/siena/tx_common.c index 93a32d61944f..a7a9ab304e13 100644 --- a/drivers/net/ethernet/sfc/siena/tx_common.c +++ b/drivers/net/ethernet/sfc/siena/tx_common.c @@ -12,6 +12,7 @@ #include "efx.h" #include "nic_common.h" #include "tx_common.h" +#include static unsigned int efx_tx_cb_page_count(struct efx_tx_queue *tx_queue) { diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 67e789b96c43..4ce7d00e697d 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -12,6 +12,7 @@ #include "efx.h" #include "nic_common.h" #include "tx_common.h" +#include static unsigned int efx_tx_cb_page_count(struct efx_tx_queue *tx_queue) { diff --git a/drivers/net/tap.c b/drivers/net/tap.c index d30d730ed5a7..9137fb8c1c42 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 0999a58ca9d2..0738baa5b82e 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Information for net-next */ #define NETNEXT_VERSION "12" diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index d58e9f818d3b..258dcc103921 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 5fa6f98b8e55..ef0f53b3b89f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2f0c6002a84..2d6cb2bf2f05 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4827,13 +4827,6 @@ int skb_crc32c_csum_help(struct sk_buff *skb); int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features); -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path); -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, - netdev_features_t features, __be16 type); -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features); - struct netdev_bonding_info { ifslave slave; ifbond master; @@ -4856,11 +4849,6 @@ static inline void ethtool_notify(struct net_device *dev, unsigned int cmd, } #endif -static inline -struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - return __skb_gso_segment(skb, features, true); -} __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, @@ -4987,6 +4975,7 @@ netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); netdev_features_t netif_skb_features(struct sk_buff *skb); +void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { @@ -5035,19 +5024,6 @@ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); -static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, - int pulled_hlen, u16 mac_offset, - int mac_len) -{ - skb->protocol = protocol; - skb->encapsulation = 1; - skb_push(skb, pulled_hlen); - skb_reset_transport_header(skb); - skb->mac_header = mac_offset; - skb->network_header = skb->mac_header + mac_len; - skb->mac_len = mac_len; -} - static inline bool netif_is_macsec(const struct net_device *dev) { return dev->priv_flags & IFF_MACSEC; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e2f48ddb2f7c..91ed66952580 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3974,8 +3974,6 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); -bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); -bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); @@ -4841,75 +4839,6 @@ static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) #endif } -/* Keeps track of mac header offset relative to skb->head. - * It is useful for TSO of Tunneling protocol. e.g. GRE. - * For non-tunnel skb it points to skb_mac_header() and for - * tunnel skb it points to outer mac header. - * Keeps track of level of encapsulation of network headers. - */ -struct skb_gso_cb { - union { - int mac_offset; - int data_offset; - }; - int encap_level; - __wsum csum; - __u16 csum_start; -}; -#define SKB_GSO_CB_OFFSET 32 -#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) - -static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) -{ - return (skb_mac_header(inner_skb) - inner_skb->head) - - SKB_GSO_CB(inner_skb)->mac_offset; -} - -static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) -{ - int new_headroom, headroom; - int ret; - - headroom = skb_headroom(skb); - ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); - if (ret) - return ret; - - new_headroom = skb_headroom(skb); - SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); - return 0; -} - -static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) -{ - /* Do not update partial checksums if remote checksum is enabled. */ - if (skb->remcsum_offload) - return; - - SKB_GSO_CB(skb)->csum = res; - SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; -} - -/* Compute the checksum for a gso segment. First compute the checksum value - * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and - * then add in skb->csum (checksum from csum_start to end of packet). - * skb->csum and csum_start are then updated to reflect the checksum of the - * resultant packet starting from the transport header-- the resultant checksum - * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo - * header. - */ -static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) -{ - unsigned char *csum_start = skb_transport_header(skb); - int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; - __wsum partial = SKB_GSO_CB(skb)->csum; - - SKB_GSO_CB(skb)->csum = res; - SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; - - return csum_fold(csum_partial(csum_start, plen, partial)); -} - static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/include/net/gro.h b/include/net/gro.h index 7b47dd6ce94f..75efa6fb8441 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -452,5 +452,6 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, gro_normal_list(napi); } +extern struct list_head offload_base; #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/gso.h b/include/net/gso.h new file mode 100644 index 000000000000..29975440cad5 --- /dev/null +++ b/include/net/gso.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_GSO_H +#define _NET_GSO_H + +#include + +/* Keeps track of mac header offset relative to skb->head. + * It is useful for TSO of Tunneling protocol. e.g. GRE. + * For non-tunnel skb it points to skb_mac_header() and for + * tunnel skb it points to outer mac header. + * Keeps track of level of encapsulation of network headers. + */ +struct skb_gso_cb { + union { + int mac_offset; + int data_offset; + }; + int encap_level; + __wsum csum; + __u16 csum_start; +}; +#define SKB_GSO_CB_OFFSET 32 +#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) + +static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) +{ + return (skb_mac_header(inner_skb) - inner_skb->head) - + SKB_GSO_CB(inner_skb)->mac_offset; +} + +static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) +{ + int new_headroom, headroom; + int ret; + + headroom = skb_headroom(skb); + ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); + if (ret) + return ret; + + new_headroom = skb_headroom(skb); + SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); + return 0; +} + +static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) +{ + /* Do not update partial checksums if remote checksum is enabled. */ + if (skb->remcsum_offload) + return; + + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; +} + +/* Compute the checksum for a gso segment. First compute the checksum value + * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and + * then add in skb->csum (checksum from csum_start to end of packet). + * skb->csum and csum_start are then updated to reflect the checksum of the + * resultant packet starting from the transport header-- the resultant checksum + * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo + * header. + */ +static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) +{ + unsigned char *csum_start = skb_transport_header(skb); + int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; + __wsum partial = SKB_GSO_CB(skb)->csum; + + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; + + return csum_fold(csum_partial(csum_start, plen, partial)); +} + +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path); + +static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + return __skb_gso_segment(skb, features, true); +} + +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); + +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features); + +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); + +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); + +static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, + int pulled_hlen, u16 mac_offset, + int mac_len) +{ + skb->protocol = protocol; + skb->encapsulation = 1; + skb_push(skb, pulled_hlen); + skb_reset_transport_header(skb); + skb->mac_header = mac_offset; + skb->network_header = skb->mac_header + mac_len; + skb->mac_len = mac_len; +} + +#endif /* _NET_GSO_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 4ed0b47c5582..e01340a27155 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/Makefile b/net/core/Makefile index 8f367813bc68..731db2eaa610 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o gro.o \ - netdev-genl.o netdev-genl-gen.o + netdev-genl.o netdev-genl-gen.o gso.o obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o diff --git a/net/core/dev.c b/net/core/dev.c index 6d6f8a7fe6b4..c2456b3667fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3209,7 +3209,7 @@ static u16 skb_tx_hash(const struct net_device *dev, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -static void skb_warn_bad_offload(const struct sk_buff *skb) +void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; @@ -3338,74 +3338,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return vlan_get_protocol_and_depth(skb, type, depth); } -/* openvswitch calls this on rx path, so we need a different check. - */ -static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) -{ - if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_UNNECESSARY; - - return skb->ip_summed == CHECKSUM_NONE; -} - -/** - * __skb_gso_segment - Perform segmentation on skb. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @tx_path: whether it is called in TX path - * - * This function segments the given skb and returns a list of segments. - * - * It may return NULL if the skb requires no segmentation. This is - * only possible when GSO is used for verifying header integrity. - * - * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. - */ -struct sk_buff *__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, bool tx_path) -{ - struct sk_buff *segs; - - if (unlikely(skb_needs_check(skb, tx_path))) { - int err; - - /* We're going to init ->check field in TCP or UDP header */ - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - } - - /* Only report GSO partial support if it will enable us to - * support segmentation on this frame without needing additional - * work. - */ - if (features & NETIF_F_GSO_PARTIAL) { - netdev_features_t partial_features = NETIF_F_GSO_ROBUST; - struct net_device *dev = skb->dev; - - partial_features |= dev->features & dev->gso_partial_features; - if (!skb_gso_ok(skb, features | partial_features)) - features &= ~NETIF_F_GSO_PARTIAL; - } - - BUILD_BUG_ON(SKB_GSO_CB_OFFSET + - sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); - - SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); - SKB_GSO_CB(skb)->encap_level = 0; - - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - segs = skb_mac_gso_segment(skb, features); - - if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) - skb_warn_bad_offload(skb); - - return segs; -} -EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG diff --git a/net/core/gro.c b/net/core/gro.c index 4d45f78e2fac..dca800068e41 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -10,7 +10,7 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ int gro_normal_batch __read_mostly = 8; @@ -92,63 +92,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/** - * skb_eth_gso_segment - segmentation handler for ethernet protocols. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - * @type: Ethernet Protocol ID - */ -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, - netdev_features_t features, __be16 type) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - return segs; -} -EXPORT_SYMBOL(skb_eth_gso_segment); - -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { diff --git a/net/core/gso.c b/net/core/gso.c new file mode 100644 index 000000000000..9e1803bfc9c6 --- /dev/null +++ b/net/core/gso.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); +/* openvswitch calls this on rx path, so we need a different check. + */ +static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; + + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } else if (unlikely(skb_is_gso_sctp(skb))) { + thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); + } + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + + if (shinfo->gso_size != GSO_BY_FRAGS) + return seg_len <= max_len; + + /* Undo this so we can re-use header sizes */ + seg_len -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (seg_len + skb_headlen(iter) > max_len) + return false; + } + + return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7c4338221b17..fee2b1c105fe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -5766,147 +5767,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) } EXPORT_SYMBOL_GPL(skb_scrub_packet); -/** - * skb_gso_transport_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_transport_seglen is used to determine the real size of the - * individual segments, including Layer4 headers (TCP/UDP). - * - * The MAC/L2 or network (IP, IPv6) headers are not accounted for. - */ -static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) -{ - const struct skb_shared_info *shinfo = skb_shinfo(skb); - unsigned int thlen = 0; - - if (skb->encapsulation) { - thlen = skb_inner_transport_header(skb) - - skb_transport_header(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - thlen += inner_tcp_hdrlen(skb); - } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { - thlen = tcp_hdrlen(skb); - } else if (unlikely(skb_is_gso_sctp(skb))) { - thlen = sizeof(struct sctphdr); - } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { - thlen = sizeof(struct udphdr); - } - /* UFO sets gso_size to the size of the fragmentation - * payload, i.e. the size of the L4 (UDP) header is already - * accounted for. - */ - return thlen + shinfo->gso_size; -} - -/** - * skb_gso_network_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_network_seglen is used to determine the real size of the - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). - * - * The MAC/L2 header is not accounted for. - */ -static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - - skb_network_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_mac_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_mac_seglen is used to determine the real size of the - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 - * headers (TCP/UDP). - */ -static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS - * - * There are a couple of instances where we have a GSO skb, and we - * want to determine what size it would be after it is segmented. - * - * We might want to check: - * - L3+L4+payload size (e.g. IP forwarding) - * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) - * - * This is a helper to do that correctly considering GSO_BY_FRAGS. - * - * @skb: GSO skb - * - * @seg_len: The segmented length (from skb_gso_*_seglen). In the - * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. - * - * @max_len: The maximum permissible length. - * - * Returns true if the segmented length <= max length. - */ -static inline bool skb_gso_size_check(const struct sk_buff *skb, - unsigned int seg_len, - unsigned int max_len) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); - const struct sk_buff *iter; - - if (shinfo->gso_size != GSO_BY_FRAGS) - return seg_len <= max_len; - - /* Undo this so we can re-use header sizes */ - seg_len -= GSO_BY_FRAGS; - - skb_walk_frags(skb, iter) { - if (seg_len + skb_headlen(iter) > max_len) - return false; - } - - return true; -} - -/** - * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? - * - * @skb: GSO skb - * @mtu: MTU to validate against - * - * skb_gso_validate_network_len validates if a given skb will fit a - * wanted MTU once split. It considers L3 headers, L4 headers, and the - * payload. - */ -bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) -{ - return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); - -/** - * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? - * - * @skb: GSO skb - * @len: length to validate against - * - * skb_gso_validate_mac_len validates if a given skb will fit a wanted - * length once split, including L2, L3 and L4 headers and the payload. - */ -bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) -{ - return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); -} -EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); - static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { int mac_len, meta_len; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fd233c4195ac..0e16ac8282c5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 3969fa805679..12c5fb3c6e1e 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 2b9cb5398335..311e70bfce40 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@ #include #include #include +#include static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 244fb9365d87..457598dfa128 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 05b38f58b404..8311c38267b5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index df5e407286d7..7e0542c10471 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 1f01e15ca24f..75aa4de5b731 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 75c02992c520..b33c7de5bdbc 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 00dc2e3b0184..d6314287338d 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "ip6_offload.h" diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c722cb881b2d..c06ff7519f19 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index c39c1e32f980..ad3b8726873e 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -14,6 +14,7 @@ #include #include "ip6_offload.h" #include +#include static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 148a0e2aa740..cfbe4beb8f1c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index dc5165d3eec4..bf6e81d56263 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 1482259de9b5..533d082f0701 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index d248763917ad..d885d34edfe1 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index e311462f6d98..556bc902af00 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 0f23e5e8e03e..f4a38bd6a7e0 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a8cf9a88758e..8074ea00d577 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 58f530f60172..a6d2a0b1aa21 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 2e9dce03d1ec..f3121c5a85e9 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 891e007d5c0b..9cff99558694 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 6ef3021e1169..0c9e93d66c50 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 5076da103f63..4a4e6ff894c1 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 277ad11f4d61..17d2d00ddb18 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/offload.c b/net/sctp/offload.c index eb874e3c399a..502095173d88 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -22,6 +22,7 @@ #include #include #include +#include static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 408f5e55744e..533697e2488f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..0ee864a76579 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 369e5de8558f..662c83beb345 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From c729ed6f5be57b4d164dbbc415554c066e29306b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2023 23:54:59 +0100 Subject: net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock() Use sendmsg() with MSG_SPLICE_PAGES rather than sendpage in skb_send_sock(). This causes pages to be spliced from the source iterator if possible. This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction. Note that this could perhaps be improved to fill out a bvec array with all the frags and then make a single sendmsg call, possibly sticking the header on the front also. Signed-off-by: David Howells cc: Jens Axboe cc: Matthew Wilcox Link: https://lore.kernel.org/r/20230623225513.2732256-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fee2b1c105fe..6c5915efbc17 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2989,32 +2989,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size) +static int sendmsg_locked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; + size_t size = msg_data_left(msg); if (!sock) return -EINVAL; - return kernel_sendmsg(sock, msg, vec, num, size); + + if (!sock->ops->sendmsg_locked) + return sock_no_sendmsg_locked(sk, msg, size); + + return sock->ops->sendmsg_locked(sk, msg, size); } -static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) { struct socket *sock = sk->sk_socket; if (!sock) return -EINVAL; - return kernel_sendpage(sock, page, offset, size, flags); + return sock_sendmsg(sock, msg); } -typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size); -typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, - size_t size, int flags); +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg, sendpage_func sendpage) + int len, sendmsg_func sendmsg) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3034,8 +3034,9 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, - sendmsg_unlocked, sk, &msg, &kv, 1, slen); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -3066,11 +3067,18 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, - sendpage_unlocked, sk, - skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + }; + + bvec_set_page(&bvec, skb_frag_page(frag), slen, + skb_frag_off(frag) + offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, + slen); + + ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, + sendmsg_unlocked, sk, &msg); if (ret <= 0) goto error; @@ -3107,16 +3115,14 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, - kernel_sendpage_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, - sendpage_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); } /** -- cgit v1.2.3 From c329b261afe71197d9da83c1f18eb45a7e97e089 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 7 Jul 2023 10:11:10 +0200 Subject: net: prevent skb corruption on frag list segmentation Ian reported several skb corruptions triggered by rx-gro-list, collecting different oops alike: [ 62.624003] BUG: kernel NULL pointer dereference, address: 00000000000000c0 [ 62.631083] #PF: supervisor read access in kernel mode [ 62.636312] #PF: error_code(0x0000) - not-present page [ 62.641541] PGD 0 P4D 0 [ 62.644174] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 62.648629] CPU: 1 PID: 913 Comm: napi/eno2-79 Not tainted 6.4.0 #364 [ 62.655162] Hardware name: Supermicro Super Server/A2SDi-12C-HLN4F, BIOS 1.7a 10/13/2022 [ 62.663344] RIP: 0010:__udp_gso_segment (./include/linux/skbuff.h:2858 ./include/linux/udp.h:23 net/ipv4/udp_offload.c:228 net/ipv4/udp_offload.c:261 net/ipv4/udp_offload.c:277) [ 62.687193] RSP: 0018:ffffbd3a83b4f868 EFLAGS: 00010246 [ 62.692515] RAX: 00000000000000ce RBX: 0000000000000000 RCX: 0000000000000000 [ 62.699743] RDX: ffffa124def8a000 RSI: 0000000000000079 RDI: ffffa125952a14d4 [ 62.706970] RBP: ffffa124def8a000 R08: 0000000000000022 R09: 00002000001558c9 [ 62.714199] R10: 0000000000000000 R11: 00000000be554639 R12: 00000000000000e2 [ 62.721426] R13: ffffa125952a1400 R14: ffffa125952a1400 R15: 00002000001558c9 [ 62.728654] FS: 0000000000000000(0000) GS:ffffa127efa40000(0000) knlGS:0000000000000000 [ 62.736852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 62.742702] CR2: 00000000000000c0 CR3: 00000001034b0000 CR4: 00000000003526e0 [ 62.749948] Call Trace: [ 62.752498] [ 62.779267] inet_gso_segment (net/ipv4/af_inet.c:1398) [ 62.787605] skb_mac_gso_segment (net/core/gro.c:141) [ 62.791906] __skb_gso_segment (net/core/dev.c:3403 (discriminator 2)) [ 62.800492] validate_xmit_skb (./include/linux/netdevice.h:4862 net/core/dev.c:3659) [ 62.804695] validate_xmit_skb_list (net/core/dev.c:3710) [ 62.809158] sch_direct_xmit (net/sched/sch_generic.c:330) [ 62.813198] __dev_queue_xmit (net/core/dev.c:3805 net/core/dev.c:4210) net/netfilter/core.c:626) [ 62.821093] br_dev_queue_push_xmit (net/bridge/br_forward.c:55) [ 62.825652] maybe_deliver (net/bridge/br_forward.c:193) [ 62.829420] br_flood (net/bridge/br_forward.c:233) [ 62.832758] br_handle_frame_finish (net/bridge/br_input.c:215) [ 62.837403] br_handle_frame (net/bridge/br_input.c:298 net/bridge/br_input.c:416) [ 62.851417] __netif_receive_skb_core.constprop.0 (net/core/dev.c:5387) [ 62.866114] __netif_receive_skb_list_core (net/core/dev.c:5570) [ 62.871367] netif_receive_skb_list_internal (net/core/dev.c:5638 net/core/dev.c:5727) [ 62.876795] napi_complete_done (./include/linux/list.h:37 ./include/net/gro.h:434 ./include/net/gro.h:429 net/core/dev.c:6067) [ 62.881004] ixgbe_poll (drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:3191) [ 62.893534] __napi_poll (net/core/dev.c:6498) [ 62.897133] napi_threaded_poll (./include/linux/netpoll.h:89 net/core/dev.c:6640) [ 62.905276] kthread (kernel/kthread.c:379) [ 62.913435] ret_from_fork (arch/x86/entry/entry_64.S:314) [ 62.917119] In the critical scenario, rx-gro-list GRO-ed packets are fed, via a bridge, both to the local input path and to an egress device (tun). The segmentation of such packets unsafely writes to the cloned skbs with shared heads. This change addresses the issue by uncloning as needed the to-be-segmented skbs. Reported-by: Ian Kumlien Tested-by: Ian Kumlien Fixes: 3a1296a38d0c ("net: Support GRO/GSO fraglist chaining.") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c5915efbc17..a298992060e6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4261,6 +4261,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(skb, -skb_network_offset(skb) + offset); + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + skb_shinfo(skb)->frag_list = NULL; while (list_skb) { -- cgit v1.2.3