| author    | Jakub Kicinski <kuba@kernel.org> | 2026-04-12 20:54:35 +0300 |
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-04-12 20:54:35 +0300 |
| commit    | 05ed6c221e595dce22a3863a97d89088db1c28ee (patch) |
| tree      | 68090b21b0b675d090755436bf9326c9c3da0184 |
| parent    | 006679268a2942f897a1d601779867a8dcbb8ed0 (diff) |
| parent    | 5d3b12d1a24b72e147fbb585158f51585593f640 (diff) |
| download  | linux-05ed6c221e595dce22a3863a97d89088db1c28ee.tar.xz |
Merge branch 'add-tso-map-once-dma-helpers-and-bnxt-sw-uso-support'
Joe Damato says:
====================
Add TSO map-once DMA helpers and bnxt SW USO support
Greetings:
This series extends net/tso with a data structure and helpers that allow
drivers to DMA map headers and packet payloads a single time. The helpers
can then be used to reference slices of the shared mapping for each
segment. This avoids the cost of repeated DMA mappings, especially on
systems that use an IOMMU: N per-packet DMA maps are replaced with a single
map for the entire GSO skb. As of v3, the series uses the DMA IOVA API (as
suggested by Leon [1]) and provides a fallback path when an IOMMU is not in
use. The DMA IOVA API is even more efficient than the v2 approach; see the
numbers below.
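A minimal sketch of the intended xmit-side flow using the helpers added by
this series (the function name and the descriptor-posting step are
hypothetical, and the completion state would live on the driver's SW TX
ring in practice; the real user is bnxt_sw_udp_gso_xmit() in bnxt_gso.c):

```c
#include <linux/device.h>
#include <linux/skbuff.h>
#include <net/tso.h>

static int example_map_once_xmit(struct device *dev, struct sk_buff *skb)
{
        struct tso_dma_map_completion_state cstate;
        unsigned int hdr_len, payload;
        struct tso_dma_map map;
        struct tso_t tso;

        hdr_len = tso_start(skb, &tso);
        payload = skb->len - hdr_len;

        /* One DMA map covers the whole GSO payload: the DMA IOVA API when
         * an IOMMU is in use, per-region dma_map_phys() otherwise.
         */
        if (tso_dma_map_init(&map, dev, skb, hdr_len))
                return -ENOMEM;

        while (payload) {
                unsigned int seg, chunk, mapping_len;
                dma_addr_t addr;

                seg = min_t(unsigned int, payload,
                            skb_shinfo(skb)->gso_size);
                payload -= seg;

                /* Each segment references a slice of the shared mapping. */
                while (tso_dma_map_next(&map, &addr, &chunk, &mapping_len,
                                        seg)) {
                        /* post a TX descriptor for (addr, chunk) here */
                        seg -= chunk;
                }
        }

        /* Save what the completion handler needs; tso_dma_map_complete()
         * (or tso_dma_map_cleanup() on error) tears the mapping down.
         * A real driver stores cstate on its SW ring, as bnxt does.
         */
        tso_dma_map_completion_save(&map, &cstate);
        return 0;
}
```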
The added helpers are then used in bnxt to add software UDP segmentation
offload (SW USO) support for older bnxt devices that lack USO support in
hardware. Since the helpers are generic, other drivers can be extended
similarly.
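Condensed from the bnxt_start_xmit() hunk in this series, the dispatch to
the software path happens only when the device has no hardware USO:

```c
/* In bnxt_start_xmit(): hand UDP GSO skbs to the SW path when the
 * device lacks hardware USO support (condensed from this series).
 */
if (skb_is_gso(skb) &&
    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) &&
    !(bp->flags & BNXT_FLAG_UDP_GSO_CAP))
        return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb);
```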
The v2 showed a ~4x reduction in DMA mapping calls at the same wire packet
rate on production traffic with a bnxt device. The v3 shows a larger
reduction of roughly 6x at the same wire packet rate, thanks to Leon's
suggestion of using the DMA IOVA API [1].
Special care is taken to make bnxt ethtool operations work correctly: the
ring size cannot be reduced below a minimum threshold while SW USO is
enabled, and growing the ring automatically re-enables USO if it was
previously blocked.
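The relevant checks, condensed from the bnxt_fix_features() and
bnxt_set_ringparam() hunks in this series:

```c
/* bnxt_fix_features(): mask SW USO off while the TX ring is too small. */
if ((features & NETIF_F_GSO_UDP_L4) &&
    !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
    bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS)
        features &= ~NETIF_F_GSO_UDP_L4;

/* bnxt_set_ringparam(): refuse to shrink below the threshold while SW USO
 * is enabled; after any resize the driver calls netdev_update_features(),
 * so growing the ring re-enables USO if it had been blocked.
 */
if ((dev->features & NETIF_F_GSO_UDP_L4) &&
    !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
    ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS)
        return -EINVAL;
```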
This v10 contains some cosmetic changes (wrapping long lines), moves the test
to the correct directory, and attempts to fix the slot availability check
added in the v9.
I re-ran the Python test on my bnxt system and it passed. I also ran this
series on a production system.
====================
Link: https://patch.msgid.link/20260408230607.2019402-1-joe@dama.to
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/Makefile | 2 |
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt.c | 183 |
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt.h | 32 |
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 |
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c | 240 |
| -rw-r--r-- | drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h | 46 |
| -rw-r--r-- | include/linux/skbuff.h | 11 |
| -rw-r--r-- | include/net/tso.h | 100 |
| -rw-r--r-- | net/core/tso.c | 269 |
| -rw-r--r-- | tools/testing/selftests/drivers/net/hw/Makefile | 1 |
| -rwxr-xr-x | tools/testing/selftests/drivers/net/hw/uso.py | 103 |

11 files changed, 967 insertions(+), 39 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index ba6c239d52fa..debef78c8b6d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o +bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o bnxt_gso.o bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fe8b886ff82e..2715632115a5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -74,6 +74,8 @@ #include "bnxt_debugfs.h" #include "bnxt_coredump.h" #include "bnxt_hwmon.h" +#include "bnxt_gso.h" +#include <net/tso.h> #define BNXT_TX_TIMEOUT (5 * HZ) #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \ @@ -447,7 +449,7 @@ const u16 bnxt_lhint_arr[] = { TX_BD_FLAGS_LHINT_2048_AND_LARGER, }; -static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); @@ -506,6 +508,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) } } #endif + if (skb_is_gso(skb) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) + return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb); + free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { /* We must have raced with NAPI cleanup */ @@ -656,6 +663,7 @@ normal_tx: goto tx_free; dma_unmap_addr_set(tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | TX_BD_CNT(last_frag + 2); @@ -663,10 +671,9 @@ normal_tx: txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag); prod = NEXT_TX(prod); - txbd1 = (struct tx_bd_ext *) - &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1 = bnxt_init_ext_bd(bp, txr, prod, lflags, vlan_tag_flags, + cfa_action); - txbd1->tx_bd_hsize_lflags = lflags; if (skb_is_gso(skb)) { bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4); u32 hdr_len; @@ -693,7 +700,6 @@ normal_tx: } else if (skb->ip_summed == CHECKSUM_PARTIAL) { txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); - txbd1->tx_bd_mss = 0; } length >>= 9; @@ -706,9 +712,6 @@ normal_tx: flags |= bnxt_lhint_arr[length]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); - txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); - txbd1->tx_bd_cfa_action = - cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); txbd0 = txbd; for (i = 0; i < last_frag; i++) { frag = &skb_shinfo(skb)->frags[i]; @@ -725,6 +728,7 @@ normal_tx: tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; netmem_dma_unmap_addr_set(skb_frag_netmem(frag), tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); txbd->tx_bd_haddr = cpu_to_le64(mapping); @@ -814,17 +818,19 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 hw_cons = txr->tx_hw_cons; unsigned int tx_bytes = 0; u16 cons = txr->tx_cons; - skb_frag_t *frag; + unsigned int 
dma_len; + dma_addr_t dma_addr; int tx_pkts = 0; bool rc = false; while (RING_TX(bp, cons) != hw_cons) { - struct bnxt_sw_tx_bd *tx_buf; + struct bnxt_sw_tx_bd *tx_buf, *head_buf; struct sk_buff *skb; bool is_ts_pkt; int j, last; tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; + head_buf = tx_buf; skb = tx_buf->skb; if (unlikely(!skb)) { @@ -849,20 +855,44 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, goto next_tx_int; } - dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } + last = tx_buf->nr_frags; for (j = 0; j < last; j++) { - frag = &skb_shinfo(skb)->frags[j]; cons = NEXT_TX(cons); tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } + + if (unlikely(head_buf->is_sw_gso)) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { + tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + tx_pkts--; + tx_bytes -= skb->len; + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (unlikely(is_ts_pkt)) { if (BNXT_CHIP_P5(bp)) { /* PTP worker takes ownership of the skb */ @@ -3399,19 +3429,23 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, { int i, max_idx; struct pci_dev *pdev = bp->pdev; + unsigned int dma_len; + dma_addr_t dma_addr; max_idx = bp->tx_nr_pages * TX_DESC_CNT; for (i = 0; i < max_idx;) { struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i]; + struct bnxt_sw_tx_bd *head_buf = tx_buf; struct sk_buff *skb; int j, last; if (idx < bp->tx_nr_rings_xdp && tx_buf->action == XDP_REDIRECT) { - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - dma_unmap_len(tx_buf, len), + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, DMA_TO_DEVICE); xdp_return_frame(tx_buf->xdpf); tx_buf->action = 0; @@ -3434,25 +3468,43 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, continue; } - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), - DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } last = tx_buf->nr_frags; i += 2; for (j = 0; j < last; j++, i++) { int ring_idx = i & bp->tx_ring_mask; - skb_frag_t *frag = &skb_shinfo(skb)->frags[j]; tx_buf = &txr->tx_buf_ring[ring_idx]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } - dev_kfree_skb(skb); + if (head_buf->is_sw_gso) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { 
+ tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (skb) + dev_kfree_skb(skb); } netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx)); } @@ -3965,6 +4017,39 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) return rc; } +static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev) +{ + if (!txr->tx_inline_buf) + return; + + dma_unmap_single(&pdev->dev, txr->tx_inline_dma, + txr->tx_inline_size, DMA_TO_DEVICE); + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + txr->tx_inline_size = 0; +} + +static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev, + unsigned int size) +{ + txr->tx_inline_buf = kmalloc(size, GFP_KERNEL); + if (!txr->tx_inline_buf) + return -ENOMEM; + + txr->tx_inline_dma = dma_map_single(&pdev->dev, txr->tx_inline_buf, + size, DMA_TO_DEVICE); + if (dma_mapping_error(&pdev->dev, txr->tx_inline_dma)) { + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + return -ENOMEM; + } + txr->tx_inline_size = size; + + return 0; +} + static void bnxt_free_tx_rings(struct bnxt *bp) { int i; @@ -3983,6 +4068,8 @@ static void bnxt_free_tx_rings(struct bnxt *bp) txr->tx_push = NULL; } + bnxt_free_tx_inline_buf(txr, pdev); + ring = &txr->tx_ring_struct; bnxt_free_ring(bp, &ring->ring_mem); @@ -4048,6 +4135,13 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp) sizeof(struct tx_push_bd); txr->data_mapping = cpu_to_le64(mapping); } + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + rc = bnxt_alloc_tx_inline_buf(txr, pdev, + BNXT_SW_USO_MAX_SEGS * + TSO_HEADER_SIZE); + if (rc) + return rc; + } qidx = bp->tc_to_qidx[j]; ring->queue_id = bp->q_info[qidx].queue_id; spin_lock_init(&txr->xdp_tx_lock); @@ -4586,10 +4680,13 @@ static int bnxt_init_rx_rings(struct bnxt *bp) static int bnxt_init_tx_rings(struct bnxt *bp) { + netdev_features_t features; u16 i; + features = bp->dev->features; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, - BNXT_MIN_TX_DESC_CNT); + bnxt_min_tx_desc_cnt(bp, features)); for (i = 0; i < bp->tx_nr_rings; i++) { struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; @@ -13788,6 +13885,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false)) features &= ~NETIF_F_NTUPLE; + if ((features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS) + features &= ~NETIF_F_GSO_UDP_L4; + if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog) features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); @@ -13833,6 +13935,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features) int rc = 0; bool re_init = false; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, + bnxt_min_tx_desc_cnt(bp, features)); + flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS; if (features & NETIF_F_GRO_HW) flags |= BNXT_FLAG_GRO; @@ -16858,8 +16963,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_GRO; - if (bp->flags & BNXT_FLAG_UDP_GSO_CAP) - dev->hw_features |= NETIF_F_GSO_UDP_L4; + dev->hw_features |= NETIF_F_GSO_UDP_L4; if (BNXT_SUPPORTS_TPA(bp)) dev->hw_features |= NETIF_F_LRO; @@ -16892,8 +16996,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(dev, GSO_MAX_SIZE); 
- if (bp->tso_max_segs) + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + u16 max_segs = BNXT_SW_USO_MAX_SEGS; + + if (bp->tso_max_segs) + max_segs = min_t(u16, max_segs, bp->tso_max_segs); + netif_set_tso_max_segs(dev, max_segs); + } else if (bp->tso_max_segs) { netif_set_tso_max_segs(dev, bp->tso_max_segs); + } dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3558a36ece12..fe50576ae525 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -11,6 +11,8 @@ #ifndef BNXT_H #define BNXT_H +#include <net/tso.h> + #define DRV_MODULE_NAME "bnxt_en" /* DO NOT CHANGE DRV_VER_* defines @@ -892,14 +894,19 @@ struct bnxt_sw_tx_bd { struct page *page; u8 is_ts_pkt; u8 is_push; + u8 is_sw_gso; u8 action; unsigned short nr_frags; union { u16 rx_prod; u16 txts_prod; }; + struct tso_dma_map_completion_state sw_gso_cstate; }; +#define BNXT_SW_GSO_MID 1 +#define BNXT_SW_GSO_LAST 2 + struct bnxt_sw_rx_bd { void *data; u8 *data_ptr; @@ -996,6 +1003,12 @@ struct bnxt_tx_ring_info { dma_addr_t tx_push_mapping; __le64 data_mapping; + void *tx_inline_buf; + dma_addr_t tx_inline_dma; + unsigned int tx_inline_size; + u16 tx_inline_prod; + u16 tx_inline_cons; + #define BNXT_DEV_STATE_CLOSING 0x1 u32 dev_state; @@ -2836,6 +2849,24 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp, return bp->tx_ring_size - (used & bp->tx_ring_mask); } +static inline struct tx_bd_ext * +bnxt_init_ext_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + u16 prod, __le32 lflags, u32 vlan_tag_flags, + u32 cfa_action) +{ + struct tx_bd_ext *txbd1; + + txbd1 = (struct tx_bd_ext *) + &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1->tx_bd_hsize_lflags = lflags; + txbd1->tx_bd_mss = 0; + txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); + txbd1->tx_bd_cfa_action = + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); + + return txbd1; +} + static inline void bnxt_writeq(struct bnxt *bp, u64 val, volatile void __iomem *addr) { @@ -2969,6 +3000,7 @@ unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp); int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init); void bnxt_tx_disable(struct bnxt *bp); void bnxt_tx_enable(struct bnxt *bp); +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb); void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 curr); void bnxt_report_link(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 6826bf762d26..9ded88196bb4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -33,6 +33,7 @@ #include "bnxt_xdp.h" #include "bnxt_ptp.h" #include "bnxt_ethtool.h" +#include "bnxt_gso.h" #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ #include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ #include "bnxt_coredump.h" @@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev, u8 tcp_data_split = kernel_ering->tcp_data_split; struct bnxt *bp = netdev_priv(dev); u8 hds_config_mod; + int rc; if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; + if ((dev->features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS) + 
return -EINVAL; + hds_config_mod = tcp_data_split != dev->cfg->hds_config; if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod) return -EINVAL; @@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev, bp->tx_ring_size = ering->tx_pending; bnxt_set_ring_params(bp); - if (netif_running(dev)) - return bnxt_open_nic(bp, false, false); + if (netif_running(dev)) { + rc = bnxt_open_nic(bp, false, false); + if (rc) + return rc; + } + /* ring size changes may affect features (SW USO requires a minimum + * ring size), so recalculate features to ensure the correct features + * are blocked/available. + */ + netdev_update_features(dev); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c new file mode 100644 index 000000000000..f317f60414e8 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/netdev_queues.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/tso.h> +#include <linux/bnxt/hsi.h> + +#include "bnxt.h" +#include "bnxt_gso.h" + +static u32 bnxt_sw_gso_lhint(unsigned int len) +{ + if (len <= 512) + return TX_BD_FLAGS_LHINT_512_AND_SMALLER; + else if (len <= 1023) + return TX_BD_FLAGS_LHINT_512_TO_1023; + else if (len <= 2047) + return TX_BD_FLAGS_LHINT_1024_TO_2047; + else + return TX_BD_FLAGS_LHINT_2048_AND_LARGER; +} + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb) +{ + unsigned int last_unmap_len __maybe_unused = 0; + dma_addr_t last_unmap_addr __maybe_unused = 0; + struct bnxt_sw_tx_bd *last_unmap_buf = NULL; + unsigned int hdr_len, mss, num_segs; + struct pci_dev *pdev = bp->pdev; + unsigned int total_payload; + struct tso_dma_map map; + u32 vlan_tag_flags = 0; + int i, bds_needed; + struct tso_t tso; + u16 cfa_action; + __le32 csum; + u16 prod; + + hdr_len = tso_start(skb, &tso); + mss = skb_shinfo(skb)->gso_size; + total_payload = skb->len - hdr_len; + num_segs = DIV_ROUND_UP(total_payload, mss); + + if (unlikely(num_segs <= 1)) + goto drop; + + /* Upper bound on the number of descriptors needed. + * + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is + * at most num_segs + nr_frags (each frag boundary crossing adds at + * most 1 extra BD). + */ + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1; + + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) { + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + return NETDEV_TX_BUSY; + } + + /* BD backpressure alone cannot prevent overwriting in-flight + * headers in the inline buffer. Check slot availability directly. 
+ */ + if (!netif_txq_maybe_stop(txq, bnxt_inline_avail(txr), + num_segs, num_segs)) + return NETDEV_TX_BUSY; + + if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len))) + goto drop; + + cfa_action = bnxt_xmit_get_cfa_action(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN | + skb_vlan_tag_get(skb); + if (skb->vlan_proto == htons(ETH_P_8021Q)) + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; + } + + csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); + if (!tso.ipv6) + csum |= cpu_to_le32(TX_BD_FLAGS_IP_CKSUM); + + prod = txr->tx_prod; + + for (i = 0; i < num_segs; i++) { + unsigned int seg_payload = min_t(unsigned int, mss, + total_payload - i * mss); + u16 slot = (txr->tx_inline_prod + i) & + (BNXT_SW_USO_MAX_SEGS - 1); + struct bnxt_sw_tx_bd *tx_buf; + unsigned int mapping_len; + dma_addr_t this_hdr_dma; + unsigned int chunk_len; + unsigned int offset; + dma_addr_t dma_addr; + struct tx_bd *txbd; + struct udphdr *uh; + void *this_hdr; + int bd_count; + bool last; + u32 flags; + + last = (i == num_segs - 1); + offset = slot * TSO_HEADER_SIZE; + this_hdr = txr->tx_inline_buf + offset; + this_hdr_dma = txr->tx_inline_dma + offset; + + tso_build_hdr(skb, this_hdr, &tso, seg_payload, last); + + /* Zero stale csum fields copied from the original skb; + * HW offload recomputes from scratch. + */ + uh = this_hdr + skb_transport_offset(skb); + uh->check = 0; + if (!tso.ipv6) { + struct iphdr *iph = this_hdr + skb_network_offset(skb); + + iph->check = 0; + } + + dma_sync_single_for_device(&pdev->dev, this_hdr_dma, + hdr_len, DMA_TO_DEVICE); + + bd_count = tso_dma_map_count(&map, seg_payload); + + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + + tx_buf->skb = skb; + tx_buf->nr_frags = bd_count; + tx_buf->is_push = 0; + tx_buf->is_ts_pkt = 0; + + dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma); + dma_unmap_len_set(tx_buf, len, 0); + + if (last) { + tx_buf->is_sw_gso = BNXT_SW_GSO_LAST; + tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate); + } else { + tx_buf->is_sw_gso = BNXT_SW_GSO_MID; + } + + flags = (hdr_len << TX_BD_LEN_SHIFT) | + TX_BD_TYPE_LONG_TX_BD | + TX_BD_CNT(2 + bd_count); + + flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload); + + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma); + txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, + 2 + bd_count); + + prod = NEXT_TX(prod); + bnxt_init_ext_bd(bp, txr, prod, csum, + vlan_tag_flags, cfa_action); + + /* set dma_unmap_len on the LAST BD touching each + * region. Since completions are in-order, the last segment + * completes after all earlier ones, so the unmap is safe. 
+ */ + while (tso_dma_map_next(&map, &dma_addr, &chunk_len, + &mapping_len, seg_payload)) { + prod = NEXT_TX(prod); + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + + txbd->tx_bd_haddr = cpu_to_le64(dma_addr); + dma_unmap_addr_set(tx_buf, mapping, dma_addr); + dma_unmap_len_set(tx_buf, len, 0); + tx_buf->skb = NULL; + tx_buf->is_sw_gso = 0; + + if (mapping_len) { + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, + mapping, + last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, + len, + last_unmap_len); + } + last_unmap_addr = dma_addr; + last_unmap_len = mapping_len; + } + last_unmap_buf = tx_buf; + + flags = chunk_len << TX_BD_LEN_SHIFT; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_opaque = 0; + + seg_payload -= chunk_len; + } + + txbd->tx_bd_len_flags_type |= + cpu_to_le32(TX_BD_FLAGS_PACKET_END); + + prod = NEXT_TX(prod); + } + + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, len, last_unmap_len); + } + + txr->tx_inline_prod += num_segs; + + netdev_tx_sent_queue(txq, skb->len); + + WRITE_ONCE(txr->tx_prod, prod); + /* Sync BDs before doorbell */ + wmb(); + bnxt_db_write(bp, &txr->tx_db, prod); + + if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh)) + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + + return NETDEV_TX_OK; + +drop: + dev_kfree_skb_any(skb); + dev_core_stats_tx_dropped_inc(bp->dev); + return NETDEV_TX_OK; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h new file mode 100644 index 000000000000..47528c20f311 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_GSO_H +#define BNXT_GSO_H + +/* Maximum segments the stack may send in a single SW USO skb. + * This caps gso_max_segs for NICs without HW USO support. + */ +#define BNXT_SW_USO_MAX_SEGS 64 + +/* Worst-case TX descriptors consumed by one SW USO packet: + * Each segment: 1 long BD + 1 ext BD + payload BDs. + * Total payload BDs across all segs <= num_segs + nr_frags (each frag + * boundary crossing adds at most 1 extra BD). + * So: 3 * max_segs + MAX_SKB_FRAGS + 1 = 3 * 64 + 17 + 1 = 210. 
+ */ +#define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1) + +static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr) +{ + return BNXT_SW_USO_MAX_SEGS - + (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons)); +} + +static inline int bnxt_min_tx_desc_cnt(struct bnxt *bp, + netdev_features_t features) +{ + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + (features & NETIF_F_GSO_UDP_L4)) + return BNXT_SW_USO_MAX_DESCS; + return BNXT_MIN_TX_DESC_CNT; +} + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb); + +#endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 26fe18bcfad8..2bcf78a4de7b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3764,6 +3764,17 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) } /** + * skb_frag_phys - gets the physical address of the data in a paged fragment + * @frag: the paged fragment buffer + * + * Returns: the physical address of the data within @frag. + */ +static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) +{ + return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); +} + +/** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from diff --git a/include/net/tso.h b/include/net/tso.h index e7e157ae0526..da82aabd1d48 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -3,6 +3,7 @@ #define _TSO_H #include <linux/skbuff.h> +#include <linux/dma-mapping.h> #include <net/ip.h> #define TSO_HEADER_SIZE 256 @@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); int tso_start(struct sk_buff *skb, struct tso_t *tso); +/** + * struct tso_dma_map - DMA mapping state for GSO payload + * @dev: device used for DMA mapping + * @skb: the GSO skb being mapped + * @hdr_len: per-segment header length + * @iova_state: DMA IOVA state (when IOMMU available) + * @iova_offset: global byte offset into IOVA range (IOVA path only) + * @total_len: total payload length + * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag) + * @offset: byte offset within current region + * @linear_dma: DMA address of the linear payload + * @linear_len: length of the linear payload + * @nr_frags: number of frags successfully DMA-mapped + * @frags: per-frag DMA address and length + * + * DMA-maps the payload regions of a GSO skb (linear data + frags). + * Prefers the DMA IOVA API for a single contiguous mapping with one + * IOTLB sync; falls back to per-region dma_map_phys() otherwise. + */ +struct tso_dma_map { + struct device *dev; + const struct sk_buff *skb; + unsigned int hdr_len; + /* IOVA path */ + struct dma_iova_state iova_state; + size_t iova_offset; + size_t total_len; + /* Fallback path if IOVA path fails */ + int frag_idx; + unsigned int offset; + dma_addr_t linear_dma; + unsigned int linear_len; + unsigned int nr_frags; + struct { + dma_addr_t dma; + unsigned int len; + } frags[MAX_SKB_FRAGS]; +}; + +/** + * struct tso_dma_map_completion_state - Completion-time cleanup state + * @iova_state: DMA IOVA state (when IOMMU available) + * @total_len: total payload length of the IOVA mapping + * + * Drivers store this on their SW ring at xmit time via + * tso_dma_map_completion_save(), then call tso_dma_map_complete() at + * completion time. 
+ */ +struct tso_dma_map_completion_state { + struct dma_iova_state iova_state; + size_t total_len; +}; + +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len); +void tso_dma_map_cleanup(struct tso_dma_map *map); +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len); +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining); + +/** + * tso_dma_map_completion_save - save state needed for completion-time cleanup + * @map: the xmit-time DMA map + * @cstate: driver-owned storage that persists until completion + * + * Should be called at xmit time to update the completion state and later passed + * to tso_dma_map_complete(). + */ +static inline void +tso_dma_map_completion_save(const struct tso_dma_map *map, + struct tso_dma_map_completion_state *cstate) +{ + cstate->iova_state = map->iova_state; + cstate->total_len = map->total_len; +} + +/** + * tso_dma_map_complete - tear down mapping at completion time + * @dev: the device that owns the mapping + * @cstate: state saved by tso_dma_map_completion_save() + * + * Return: true if the IOVA path was used and the mapping has been + * destroyed; false if the fallback per-region path was used and the + * driver must unmap via its normal completion path. + */ +static inline bool +tso_dma_map_complete(struct device *dev, + struct tso_dma_map_completion_state *cstate) +{ + if (dma_use_iova(&cstate->iova_state)) { + dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len, + DMA_TO_DEVICE, 0); + return true; + } + + return false; +} + #endif /* _TSO_H */ diff --git a/net/core/tso.c b/net/core/tso.c index 6df997b9076e..347b3856ddb9 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -3,6 +3,7 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/tso.h> +#include <linux/dma-mapping.h> #include <linux/unaligned.h> void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, @@ -87,3 +88,271 @@ int tso_start(struct sk_buff *skb, struct tso_t *tso) return hdr_len; } EXPORT_SYMBOL(tso_start); + +static int tso_dma_iova_try(struct device *dev, struct tso_dma_map *map, + phys_addr_t phys, size_t linear_len, + size_t total_len, size_t *offset) +{ + const struct sk_buff *skb; + unsigned int nr_frags; + int i; + + if (!dma_iova_try_alloc(dev, &map->iova_state, phys, total_len)) + return 1; + + skb = map->skb; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (linear_len) { + if (dma_iova_link(dev, &map->iova_state, + phys, *offset, linear_len, + DMA_TO_DEVICE, 0)) + goto iova_fail; + map->linear_len = linear_len; + *offset += linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + if (dma_iova_link(dev, &map->iova_state, + skb_frag_phys(frag), *offset, + frag_len, DMA_TO_DEVICE, 0)) { + map->nr_frags = i; + goto iova_fail; + } + map->frags[i].len = frag_len; + *offset += frag_len; + map->nr_frags = i + 1; + } + + if (dma_iova_sync(dev, &map->iova_state, 0, total_len)) + goto iova_fail; + + return 0; + +iova_fail: + dma_iova_destroy(dev, &map->iova_state, *offset, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + /* reset map state */ + map->frag_idx = -1; + map->offset = 0; + map->linear_len = 0; + map->nr_frags = 0; + + return 1; +} + +/** + * tso_dma_map_init - DMA-map GSO payload regions + * @map: map struct to initialize + * @dev: 
device for DMA mapping + * @skb: the GSO skb + * @hdr_len: per-segment header length in bytes + * + * DMA-maps the linear payload (after headers) and all frags. + * Prefers the DMA IOVA API (one contiguous mapping, one IOTLB sync); + * falls back to per-region dma_map_phys() when IOVA is not available. + * Positions the iterator at byte 0 of the payload. + * + * Return: 0 on success, -ENOMEM on DMA mapping failure (partial mappings + * are cleaned up internally). + */ +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len) +{ + unsigned int linear_len = skb_headlen(skb) - hdr_len; + unsigned int nr_frags = skb_shinfo(skb)->nr_frags; + size_t total_len = skb->len - hdr_len; + size_t offset = 0; + phys_addr_t phys; + int i; + + map->dev = dev; + map->skb = skb; + map->hdr_len = hdr_len; + map->frag_idx = -1; + map->offset = 0; + map->iova_offset = 0; + map->total_len = total_len; + map->linear_len = 0; + map->nr_frags = 0; + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + if (!total_len) + return 0; + + if (linear_len) + phys = virt_to_phys(skb->data + hdr_len); + else + phys = skb_frag_phys(&skb_shinfo(skb)->frags[0]); + + if (tso_dma_iova_try(dev, map, phys, linear_len, total_len, &offset)) { + /* IOVA path failed, map state was reset. Fallback to + * per-region dma_map_phys() + */ + if (linear_len) { + map->linear_dma = dma_map_phys(dev, phys, linear_len, + DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->linear_dma)) + return -ENOMEM; + map->linear_len = linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + map->frags[i].len = frag_len; + map->frags[i].dma = dma_map_phys(dev, skb_frag_phys(frag), + frag_len, DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->frags[i].dma)) { + tso_dma_map_cleanup(map); + return -ENOMEM; + } + map->nr_frags = i + 1; + } + } + + if (linear_len == 0 && nr_frags > 0) + map->frag_idx = 0; + + return 0; +} +EXPORT_SYMBOL(tso_dma_map_init); + +/** + * tso_dma_map_cleanup - unmap all DMA regions in a tso_dma_map + * @map: the map to clean up + * + * Handles both IOVA and fallback paths. For IOVA, calls + * dma_iova_destroy(). For fallback, unmaps each region individually. + */ +void tso_dma_map_cleanup(struct tso_dma_map *map) +{ + int i; + + if (dma_use_iova(&map->iova_state)) { + dma_iova_destroy(map->dev, &map->iova_state, map->total_len, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + } else { + if (map->linear_len) + dma_unmap_phys(map->dev, map->linear_dma, + map->linear_len, DMA_TO_DEVICE, 0); + + for (i = 0; i < map->nr_frags; i++) + dma_unmap_phys(map->dev, map->frags[i].dma, + map->frags[i].len, DMA_TO_DEVICE, 0); + } + + map->linear_len = 0; + map->nr_frags = 0; +} +EXPORT_SYMBOL(tso_dma_map_cleanup); + +/** + * tso_dma_map_count - count descriptors for a payload range + * @map: the payload map + * @len: number of payload bytes in this segment + * + * Counts how many contiguous DMA region chunks the next @len bytes + * will span, without advancing the iterator. On the IOVA path this + * is always 1 (contiguous). On the fallback path, uses region sizes + * from the current position. + * + * Return: the number of descriptors needed for @len bytes of payload. 
+ */ +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len) +{ + unsigned int offset = map->offset; + int idx = map->frag_idx; + unsigned int count = 0; + + if (!len) + return 0; + + if (dma_use_iova(&map->iova_state)) + return 1; + + while (len > 0) { + unsigned int region_len, chunk; + + if (idx == -1) + region_len = map->linear_len; + else + region_len = map->frags[idx].len; + + chunk = min(len, region_len - offset); + len -= chunk; + count++; + offset = 0; + idx++; + } + + return count; +} +EXPORT_SYMBOL(tso_dma_map_count); + +/** + * tso_dma_map_next - yield the next DMA address range + * @map: the payload map + * @addr: output DMA address + * @chunk_len: output chunk length + * @mapping_len: full DMA mapping length when this chunk starts a new + * mapping region, or 0 when continuing a previous one. + * On the IOVA path this is always 0 (driver must not + * do per-region unmaps; use tso_dma_map_cleanup instead). + * @seg_remaining: bytes left in current segment + * + * Yields the next (dma_addr, chunk_len) pair and advances the iterator. + * On the IOVA path, the entire payload is contiguous so each segment + * is always a single chunk. + * + * Return: true if a chunk was yielded, false when @seg_remaining is 0. + */ +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining) +{ + unsigned int region_len, chunk; + + if (!seg_remaining) + return false; + + /* IOVA path: contiguous DMA range, no region boundaries */ + if (dma_use_iova(&map->iova_state)) { + *addr = map->iova_state.addr + map->iova_offset; + *chunk_len = seg_remaining; + *mapping_len = 0; + map->iova_offset += seg_remaining; + return true; + } + + /* Fallback path: per-region iteration */ + + if (map->frag_idx == -1) { + region_len = map->linear_len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->linear_dma + map->offset; + } else { + region_len = map->frags[map->frag_idx].len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->frags[map->frag_idx].dma + map->offset; + } + + *mapping_len = (map->offset == 0) ? region_len : 0; + *chunk_len = chunk; + map->offset += chunk; + + if (map->offset >= region_len) { + map->frag_idx++; + map->offset = 0; + } + + return true; +} +EXPORT_SYMBOL(tso_dma_map_next); diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index d84e955ac87b..85ca4d1ecf9e 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -45,6 +45,7 @@ TEST_PROGS = \ rss_input_xfrm.py \ toeplitz.py \ tso.py \ + uso.py \ xdp_metadata.py \ xsk_reconfig.py \ # diff --git a/tools/testing/selftests/drivers/net/hw/uso.py b/tools/testing/selftests/drivers/net/hw/uso.py new file mode 100755 index 000000000000..6d61e56cab3c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/uso.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Test USO + +Sends large UDP datagrams with UDP_SEGMENT and verifies that the peer +receives the expected total payload and that the NIC transmitted at least +the expected number of segments. 
+""" +import random +import socket +import string + +from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_eq, ksft_ge, ksft_variants, KsftNamedVariant +from lib.py import NetDrvEpEnv +from lib.py import bkg, defer, ethtool, ip, rand_port, wait_port_listen + +# python doesn't expose this constant, so we need to hardcode it to enable UDP +# segmentation for large payloads +UDP_SEGMENT = 103 + + +def _send_uso(cfg, ipver, mss, total_payload, port): + if ipver == "4": + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["4"], port) + else: + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["6"], port) + + sock.setsockopt(socket.IPPROTO_UDP, UDP_SEGMENT, mss) + payload = ''.join(random.choice(string.ascii_lowercase) + for _ in range(total_payload)) + sock.sendto(payload.encode(), dst) + sock.close() + + +def _get_tx_packets(cfg): + stats = ip(f"-s link show dev {cfg.ifname}", json=True)[0] + return stats['stats64']['tx']['packets'] + + +def _test_uso(cfg, ipver, mss, total_payload): + cfg.require_ipver(ipver) + cfg.require_cmd("socat", remote=True) + + features = ethtool(f"-k {cfg.ifname}", json=True) + uso_was_on = features[0]["tx-udp-segmentation"]["active"] + + try: + ethtool(f"-K {cfg.ifname} tx-udp-segmentation on") + except Exception as exc: + raise KsftSkipEx( + "Device does not support tx-udp-segmentation") from exc + if not uso_was_on: + defer(ethtool, f"-K {cfg.ifname} tx-udp-segmentation off") + + expected_segs = (total_payload + mss - 1) // mss + + port = rand_port(stype=socket.SOCK_DGRAM) + rx_cmd = f"socat -{ipver} -T 2 -u UDP-LISTEN:{port},reuseport STDOUT" + + tx_before = _get_tx_packets(cfg) + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True) as rx: + wait_port_listen(port, proto="udp", host=cfg.remote) + _send_uso(cfg, ipver, mss, total_payload, port) + + ksft_eq(len(rx.stdout), total_payload, + comment=f"Received {len(rx.stdout)}B, expected {total_payload}B") + + cfg.wait_hw_stats_settle() + + tx_after = _get_tx_packets(cfg) + tx_delta = tx_after - tx_before + + ksft_ge(tx_delta, expected_segs, + comment=f"Expected >= {expected_segs} tx packets, got {tx_delta}") + + +def _uso_variants(): + for ipver in ["4", "6"]: + yield KsftNamedVariant(f"v{ipver}_partial", ipver, 1400, 1400 * 10 + 500) + yield KsftNamedVariant(f"v{ipver}_exact", ipver, 1400, 1400 * 5) + + +@ksft_variants(_uso_variants()) +def test_uso(cfg, ipver, mss, total_payload): + """Send a USO datagram and verify the peer receives the expected segments.""" + _test_uso(cfg, ipver, mss, total_payload) + + +def main() -> None: + """Run USO tests.""" + with NetDrvEpEnv(__file__) as cfg: + ksft_run([test_uso], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() |
