 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/Makefile         |   2
 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.c           | 183
 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.h           |  32
 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c   |  19
 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c       | 240
 -rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h       |  46
 -rw-r--r--  include/linux/skbuff.h                              |  11
 -rw-r--r--  include/net/tso.h                                   | 100
 -rw-r--r--  net/core/tso.c                                      | 269
 -rw-r--r--  tools/testing/selftests/drivers/net/hw/Makefile     |   1
 -rwxr-xr-x  tools/testing/selftests/drivers/net/hw/uso.py       | 103

 11 files changed, 967 insertions(+), 39 deletions(-)
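
Not part of the patch itself: for orientation, a minimal, driver-agnostic sketch of how the new net/tso.h DMA-mapping helpers introduced below are meant to be driven at xmit time. foo_map_uso_payload() and everything driver-specific (descriptor posting, ring accounting, header bounce buffers) are hypothetical and reduced to comments; only the tso_* calls are from this series.

#include <linux/kernel.h>
#include <linux/dma-mapping.h>
#include <net/tso.h>

static int foo_map_uso_payload(struct device *dev, struct sk_buff *skb,
			       struct tso_dma_map *map,
			       struct tso_dma_map_completion_state *cstate)
{
	unsigned int hdr_len, mss, payload, seg_len, chunk, mapping_len;
	unsigned int num_segs, i;
	dma_addr_t addr;
	struct tso_t tso;

	hdr_len = tso_start(skb, &tso);
	mss = skb_shinfo(skb)->gso_size;
	payload = skb->len - hdr_len;
	num_segs = DIV_ROUND_UP(payload, mss);

	/* One contiguous IOVA mapping when possible, otherwise one
	 * mapping per payload region (linear data plus each frag).
	 */
	if (tso_dma_map_init(map, dev, skb, hdr_len))
		return -ENOMEM;

	for (i = 0; i < num_segs; i++) {
		seg_len = min(mss, payload - i * mss);

		/* tso_dma_map_count(map, seg_len) reports how many
		 * payload descriptors this segment needs, so ring space
		 * can be reserved up front.  Per-segment headers are
		 * built separately with tso_build_hdr() into a
		 * driver-owned bounce buffer (see bnxt_gso.c).
		 */
		while (tso_dma_map_next(map, &addr, &chunk, &mapping_len,
					seg_len)) {
			/* Post a descriptor for [addr, addr + chunk).
			 * When mapping_len != 0 (fallback path only),
			 * record it on that descriptor for the per-BD
			 * unmap at completion time.
			 */
			seg_len -= chunk;
		}
	}

	/* Unmapping is deferred until the last segment completes. */
	tso_dma_map_completion_save(map, cstate);
	return 0;
}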
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index ba6c239d52fa..debef78c8b6d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o +bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o bnxt_gso.o bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fe8b886ff82e..2715632115a5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -74,6 +74,8 @@ #include "bnxt_debugfs.h" #include "bnxt_coredump.h" #include "bnxt_hwmon.h" +#include "bnxt_gso.h" +#include <net/tso.h> #define BNXT_TX_TIMEOUT (5 * HZ) #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \ @@ -447,7 +449,7 @@ const u16 bnxt_lhint_arr[] = { TX_BD_FLAGS_LHINT_2048_AND_LARGER, }; -static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); @@ -506,6 +508,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) } } #endif + if (skb_is_gso(skb) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) + return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb); + free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { /* We must have raced with NAPI cleanup */ @@ -656,6 +663,7 @@ normal_tx: goto tx_free; dma_unmap_addr_set(tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | TX_BD_CNT(last_frag + 2); @@ -663,10 +671,9 @@ normal_tx: txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag); prod = NEXT_TX(prod); - txbd1 = (struct tx_bd_ext *) - &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1 = bnxt_init_ext_bd(bp, txr, prod, lflags, vlan_tag_flags, + cfa_action); - txbd1->tx_bd_hsize_lflags = lflags; if (skb_is_gso(skb)) { bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4); u32 hdr_len; @@ -693,7 +700,6 @@ normal_tx: } else if (skb->ip_summed == CHECKSUM_PARTIAL) { txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); - txbd1->tx_bd_mss = 0; } length >>= 9; @@ -706,9 +712,6 @@ normal_tx: flags |= bnxt_lhint_arr[length]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); - txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); - txbd1->tx_bd_cfa_action = - cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); txbd0 = txbd; for (i = 0; i < last_frag; i++) { frag = &skb_shinfo(skb)->frags[i]; @@ -725,6 +728,7 @@ normal_tx: tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; netmem_dma_unmap_addr_set(skb_frag_netmem(frag), tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); txbd->tx_bd_haddr = cpu_to_le64(mapping); @@ -814,17 +818,19 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 hw_cons = txr->tx_hw_cons; unsigned int tx_bytes = 0; u16 cons = txr->tx_cons; - skb_frag_t *frag; + unsigned int 
dma_len; + dma_addr_t dma_addr; int tx_pkts = 0; bool rc = false; while (RING_TX(bp, cons) != hw_cons) { - struct bnxt_sw_tx_bd *tx_buf; + struct bnxt_sw_tx_bd *tx_buf, *head_buf; struct sk_buff *skb; bool is_ts_pkt; int j, last; tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; + head_buf = tx_buf; skb = tx_buf->skb; if (unlikely(!skb)) { @@ -849,20 +855,44 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, goto next_tx_int; } - dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } + last = tx_buf->nr_frags; for (j = 0; j < last; j++) { - frag = &skb_shinfo(skb)->frags[j]; cons = NEXT_TX(cons); tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } + + if (unlikely(head_buf->is_sw_gso)) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { + tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + tx_pkts--; + tx_bytes -= skb->len; + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (unlikely(is_ts_pkt)) { if (BNXT_CHIP_P5(bp)) { /* PTP worker takes ownership of the skb */ @@ -3399,19 +3429,23 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, { int i, max_idx; struct pci_dev *pdev = bp->pdev; + unsigned int dma_len; + dma_addr_t dma_addr; max_idx = bp->tx_nr_pages * TX_DESC_CNT; for (i = 0; i < max_idx;) { struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i]; + struct bnxt_sw_tx_bd *head_buf = tx_buf; struct sk_buff *skb; int j, last; if (idx < bp->tx_nr_rings_xdp && tx_buf->action == XDP_REDIRECT) { - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - dma_unmap_len(tx_buf, len), + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, DMA_TO_DEVICE); xdp_return_frame(tx_buf->xdpf); tx_buf->action = 0; @@ -3434,25 +3468,43 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, continue; } - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), - DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } last = tx_buf->nr_frags; i += 2; for (j = 0; j < last; j++, i++) { int ring_idx = i & bp->tx_ring_mask; - skb_frag_t *frag = &skb_shinfo(skb)->frags[j]; tx_buf = &txr->tx_buf_ring[ring_idx]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } - dev_kfree_skb(skb); + if (head_buf->is_sw_gso) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { 
+ tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (skb) + dev_kfree_skb(skb); } netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx)); } @@ -3965,6 +4017,39 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) return rc; } +static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev) +{ + if (!txr->tx_inline_buf) + return; + + dma_unmap_single(&pdev->dev, txr->tx_inline_dma, + txr->tx_inline_size, DMA_TO_DEVICE); + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + txr->tx_inline_size = 0; +} + +static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev, + unsigned int size) +{ + txr->tx_inline_buf = kmalloc(size, GFP_KERNEL); + if (!txr->tx_inline_buf) + return -ENOMEM; + + txr->tx_inline_dma = dma_map_single(&pdev->dev, txr->tx_inline_buf, + size, DMA_TO_DEVICE); + if (dma_mapping_error(&pdev->dev, txr->tx_inline_dma)) { + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + return -ENOMEM; + } + txr->tx_inline_size = size; + + return 0; +} + static void bnxt_free_tx_rings(struct bnxt *bp) { int i; @@ -3983,6 +4068,8 @@ static void bnxt_free_tx_rings(struct bnxt *bp) txr->tx_push = NULL; } + bnxt_free_tx_inline_buf(txr, pdev); + ring = &txr->tx_ring_struct; bnxt_free_ring(bp, &ring->ring_mem); @@ -4048,6 +4135,13 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp) sizeof(struct tx_push_bd); txr->data_mapping = cpu_to_le64(mapping); } + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + rc = bnxt_alloc_tx_inline_buf(txr, pdev, + BNXT_SW_USO_MAX_SEGS * + TSO_HEADER_SIZE); + if (rc) + return rc; + } qidx = bp->tc_to_qidx[j]; ring->queue_id = bp->q_info[qidx].queue_id; spin_lock_init(&txr->xdp_tx_lock); @@ -4586,10 +4680,13 @@ static int bnxt_init_rx_rings(struct bnxt *bp) static int bnxt_init_tx_rings(struct bnxt *bp) { + netdev_features_t features; u16 i; + features = bp->dev->features; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, - BNXT_MIN_TX_DESC_CNT); + bnxt_min_tx_desc_cnt(bp, features)); for (i = 0; i < bp->tx_nr_rings; i++) { struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; @@ -13788,6 +13885,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false)) features &= ~NETIF_F_NTUPLE; + if ((features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS) + features &= ~NETIF_F_GSO_UDP_L4; + if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog) features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); @@ -13833,6 +13935,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features) int rc = 0; bool re_init = false; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, + bnxt_min_tx_desc_cnt(bp, features)); + flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS; if (features & NETIF_F_GRO_HW) flags |= BNXT_FLAG_GRO; @@ -16858,8 +16963,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_GRO; - if (bp->flags & BNXT_FLAG_UDP_GSO_CAP) - dev->hw_features |= NETIF_F_GSO_UDP_L4; + dev->hw_features |= NETIF_F_GSO_UDP_L4; if (BNXT_SUPPORTS_TPA(bp)) dev->hw_features |= NETIF_F_LRO; @@ -16892,8 +16996,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(dev, GSO_MAX_SIZE); 
- if (bp->tso_max_segs) + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + u16 max_segs = BNXT_SW_USO_MAX_SEGS; + + if (bp->tso_max_segs) + max_segs = min_t(u16, max_segs, bp->tso_max_segs); + netif_set_tso_max_segs(dev, max_segs); + } else if (bp->tso_max_segs) { netif_set_tso_max_segs(dev, bp->tso_max_segs); + } dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3558a36ece12..fe50576ae525 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -11,6 +11,8 @@ #ifndef BNXT_H #define BNXT_H +#include <net/tso.h> + #define DRV_MODULE_NAME "bnxt_en" /* DO NOT CHANGE DRV_VER_* defines @@ -892,14 +894,19 @@ struct bnxt_sw_tx_bd { struct page *page; u8 is_ts_pkt; u8 is_push; + u8 is_sw_gso; u8 action; unsigned short nr_frags; union { u16 rx_prod; u16 txts_prod; }; + struct tso_dma_map_completion_state sw_gso_cstate; }; +#define BNXT_SW_GSO_MID 1 +#define BNXT_SW_GSO_LAST 2 + struct bnxt_sw_rx_bd { void *data; u8 *data_ptr; @@ -996,6 +1003,12 @@ struct bnxt_tx_ring_info { dma_addr_t tx_push_mapping; __le64 data_mapping; + void *tx_inline_buf; + dma_addr_t tx_inline_dma; + unsigned int tx_inline_size; + u16 tx_inline_prod; + u16 tx_inline_cons; + #define BNXT_DEV_STATE_CLOSING 0x1 u32 dev_state; @@ -2836,6 +2849,24 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp, return bp->tx_ring_size - (used & bp->tx_ring_mask); } +static inline struct tx_bd_ext * +bnxt_init_ext_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + u16 prod, __le32 lflags, u32 vlan_tag_flags, + u32 cfa_action) +{ + struct tx_bd_ext *txbd1; + + txbd1 = (struct tx_bd_ext *) + &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1->tx_bd_hsize_lflags = lflags; + txbd1->tx_bd_mss = 0; + txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); + txbd1->tx_bd_cfa_action = + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); + + return txbd1; +} + static inline void bnxt_writeq(struct bnxt *bp, u64 val, volatile void __iomem *addr) { @@ -2969,6 +3000,7 @@ unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp); int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init); void bnxt_tx_disable(struct bnxt *bp); void bnxt_tx_enable(struct bnxt *bp); +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb); void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 curr); void bnxt_report_link(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 6826bf762d26..9ded88196bb4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -33,6 +33,7 @@ #include "bnxt_xdp.h" #include "bnxt_ptp.h" #include "bnxt_ethtool.h" +#include "bnxt_gso.h" #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ #include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ #include "bnxt_coredump.h" @@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev, u8 tcp_data_split = kernel_ering->tcp_data_split; struct bnxt *bp = netdev_priv(dev); u8 hds_config_mod; + int rc; if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; + if ((dev->features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS) + 
return -EINVAL; + hds_config_mod = tcp_data_split != dev->cfg->hds_config; if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod) return -EINVAL; @@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev, bp->tx_ring_size = ering->tx_pending; bnxt_set_ring_params(bp); - if (netif_running(dev)) - return bnxt_open_nic(bp, false, false); + if (netif_running(dev)) { + rc = bnxt_open_nic(bp, false, false); + if (rc) + return rc; + } + /* ring size changes may affect features (SW USO requires a minimum + * ring size), so recalculate features to ensure the correct features + * are blocked/available. + */ + netdev_update_features(dev); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c new file mode 100644 index 000000000000..f317f60414e8 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/netdev_queues.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/tso.h> +#include <linux/bnxt/hsi.h> + +#include "bnxt.h" +#include "bnxt_gso.h" + +static u32 bnxt_sw_gso_lhint(unsigned int len) +{ + if (len <= 512) + return TX_BD_FLAGS_LHINT_512_AND_SMALLER; + else if (len <= 1023) + return TX_BD_FLAGS_LHINT_512_TO_1023; + else if (len <= 2047) + return TX_BD_FLAGS_LHINT_1024_TO_2047; + else + return TX_BD_FLAGS_LHINT_2048_AND_LARGER; +} + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb) +{ + unsigned int last_unmap_len __maybe_unused = 0; + dma_addr_t last_unmap_addr __maybe_unused = 0; + struct bnxt_sw_tx_bd *last_unmap_buf = NULL; + unsigned int hdr_len, mss, num_segs; + struct pci_dev *pdev = bp->pdev; + unsigned int total_payload; + struct tso_dma_map map; + u32 vlan_tag_flags = 0; + int i, bds_needed; + struct tso_t tso; + u16 cfa_action; + __le32 csum; + u16 prod; + + hdr_len = tso_start(skb, &tso); + mss = skb_shinfo(skb)->gso_size; + total_payload = skb->len - hdr_len; + num_segs = DIV_ROUND_UP(total_payload, mss); + + if (unlikely(num_segs <= 1)) + goto drop; + + /* Upper bound on the number of descriptors needed. + * + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is + * at most num_segs + nr_frags (each frag boundary crossing adds at + * most 1 extra BD). + */ + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1; + + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) { + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + return NETDEV_TX_BUSY; + } + + /* BD backpressure alone cannot prevent overwriting in-flight + * headers in the inline buffer. Check slot availability directly. 
+ */ + if (!netif_txq_maybe_stop(txq, bnxt_inline_avail(txr), + num_segs, num_segs)) + return NETDEV_TX_BUSY; + + if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len))) + goto drop; + + cfa_action = bnxt_xmit_get_cfa_action(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN | + skb_vlan_tag_get(skb); + if (skb->vlan_proto == htons(ETH_P_8021Q)) + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; + } + + csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); + if (!tso.ipv6) + csum |= cpu_to_le32(TX_BD_FLAGS_IP_CKSUM); + + prod = txr->tx_prod; + + for (i = 0; i < num_segs; i++) { + unsigned int seg_payload = min_t(unsigned int, mss, + total_payload - i * mss); + u16 slot = (txr->tx_inline_prod + i) & + (BNXT_SW_USO_MAX_SEGS - 1); + struct bnxt_sw_tx_bd *tx_buf; + unsigned int mapping_len; + dma_addr_t this_hdr_dma; + unsigned int chunk_len; + unsigned int offset; + dma_addr_t dma_addr; + struct tx_bd *txbd; + struct udphdr *uh; + void *this_hdr; + int bd_count; + bool last; + u32 flags; + + last = (i == num_segs - 1); + offset = slot * TSO_HEADER_SIZE; + this_hdr = txr->tx_inline_buf + offset; + this_hdr_dma = txr->tx_inline_dma + offset; + + tso_build_hdr(skb, this_hdr, &tso, seg_payload, last); + + /* Zero stale csum fields copied from the original skb; + * HW offload recomputes from scratch. + */ + uh = this_hdr + skb_transport_offset(skb); + uh->check = 0; + if (!tso.ipv6) { + struct iphdr *iph = this_hdr + skb_network_offset(skb); + + iph->check = 0; + } + + dma_sync_single_for_device(&pdev->dev, this_hdr_dma, + hdr_len, DMA_TO_DEVICE); + + bd_count = tso_dma_map_count(&map, seg_payload); + + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + + tx_buf->skb = skb; + tx_buf->nr_frags = bd_count; + tx_buf->is_push = 0; + tx_buf->is_ts_pkt = 0; + + dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma); + dma_unmap_len_set(tx_buf, len, 0); + + if (last) { + tx_buf->is_sw_gso = BNXT_SW_GSO_LAST; + tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate); + } else { + tx_buf->is_sw_gso = BNXT_SW_GSO_MID; + } + + flags = (hdr_len << TX_BD_LEN_SHIFT) | + TX_BD_TYPE_LONG_TX_BD | + TX_BD_CNT(2 + bd_count); + + flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload); + + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma); + txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, + 2 + bd_count); + + prod = NEXT_TX(prod); + bnxt_init_ext_bd(bp, txr, prod, csum, + vlan_tag_flags, cfa_action); + + /* set dma_unmap_len on the LAST BD touching each + * region. Since completions are in-order, the last segment + * completes after all earlier ones, so the unmap is safe. 
+ */ + while (tso_dma_map_next(&map, &dma_addr, &chunk_len, + &mapping_len, seg_payload)) { + prod = NEXT_TX(prod); + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + + txbd->tx_bd_haddr = cpu_to_le64(dma_addr); + dma_unmap_addr_set(tx_buf, mapping, dma_addr); + dma_unmap_len_set(tx_buf, len, 0); + tx_buf->skb = NULL; + tx_buf->is_sw_gso = 0; + + if (mapping_len) { + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, + mapping, + last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, + len, + last_unmap_len); + } + last_unmap_addr = dma_addr; + last_unmap_len = mapping_len; + } + last_unmap_buf = tx_buf; + + flags = chunk_len << TX_BD_LEN_SHIFT; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_opaque = 0; + + seg_payload -= chunk_len; + } + + txbd->tx_bd_len_flags_type |= + cpu_to_le32(TX_BD_FLAGS_PACKET_END); + + prod = NEXT_TX(prod); + } + + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, len, last_unmap_len); + } + + txr->tx_inline_prod += num_segs; + + netdev_tx_sent_queue(txq, skb->len); + + WRITE_ONCE(txr->tx_prod, prod); + /* Sync BDs before doorbell */ + wmb(); + bnxt_db_write(bp, &txr->tx_db, prod); + + if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh)) + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + + return NETDEV_TX_OK; + +drop: + dev_kfree_skb_any(skb); + dev_core_stats_tx_dropped_inc(bp->dev); + return NETDEV_TX_OK; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h new file mode 100644 index 000000000000..47528c20f311 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_GSO_H +#define BNXT_GSO_H + +/* Maximum segments the stack may send in a single SW USO skb. + * This caps gso_max_segs for NICs without HW USO support. + */ +#define BNXT_SW_USO_MAX_SEGS 64 + +/* Worst-case TX descriptors consumed by one SW USO packet: + * Each segment: 1 long BD + 1 ext BD + payload BDs. + * Total payload BDs across all segs <= num_segs + nr_frags (each frag + * boundary crossing adds at most 1 extra BD). + * So: 3 * max_segs + MAX_SKB_FRAGS + 1 = 3 * 64 + 17 + 1 = 210. 
+ */ +#define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1) + +static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr) +{ + return BNXT_SW_USO_MAX_SEGS - + (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons)); +} + +static inline int bnxt_min_tx_desc_cnt(struct bnxt *bp, + netdev_features_t features) +{ + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + (features & NETIF_F_GSO_UDP_L4)) + return BNXT_SW_USO_MAX_DESCS; + return BNXT_MIN_TX_DESC_CNT; +} + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb); + +#endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 26fe18bcfad8..2bcf78a4de7b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3764,6 +3764,17 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) } /** + * skb_frag_phys - gets the physical address of the data in a paged fragment + * @frag: the paged fragment buffer + * + * Returns: the physical address of the data within @frag. + */ +static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) +{ + return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); +} + +/** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from diff --git a/include/net/tso.h b/include/net/tso.h index e7e157ae0526..da82aabd1d48 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -3,6 +3,7 @@ #define _TSO_H #include <linux/skbuff.h> +#include <linux/dma-mapping.h> #include <net/ip.h> #define TSO_HEADER_SIZE 256 @@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); int tso_start(struct sk_buff *skb, struct tso_t *tso); +/** + * struct tso_dma_map - DMA mapping state for GSO payload + * @dev: device used for DMA mapping + * @skb: the GSO skb being mapped + * @hdr_len: per-segment header length + * @iova_state: DMA IOVA state (when IOMMU available) + * @iova_offset: global byte offset into IOVA range (IOVA path only) + * @total_len: total payload length + * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag) + * @offset: byte offset within current region + * @linear_dma: DMA address of the linear payload + * @linear_len: length of the linear payload + * @nr_frags: number of frags successfully DMA-mapped + * @frags: per-frag DMA address and length + * + * DMA-maps the payload regions of a GSO skb (linear data + frags). + * Prefers the DMA IOVA API for a single contiguous mapping with one + * IOTLB sync; falls back to per-region dma_map_phys() otherwise. + */ +struct tso_dma_map { + struct device *dev; + const struct sk_buff *skb; + unsigned int hdr_len; + /* IOVA path */ + struct dma_iova_state iova_state; + size_t iova_offset; + size_t total_len; + /* Fallback path if IOVA path fails */ + int frag_idx; + unsigned int offset; + dma_addr_t linear_dma; + unsigned int linear_len; + unsigned int nr_frags; + struct { + dma_addr_t dma; + unsigned int len; + } frags[MAX_SKB_FRAGS]; +}; + +/** + * struct tso_dma_map_completion_state - Completion-time cleanup state + * @iova_state: DMA IOVA state (when IOMMU available) + * @total_len: total payload length of the IOVA mapping + * + * Drivers store this on their SW ring at xmit time via + * tso_dma_map_completion_save(), then call tso_dma_map_complete() at + * completion time. 
+ */ +struct tso_dma_map_completion_state { + struct dma_iova_state iova_state; + size_t total_len; +}; + +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len); +void tso_dma_map_cleanup(struct tso_dma_map *map); +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len); +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining); + +/** + * tso_dma_map_completion_save - save state needed for completion-time cleanup + * @map: the xmit-time DMA map + * @cstate: driver-owned storage that persists until completion + * + * Should be called at xmit time to update the completion state and later passed + * to tso_dma_map_complete(). + */ +static inline void +tso_dma_map_completion_save(const struct tso_dma_map *map, + struct tso_dma_map_completion_state *cstate) +{ + cstate->iova_state = map->iova_state; + cstate->total_len = map->total_len; +} + +/** + * tso_dma_map_complete - tear down mapping at completion time + * @dev: the device that owns the mapping + * @cstate: state saved by tso_dma_map_completion_save() + * + * Return: true if the IOVA path was used and the mapping has been + * destroyed; false if the fallback per-region path was used and the + * driver must unmap via its normal completion path. + */ +static inline bool +tso_dma_map_complete(struct device *dev, + struct tso_dma_map_completion_state *cstate) +{ + if (dma_use_iova(&cstate->iova_state)) { + dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len, + DMA_TO_DEVICE, 0); + return true; + } + + return false; +} + #endif /* _TSO_H */ diff --git a/net/core/tso.c b/net/core/tso.c index 6df997b9076e..347b3856ddb9 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -3,6 +3,7 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/tso.h> +#include <linux/dma-mapping.h> #include <linux/unaligned.h> void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, @@ -87,3 +88,271 @@ int tso_start(struct sk_buff *skb, struct tso_t *tso) return hdr_len; } EXPORT_SYMBOL(tso_start); + +static int tso_dma_iova_try(struct device *dev, struct tso_dma_map *map, + phys_addr_t phys, size_t linear_len, + size_t total_len, size_t *offset) +{ + const struct sk_buff *skb; + unsigned int nr_frags; + int i; + + if (!dma_iova_try_alloc(dev, &map->iova_state, phys, total_len)) + return 1; + + skb = map->skb; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (linear_len) { + if (dma_iova_link(dev, &map->iova_state, + phys, *offset, linear_len, + DMA_TO_DEVICE, 0)) + goto iova_fail; + map->linear_len = linear_len; + *offset += linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + if (dma_iova_link(dev, &map->iova_state, + skb_frag_phys(frag), *offset, + frag_len, DMA_TO_DEVICE, 0)) { + map->nr_frags = i; + goto iova_fail; + } + map->frags[i].len = frag_len; + *offset += frag_len; + map->nr_frags = i + 1; + } + + if (dma_iova_sync(dev, &map->iova_state, 0, total_len)) + goto iova_fail; + + return 0; + +iova_fail: + dma_iova_destroy(dev, &map->iova_state, *offset, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + /* reset map state */ + map->frag_idx = -1; + map->offset = 0; + map->linear_len = 0; + map->nr_frags = 0; + + return 1; +} + +/** + * tso_dma_map_init - DMA-map GSO payload regions + * @map: map struct to initialize + * @dev: 
device for DMA mapping + * @skb: the GSO skb + * @hdr_len: per-segment header length in bytes + * + * DMA-maps the linear payload (after headers) and all frags. + * Prefers the DMA IOVA API (one contiguous mapping, one IOTLB sync); + * falls back to per-region dma_map_phys() when IOVA is not available. + * Positions the iterator at byte 0 of the payload. + * + * Return: 0 on success, -ENOMEM on DMA mapping failure (partial mappings + * are cleaned up internally). + */ +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len) +{ + unsigned int linear_len = skb_headlen(skb) - hdr_len; + unsigned int nr_frags = skb_shinfo(skb)->nr_frags; + size_t total_len = skb->len - hdr_len; + size_t offset = 0; + phys_addr_t phys; + int i; + + map->dev = dev; + map->skb = skb; + map->hdr_len = hdr_len; + map->frag_idx = -1; + map->offset = 0; + map->iova_offset = 0; + map->total_len = total_len; + map->linear_len = 0; + map->nr_frags = 0; + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + if (!total_len) + return 0; + + if (linear_len) + phys = virt_to_phys(skb->data + hdr_len); + else + phys = skb_frag_phys(&skb_shinfo(skb)->frags[0]); + + if (tso_dma_iova_try(dev, map, phys, linear_len, total_len, &offset)) { + /* IOVA path failed, map state was reset. Fallback to + * per-region dma_map_phys() + */ + if (linear_len) { + map->linear_dma = dma_map_phys(dev, phys, linear_len, + DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->linear_dma)) + return -ENOMEM; + map->linear_len = linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + map->frags[i].len = frag_len; + map->frags[i].dma = dma_map_phys(dev, skb_frag_phys(frag), + frag_len, DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->frags[i].dma)) { + tso_dma_map_cleanup(map); + return -ENOMEM; + } + map->nr_frags = i + 1; + } + } + + if (linear_len == 0 && nr_frags > 0) + map->frag_idx = 0; + + return 0; +} +EXPORT_SYMBOL(tso_dma_map_init); + +/** + * tso_dma_map_cleanup - unmap all DMA regions in a tso_dma_map + * @map: the map to clean up + * + * Handles both IOVA and fallback paths. For IOVA, calls + * dma_iova_destroy(). For fallback, unmaps each region individually. + */ +void tso_dma_map_cleanup(struct tso_dma_map *map) +{ + int i; + + if (dma_use_iova(&map->iova_state)) { + dma_iova_destroy(map->dev, &map->iova_state, map->total_len, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + } else { + if (map->linear_len) + dma_unmap_phys(map->dev, map->linear_dma, + map->linear_len, DMA_TO_DEVICE, 0); + + for (i = 0; i < map->nr_frags; i++) + dma_unmap_phys(map->dev, map->frags[i].dma, + map->frags[i].len, DMA_TO_DEVICE, 0); + } + + map->linear_len = 0; + map->nr_frags = 0; +} +EXPORT_SYMBOL(tso_dma_map_cleanup); + +/** + * tso_dma_map_count - count descriptors for a payload range + * @map: the payload map + * @len: number of payload bytes in this segment + * + * Counts how many contiguous DMA region chunks the next @len bytes + * will span, without advancing the iterator. On the IOVA path this + * is always 1 (contiguous). On the fallback path, uses region sizes + * from the current position. + * + * Return: the number of descriptors needed for @len bytes of payload. 
+ */ +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len) +{ + unsigned int offset = map->offset; + int idx = map->frag_idx; + unsigned int count = 0; + + if (!len) + return 0; + + if (dma_use_iova(&map->iova_state)) + return 1; + + while (len > 0) { + unsigned int region_len, chunk; + + if (idx == -1) + region_len = map->linear_len; + else + region_len = map->frags[idx].len; + + chunk = min(len, region_len - offset); + len -= chunk; + count++; + offset = 0; + idx++; + } + + return count; +} +EXPORT_SYMBOL(tso_dma_map_count); + +/** + * tso_dma_map_next - yield the next DMA address range + * @map: the payload map + * @addr: output DMA address + * @chunk_len: output chunk length + * @mapping_len: full DMA mapping length when this chunk starts a new + * mapping region, or 0 when continuing a previous one. + * On the IOVA path this is always 0 (driver must not + * do per-region unmaps; use tso_dma_map_cleanup instead). + * @seg_remaining: bytes left in current segment + * + * Yields the next (dma_addr, chunk_len) pair and advances the iterator. + * On the IOVA path, the entire payload is contiguous so each segment + * is always a single chunk. + * + * Return: true if a chunk was yielded, false when @seg_remaining is 0. + */ +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining) +{ + unsigned int region_len, chunk; + + if (!seg_remaining) + return false; + + /* IOVA path: contiguous DMA range, no region boundaries */ + if (dma_use_iova(&map->iova_state)) { + *addr = map->iova_state.addr + map->iova_offset; + *chunk_len = seg_remaining; + *mapping_len = 0; + map->iova_offset += seg_remaining; + return true; + } + + /* Fallback path: per-region iteration */ + + if (map->frag_idx == -1) { + region_len = map->linear_len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->linear_dma + map->offset; + } else { + region_len = map->frags[map->frag_idx].len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->frags[map->frag_idx].dma + map->offset; + } + + *mapping_len = (map->offset == 0) ? region_len : 0; + *chunk_len = chunk; + map->offset += chunk; + + if (map->offset >= region_len) { + map->frag_idx++; + map->offset = 0; + } + + return true; +} +EXPORT_SYMBOL(tso_dma_map_next); diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index d84e955ac87b..85ca4d1ecf9e 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -45,6 +45,7 @@ TEST_PROGS = \ rss_input_xfrm.py \ toeplitz.py \ tso.py \ + uso.py \ xdp_metadata.py \ xsk_reconfig.py \ # diff --git a/tools/testing/selftests/drivers/net/hw/uso.py b/tools/testing/selftests/drivers/net/hw/uso.py new file mode 100755 index 000000000000..6d61e56cab3c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/uso.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Test USO + +Sends large UDP datagrams with UDP_SEGMENT and verifies that the peer +receives the expected total payload and that the NIC transmitted at least +the expected number of segments. 
+""" +import random +import socket +import string + +from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_eq, ksft_ge, ksft_variants, KsftNamedVariant +from lib.py import NetDrvEpEnv +from lib.py import bkg, defer, ethtool, ip, rand_port, wait_port_listen + +# python doesn't expose this constant, so we need to hardcode it to enable UDP +# segmentation for large payloads +UDP_SEGMENT = 103 + + +def _send_uso(cfg, ipver, mss, total_payload, port): + if ipver == "4": + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["4"], port) + else: + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["6"], port) + + sock.setsockopt(socket.IPPROTO_UDP, UDP_SEGMENT, mss) + payload = ''.join(random.choice(string.ascii_lowercase) + for _ in range(total_payload)) + sock.sendto(payload.encode(), dst) + sock.close() + + +def _get_tx_packets(cfg): + stats = ip(f"-s link show dev {cfg.ifname}", json=True)[0] + return stats['stats64']['tx']['packets'] + + +def _test_uso(cfg, ipver, mss, total_payload): + cfg.require_ipver(ipver) + cfg.require_cmd("socat", remote=True) + + features = ethtool(f"-k {cfg.ifname}", json=True) + uso_was_on = features[0]["tx-udp-segmentation"]["active"] + + try: + ethtool(f"-K {cfg.ifname} tx-udp-segmentation on") + except Exception as exc: + raise KsftSkipEx( + "Device does not support tx-udp-segmentation") from exc + if not uso_was_on: + defer(ethtool, f"-K {cfg.ifname} tx-udp-segmentation off") + + expected_segs = (total_payload + mss - 1) // mss + + port = rand_port(stype=socket.SOCK_DGRAM) + rx_cmd = f"socat -{ipver} -T 2 -u UDP-LISTEN:{port},reuseport STDOUT" + + tx_before = _get_tx_packets(cfg) + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True) as rx: + wait_port_listen(port, proto="udp", host=cfg.remote) + _send_uso(cfg, ipver, mss, total_payload, port) + + ksft_eq(len(rx.stdout), total_payload, + comment=f"Received {len(rx.stdout)}B, expected {total_payload}B") + + cfg.wait_hw_stats_settle() + + tx_after = _get_tx_packets(cfg) + tx_delta = tx_after - tx_before + + ksft_ge(tx_delta, expected_segs, + comment=f"Expected >= {expected_segs} tx packets, got {tx_delta}") + + +def _uso_variants(): + for ipver in ["4", "6"]: + yield KsftNamedVariant(f"v{ipver}_partial", ipver, 1400, 1400 * 10 + 500) + yield KsftNamedVariant(f"v{ipver}_exact", ipver, 1400, 1400 * 5) + + +@ksft_variants(_uso_variants()) +def test_uso(cfg, ipver, mss, total_payload): + """Send a USO datagram and verify the peer receives the expected segments.""" + _test_uso(cfg, ipver, mss, total_payload) + + +def main() -> None: + """Run USO tests.""" + with NetDrvEpEnv(__file__) as cfg: + ksft_run([test_uso], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() |
