-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/Makefile        |   2
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.c          | 183
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.h          |  32
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |  19
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c      | 240
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h      |  46
-rw-r--r--  include/linux/skbuff.h                             |  11
-rw-r--r--  include/net/tso.h                                  | 100
-rw-r--r--  net/core/tso.c                                     | 269
-rw-r--r--  tools/testing/selftests/drivers/net/hw/Makefile    |   1
-rwxr-xr-x  tools/testing/selftests/drivers/net/hw/uso.py      | 103
11 files changed, 967 insertions(+), 39 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index ba6c239d52fa..debef78c8b6d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_BNXT) += bnxt_en.o
-bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o
+bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o bnxt_gso.o
bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o
bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index fe8b886ff82e..2715632115a5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -74,6 +74,8 @@
#include "bnxt_debugfs.h"
#include "bnxt_coredump.h"
#include "bnxt_hwmon.h"
+#include "bnxt_gso.h"
+#include <net/tso.h>
#define BNXT_TX_TIMEOUT (5 * HZ)
#define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \
@@ -447,7 +449,7 @@ const u16 bnxt_lhint_arr[] = {
TX_BD_FLAGS_LHINT_2048_AND_LARGER,
};
-static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb)
+u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb)
{
struct metadata_dst *md_dst = skb_metadata_dst(skb);
@@ -506,6 +508,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
}
}
#endif
+ if (skb_is_gso(skb) &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) &&
+ !(bp->flags & BNXT_FLAG_UDP_GSO_CAP))
+ return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb);
+
free_size = bnxt_tx_avail(bp, txr);
if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) {
/* We must have raced with NAPI cleanup */
@@ -656,6 +663,7 @@ normal_tx:
goto tx_free;
dma_unmap_addr_set(tx_buf, mapping, mapping);
+ dma_unmap_len_set(tx_buf, len, len);
flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD |
TX_BD_CNT(last_frag + 2);
@@ -663,10 +671,9 @@ normal_tx:
txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag);
prod = NEXT_TX(prod);
- txbd1 = (struct tx_bd_ext *)
- &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+ txbd1 = bnxt_init_ext_bd(bp, txr, prod, lflags, vlan_tag_flags,
+ cfa_action);
- txbd1->tx_bd_hsize_lflags = lflags;
if (skb_is_gso(skb)) {
bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4);
u32 hdr_len;
@@ -693,7 +700,6 @@ normal_tx:
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
txbd1->tx_bd_hsize_lflags |=
cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM);
- txbd1->tx_bd_mss = 0;
}
length >>= 9;
@@ -706,9 +712,6 @@ normal_tx:
flags |= bnxt_lhint_arr[length];
txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
- txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
- txbd1->tx_bd_cfa_action =
- cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
txbd0 = txbd;
for (i = 0; i < last_frag; i++) {
frag = &skb_shinfo(skb)->frags[i];
@@ -725,6 +728,7 @@ normal_tx:
tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
netmem_dma_unmap_addr_set(skb_frag_netmem(frag), tx_buf,
mapping, mapping);
+ dma_unmap_len_set(tx_buf, len, len);
txbd->tx_bd_haddr = cpu_to_le64(mapping);
@@ -814,17 +818,19 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
u16 hw_cons = txr->tx_hw_cons;
unsigned int tx_bytes = 0;
u16 cons = txr->tx_cons;
- skb_frag_t *frag;
+ unsigned int dma_len;
+ dma_addr_t dma_addr;
int tx_pkts = 0;
bool rc = false;
while (RING_TX(bp, cons) != hw_cons) {
- struct bnxt_sw_tx_bd *tx_buf;
+ struct bnxt_sw_tx_bd *tx_buf, *head_buf;
struct sk_buff *skb;
bool is_ts_pkt;
int j, last;
tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
+ head_buf = tx_buf;
skb = tx_buf->skb;
if (unlikely(!skb)) {
@@ -849,20 +855,44 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
goto next_tx_int;
}
- dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping),
- skb_headlen(skb), DMA_TO_DEVICE);
+ if (dma_unmap_len(tx_buf, len)) {
+ dma_addr = dma_unmap_addr(tx_buf, mapping);
+ dma_len = dma_unmap_len(tx_buf, len);
+
+ dma_unmap_single(&pdev->dev, dma_addr, dma_len,
+ DMA_TO_DEVICE);
+ }
+
last = tx_buf->nr_frags;
for (j = 0; j < last; j++) {
- frag = &skb_shinfo(skb)->frags[j];
cons = NEXT_TX(cons);
tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
- netmem_dma_unmap_page_attrs(&pdev->dev,
- dma_unmap_addr(tx_buf,
- mapping),
- skb_frag_size(frag),
- DMA_TO_DEVICE, 0);
+ if (dma_unmap_len(tx_buf, len)) {
+ dma_addr = dma_unmap_addr(tx_buf, mapping);
+ dma_len = dma_unmap_len(tx_buf, len);
+
+ netmem_dma_unmap_page_attrs(&pdev->dev,
+ dma_addr, dma_len,
+ DMA_TO_DEVICE, 0);
+ }
}
+
+ if (unlikely(head_buf->is_sw_gso)) {
+ u16 inline_cons = txr->tx_inline_cons + 1;
+
+ WRITE_ONCE(txr->tx_inline_cons, inline_cons);
+ if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+ tso_dma_map_complete(&pdev->dev,
+ &head_buf->sw_gso_cstate);
+ } else {
+ tx_pkts--;
+ tx_bytes -= skb->len;
+ skb = NULL;
+ }
+ head_buf->is_sw_gso = 0;
+ }
+
if (unlikely(is_ts_pkt)) {
if (BNXT_CHIP_P5(bp)) {
/* PTP worker takes ownership of the skb */
@@ -3399,19 +3429,23 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
{
int i, max_idx;
struct pci_dev *pdev = bp->pdev;
+ unsigned int dma_len;
+ dma_addr_t dma_addr;
max_idx = bp->tx_nr_pages * TX_DESC_CNT;
for (i = 0; i < max_idx;) {
struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i];
+ struct bnxt_sw_tx_bd *head_buf = tx_buf;
struct sk_buff *skb;
int j, last;
if (idx < bp->tx_nr_rings_xdp &&
tx_buf->action == XDP_REDIRECT) {
- dma_unmap_single(&pdev->dev,
- dma_unmap_addr(tx_buf, mapping),
- dma_unmap_len(tx_buf, len),
+ dma_addr = dma_unmap_addr(tx_buf, mapping);
+ dma_len = dma_unmap_len(tx_buf, len);
+
+ dma_unmap_single(&pdev->dev, dma_addr, dma_len,
DMA_TO_DEVICE);
xdp_return_frame(tx_buf->xdpf);
tx_buf->action = 0;
@@ -3434,25 +3468,43 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
continue;
}
- dma_unmap_single(&pdev->dev,
- dma_unmap_addr(tx_buf, mapping),
- skb_headlen(skb),
- DMA_TO_DEVICE);
+ if (dma_unmap_len(tx_buf, len)) {
+ dma_addr = dma_unmap_addr(tx_buf, mapping);
+ dma_len = dma_unmap_len(tx_buf, len);
+
+ dma_unmap_single(&pdev->dev, dma_addr, dma_len,
+ DMA_TO_DEVICE);
+ }
last = tx_buf->nr_frags;
i += 2;
for (j = 0; j < last; j++, i++) {
int ring_idx = i & bp->tx_ring_mask;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[j];
tx_buf = &txr->tx_buf_ring[ring_idx];
- netmem_dma_unmap_page_attrs(&pdev->dev,
- dma_unmap_addr(tx_buf,
- mapping),
- skb_frag_size(frag),
- DMA_TO_DEVICE, 0);
+ if (dma_unmap_len(tx_buf, len)) {
+ dma_addr = dma_unmap_addr(tx_buf, mapping);
+ dma_len = dma_unmap_len(tx_buf, len);
+
+ netmem_dma_unmap_page_attrs(&pdev->dev,
+ dma_addr, dma_len,
+ DMA_TO_DEVICE, 0);
+ }
}
- dev_kfree_skb(skb);
+ if (head_buf->is_sw_gso) {
+ u16 inline_cons = txr->tx_inline_cons + 1;
+
+ WRITE_ONCE(txr->tx_inline_cons, inline_cons);
+ if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+ tso_dma_map_complete(&pdev->dev,
+ &head_buf->sw_gso_cstate);
+ } else {
+ skb = NULL;
+ }
+ head_buf->is_sw_gso = 0;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
}
netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx));
}
@@ -3965,6 +4017,39 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
return rc;
}
+static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr,
+ struct pci_dev *pdev)
+{
+ if (!txr->tx_inline_buf)
+ return;
+
+ dma_unmap_single(&pdev->dev, txr->tx_inline_dma,
+ txr->tx_inline_size, DMA_TO_DEVICE);
+ kfree(txr->tx_inline_buf);
+ txr->tx_inline_buf = NULL;
+ txr->tx_inline_size = 0;
+}
+
+static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr,
+ struct pci_dev *pdev,
+ unsigned int size)
+{
+ txr->tx_inline_buf = kmalloc(size, GFP_KERNEL);
+ if (!txr->tx_inline_buf)
+ return -ENOMEM;
+
+ txr->tx_inline_dma = dma_map_single(&pdev->dev, txr->tx_inline_buf,
+ size, DMA_TO_DEVICE);
+ if (dma_mapping_error(&pdev->dev, txr->tx_inline_dma)) {
+ kfree(txr->tx_inline_buf);
+ txr->tx_inline_buf = NULL;
+ return -ENOMEM;
+ }
+ txr->tx_inline_size = size;
+
+ return 0;
+}
+
static void bnxt_free_tx_rings(struct bnxt *bp)
{
int i;
@@ -3983,6 +4068,8 @@ static void bnxt_free_tx_rings(struct bnxt *bp)
txr->tx_push = NULL;
}
+ bnxt_free_tx_inline_buf(txr, pdev);
+
ring = &txr->tx_ring_struct;
bnxt_free_ring(bp, &ring->ring_mem);
@@ -4048,6 +4135,13 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
sizeof(struct tx_push_bd);
txr->data_mapping = cpu_to_le64(mapping);
}
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+ rc = bnxt_alloc_tx_inline_buf(txr, pdev,
+ BNXT_SW_USO_MAX_SEGS *
+ TSO_HEADER_SIZE);
+ if (rc)
+ return rc;
+ }
qidx = bp->tc_to_qidx[j];
ring->queue_id = bp->q_info[qidx].queue_id;
spin_lock_init(&txr->xdp_tx_lock);
@@ -4586,10 +4680,13 @@ static int bnxt_init_rx_rings(struct bnxt *bp)
static int bnxt_init_tx_rings(struct bnxt *bp)
{
+ netdev_features_t features;
u16 i;
+ features = bp->dev->features;
+
bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
- BNXT_MIN_TX_DESC_CNT);
+ bnxt_min_tx_desc_cnt(bp, features));
for (i = 0; i < bp->tx_nr_rings; i++) {
struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
@@ -13788,6 +13885,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev,
if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false))
features &= ~NETIF_F_NTUPLE;
+ if ((features & NETIF_F_GSO_UDP_L4) &&
+ !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS)
+ features &= ~NETIF_F_GSO_UDP_L4;
+
if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog)
features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW);
@@ -13833,6 +13935,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
int rc = 0;
bool re_init = false;
+ bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
+ bnxt_min_tx_desc_cnt(bp, features));
+
flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS;
if (features & NETIF_F_GRO_HW)
flags |= BNXT_FLAG_GRO;
@@ -16858,8 +16963,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH |
NETIF_F_RXCSUM | NETIF_F_GRO;
- if (bp->flags & BNXT_FLAG_UDP_GSO_CAP)
- dev->hw_features |= NETIF_F_GSO_UDP_L4;
+ dev->hw_features |= NETIF_F_GSO_UDP_L4;
if (BNXT_SUPPORTS_TPA(bp))
dev->hw_features |= NETIF_F_LRO;
@@ -16892,8 +16996,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
dev->priv_flags |= IFF_UNICAST_FLT;
netif_set_tso_max_size(dev, GSO_MAX_SIZE);
- if (bp->tso_max_segs)
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+ u16 max_segs = BNXT_SW_USO_MAX_SEGS;
+
+ if (bp->tso_max_segs)
+ max_segs = min_t(u16, max_segs, bp->tso_max_segs);
+ netif_set_tso_max_segs(dev, max_segs);
+ } else if (bp->tso_max_segs) {
netif_set_tso_max_segs(dev, bp->tso_max_segs);
+ }
dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
NETDEV_XDP_ACT_RX_SG;
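
The completion-path rework above hinges on one convention: dma_unmap_len() is set only on the last BD that touches a DMA mapping, and left at zero everywhere else, so each mapping is released exactly once even when several BDs share it. A minimal user-space sketch of that ownership rule (struct and values hypothetical, not the driver's):

/* Sketch: one DMA region may back several BDs; only the BD completed
 * last (completions are in-order) stores the unmap length, so the
 * region is released exactly once. Names are hypothetical.
 */
#include <stdio.h>

struct sw_bd { unsigned long addr; unsigned int len; };

static void complete_bd(const struct sw_bd *bd)
{
	if (bd->len)            /* owns the mapping: unmap it */
		printf("unmap addr=0x%lx len=%u\n", bd->addr, bd->len);
	/* len == 0: mapping owned by a later BD, nothing to do */
}

int main(void)
{
	/* a 3000B region split across two BDs of 1500B each */
	struct sw_bd ring[2] = {
		{ 0x1000, 0    },  /* first chunk: not the owner */
		{ 0x1000, 3000 },  /* last chunk: owns the full region */
	};

	for (int i = 0; i < 2; i++)
		complete_bd(&ring[i]);
	return 0;
}
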
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3558a36ece12..fe50576ae525 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -11,6 +11,8 @@
#ifndef BNXT_H
#define BNXT_H
+#include <net/tso.h>
+
#define DRV_MODULE_NAME "bnxt_en"
/* DO NOT CHANGE DRV_VER_* defines
@@ -892,14 +894,19 @@ struct bnxt_sw_tx_bd {
struct page *page;
u8 is_ts_pkt;
u8 is_push;
+ u8 is_sw_gso;
u8 action;
unsigned short nr_frags;
union {
u16 rx_prod;
u16 txts_prod;
};
+ struct tso_dma_map_completion_state sw_gso_cstate;
};
+#define BNXT_SW_GSO_MID 1
+#define BNXT_SW_GSO_LAST 2
+
struct bnxt_sw_rx_bd {
void *data;
u8 *data_ptr;
@@ -996,6 +1003,12 @@ struct bnxt_tx_ring_info {
dma_addr_t tx_push_mapping;
__le64 data_mapping;
+ void *tx_inline_buf;
+ dma_addr_t tx_inline_dma;
+ unsigned int tx_inline_size;
+ u16 tx_inline_prod;
+ u16 tx_inline_cons;
+
#define BNXT_DEV_STATE_CLOSING 0x1
u32 dev_state;
@@ -2836,6 +2849,24 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp,
return bp->tx_ring_size - (used & bp->tx_ring_mask);
}
+static inline struct tx_bd_ext *
+bnxt_init_ext_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
+ u16 prod, __le32 lflags, u32 vlan_tag_flags,
+ u32 cfa_action)
+{
+ struct tx_bd_ext *txbd1;
+
+ txbd1 = (struct tx_bd_ext *)
+ &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+ txbd1->tx_bd_hsize_lflags = lflags;
+ txbd1->tx_bd_mss = 0;
+ txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags);
+ txbd1->tx_bd_cfa_action =
+ cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT);
+
+ return txbd1;
+}
+
static inline void bnxt_writeq(struct bnxt *bp, u64 val,
volatile void __iomem *addr)
{
@@ -2969,6 +3000,7 @@ unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp);
int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init);
void bnxt_tx_disable(struct bnxt *bp);
void bnxt_tx_enable(struct bnxt *bp);
+u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb);
void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
u16 curr);
void bnxt_report_link(struct bnxt *bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 6826bf762d26..9ded88196bb4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -33,6 +33,7 @@
#include "bnxt_xdp.h"
#include "bnxt_ptp.h"
#include "bnxt_ethtool.h"
+#include "bnxt_gso.h"
#include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */
#include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */
#include "bnxt_coredump.h"
@@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev,
u8 tcp_data_split = kernel_ering->tcp_data_split;
struct bnxt *bp = netdev_priv(dev);
u8 hds_config_mod;
+ int rc;
if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) ||
(ering->tx_pending > BNXT_MAX_TX_DESC_CNT) ||
(ering->tx_pending < BNXT_MIN_TX_DESC_CNT))
return -EINVAL;
+ if ((dev->features & NETIF_F_GSO_UDP_L4) &&
+ !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS)
+ return -EINVAL;
+
hds_config_mod = tcp_data_split != dev->cfg->hds_config;
if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod)
return -EINVAL;
@@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev,
bp->tx_ring_size = ering->tx_pending;
bnxt_set_ring_params(bp);
- if (netif_running(dev))
- return bnxt_open_nic(bp, false, false);
+ if (netif_running(dev)) {
+ rc = bnxt_open_nic(bp, false, false);
+ if (rc)
+ return rc;
+ }
+	/* Ring size changes may affect features (SW USO requires a minimum
+	 * ring size), so recalculate them to ensure the right feature set
+	 * is blocked or advertised.
+ */
+ netdev_update_features(dev);
return 0;
}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
new file mode 100644
index 000000000000..f317f60414e8
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/netdev_queues.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/tso.h>
+#include <linux/bnxt/hsi.h>
+
+#include "bnxt.h"
+#include "bnxt_gso.h"
+
+static u32 bnxt_sw_gso_lhint(unsigned int len)
+{
+ if (len <= 512)
+ return TX_BD_FLAGS_LHINT_512_AND_SMALLER;
+ else if (len <= 1023)
+ return TX_BD_FLAGS_LHINT_512_TO_1023;
+ else if (len <= 2047)
+ return TX_BD_FLAGS_LHINT_1024_TO_2047;
+ else
+ return TX_BD_FLAGS_LHINT_2048_AND_LARGER;
+}
+
+netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
+ struct bnxt_tx_ring_info *txr,
+ struct netdev_queue *txq,
+ struct sk_buff *skb)
+{
+ unsigned int last_unmap_len __maybe_unused = 0;
+ dma_addr_t last_unmap_addr __maybe_unused = 0;
+ struct bnxt_sw_tx_bd *last_unmap_buf = NULL;
+ unsigned int hdr_len, mss, num_segs;
+ struct pci_dev *pdev = bp->pdev;
+ unsigned int total_payload;
+ struct tso_dma_map map;
+ u32 vlan_tag_flags = 0;
+ int i, bds_needed;
+ struct tso_t tso;
+ u16 cfa_action;
+ __le32 csum;
+ u16 prod;
+
+ hdr_len = tso_start(skb, &tso);
+ mss = skb_shinfo(skb)->gso_size;
+ total_payload = skb->len - hdr_len;
+ num_segs = DIV_ROUND_UP(total_payload, mss);
+
+ if (unlikely(num_segs <= 1))
+ goto drop;
+
+ /* Upper bound on the number of descriptors needed.
+ *
+ * Each segment uses 1 long BD + 1 ext BD, plus payload BDs; across
+ * all segments the payload BDs number at most num_segs + nr_frags
+ * (each frag boundary crossing adds at most 1 extra BD).
+ */
+ bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1;
+
+ if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) {
+ netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+ bp->tx_wake_thresh);
+ return NETDEV_TX_BUSY;
+ }
+
+ /* BD backpressure alone cannot prevent overwriting in-flight
+ * headers in the inline buffer. Check slot availability directly.
+ */
+ if (!netif_txq_maybe_stop(txq, bnxt_inline_avail(txr),
+ num_segs, num_segs))
+ return NETDEV_TX_BUSY;
+
+ if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len)))
+ goto drop;
+
+ cfa_action = bnxt_xmit_get_cfa_action(skb);
+ if (skb_vlan_tag_present(skb)) {
+ vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
+ skb_vlan_tag_get(skb);
+ if (skb->vlan_proto == htons(ETH_P_8021Q))
+ vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
+ }
+
+ csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM);
+ if (!tso.ipv6)
+ csum |= cpu_to_le32(TX_BD_FLAGS_IP_CKSUM);
+
+ prod = txr->tx_prod;
+
+ for (i = 0; i < num_segs; i++) {
+ unsigned int seg_payload = min_t(unsigned int, mss,
+ total_payload - i * mss);
+ u16 slot = (txr->tx_inline_prod + i) &
+ (BNXT_SW_USO_MAX_SEGS - 1);
+ struct bnxt_sw_tx_bd *tx_buf;
+ unsigned int mapping_len;
+ dma_addr_t this_hdr_dma;
+ unsigned int chunk_len;
+ unsigned int offset;
+ dma_addr_t dma_addr;
+ struct tx_bd *txbd;
+ struct udphdr *uh;
+ void *this_hdr;
+ int bd_count;
+ bool last;
+ u32 flags;
+
+ last = (i == num_segs - 1);
+ offset = slot * TSO_HEADER_SIZE;
+ this_hdr = txr->tx_inline_buf + offset;
+ this_hdr_dma = txr->tx_inline_dma + offset;
+
+ tso_build_hdr(skb, this_hdr, &tso, seg_payload, last);
+
+ /* Zero stale csum fields copied from the original skb;
+ * HW offload recomputes from scratch.
+ */
+ uh = this_hdr + skb_transport_offset(skb);
+ uh->check = 0;
+ if (!tso.ipv6) {
+ struct iphdr *iph = this_hdr + skb_network_offset(skb);
+
+ iph->check = 0;
+ }
+
+ dma_sync_single_for_device(&pdev->dev, this_hdr_dma,
+ hdr_len, DMA_TO_DEVICE);
+
+ bd_count = tso_dma_map_count(&map, seg_payload);
+
+ tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
+ txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+
+ tx_buf->skb = skb;
+ tx_buf->nr_frags = bd_count;
+ tx_buf->is_push = 0;
+ tx_buf->is_ts_pkt = 0;
+
+ dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma);
+ dma_unmap_len_set(tx_buf, len, 0);
+
+ if (last) {
+ tx_buf->is_sw_gso = BNXT_SW_GSO_LAST;
+ tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate);
+ } else {
+ tx_buf->is_sw_gso = BNXT_SW_GSO_MID;
+ }
+
+ flags = (hdr_len << TX_BD_LEN_SHIFT) |
+ TX_BD_TYPE_LONG_TX_BD |
+ TX_BD_CNT(2 + bd_count);
+
+ flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload);
+
+ txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
+ txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma);
+ txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod,
+ 2 + bd_count);
+
+ prod = NEXT_TX(prod);
+ bnxt_init_ext_bd(bp, txr, prod, csum,
+ vlan_tag_flags, cfa_action);
+
+		/* Set dma_unmap_len only on the LAST BD touching each
+		 * mapped region. Completions are in-order, so that BD
+		 * completes after all earlier BDs sharing the region,
+		 * making the unmap safe.
+ */
+ while (tso_dma_map_next(&map, &dma_addr, &chunk_len,
+ &mapping_len, seg_payload)) {
+ prod = NEXT_TX(prod);
+ txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+ tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
+
+ txbd->tx_bd_haddr = cpu_to_le64(dma_addr);
+ dma_unmap_addr_set(tx_buf, mapping, dma_addr);
+ dma_unmap_len_set(tx_buf, len, 0);
+ tx_buf->skb = NULL;
+ tx_buf->is_sw_gso = 0;
+
+ if (mapping_len) {
+ if (last_unmap_buf) {
+ dma_unmap_addr_set(last_unmap_buf,
+ mapping,
+ last_unmap_addr);
+ dma_unmap_len_set(last_unmap_buf,
+ len,
+ last_unmap_len);
+ }
+ last_unmap_addr = dma_addr;
+ last_unmap_len = mapping_len;
+ }
+ last_unmap_buf = tx_buf;
+
+ flags = chunk_len << TX_BD_LEN_SHIFT;
+ txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
+ txbd->tx_bd_opaque = 0;
+
+ seg_payload -= chunk_len;
+ }
+
+ txbd->tx_bd_len_flags_type |=
+ cpu_to_le32(TX_BD_FLAGS_PACKET_END);
+
+ prod = NEXT_TX(prod);
+ }
+
+ if (last_unmap_buf) {
+ dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr);
+ dma_unmap_len_set(last_unmap_buf, len, last_unmap_len);
+ }
+
+ txr->tx_inline_prod += num_segs;
+
+ netdev_tx_sent_queue(txq, skb->len);
+
+ WRITE_ONCE(txr->tx_prod, prod);
+ /* Sync BDs before doorbell */
+ wmb();
+ bnxt_db_write(bp, &txr->tx_db, prod);
+
+ if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh))
+ netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+ bp->tx_wake_thresh);
+
+ return NETDEV_TX_OK;
+
+drop:
+ dev_kfree_skb_any(skb);
+ dev_core_stats_tx_dropped_inc(bp->dev);
+ return NETDEV_TX_OK;
+}
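
For a concrete feel of the budgeting in bnxt_sw_udp_gso_xmit(), the sketch below reproduces the segment-split and worst-case BD arithmetic in plain user-space C (the mss, payload, and frag counts are illustrative, not taken from the patch):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned int mss = 1400, total_payload = 1400 * 10 + 500;
	unsigned int nr_frags = 3;
	unsigned int num_segs = (total_payload + mss - 1) / mss;   /* 11 */
	/* 1 long BD + 1 ext BD per segment, plus at most
	 * num_segs + nr_frags payload BDs, plus 1 of slack.
	 */
	unsigned int bds_needed = 3 * num_segs + nr_frags + 1;     /* 37 */

	for (unsigned int i = 0; i < num_segs; i++) {
		unsigned int seg = MIN(mss, total_payload - i * mss);

		printf("seg %2u: %u bytes\n", i, seg);  /* last one: 500 */
	}
	printf("bds_needed = %u\n", bds_needed);
	return 0;
}
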
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h
new file mode 100644
index 000000000000..47528c20f311
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Broadcom NetXtreme-C/E network driver.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef BNXT_GSO_H
+#define BNXT_GSO_H
+
+/* Maximum segments the stack may send in a single SW USO skb.
+ * This caps gso_max_segs for NICs without HW USO support.
+ */
+#define BNXT_SW_USO_MAX_SEGS 64
+
+/* Worst-case TX descriptors consumed by one SW USO packet:
+ * Each segment: 1 long BD + 1 ext BD + payload BDs.
+ * Total payload BDs across all segs <= num_segs + nr_frags (each frag
+ * boundary crossing adds at most 1 extra BD).
+ * So: 3 * max_segs + MAX_SKB_FRAGS + 1 = 3 * 64 + 17 + 1 = 210.
+ */
+#define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1)
+
+static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr)
+{
+ return BNXT_SW_USO_MAX_SEGS -
+ (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons));
+}
+
+static inline int bnxt_min_tx_desc_cnt(struct bnxt *bp,
+ netdev_features_t features)
+{
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ (features & NETIF_F_GSO_UDP_L4))
+ return BNXT_SW_USO_MAX_DESCS;
+ return BNXT_MIN_TX_DESC_CNT;
+}
+
+netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
+ struct bnxt_tx_ring_info *txr,
+ struct netdev_queue *txq,
+ struct sk_buff *skb);
+
+#endif
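
bnxt_inline_avail() above leans on unsigned 16-bit wraparound: (u16)(tx_inline_prod - tx_inline_cons) counts in-flight slots correctly even after either counter wraps, as long as no more than BNXT_SW_USO_MAX_SEGS slots are ever outstanding. A small stand-alone check of that invariant:

#include <stdio.h>
#include <stdint.h>

#define MAX_SEGS 64   /* mirrors BNXT_SW_USO_MAX_SEGS */

static uint16_t inline_avail(uint16_t prod, uint16_t cons)
{
	return MAX_SEGS - (uint16_t)(prod - cons);
}

int main(void)
{
	/* counters near the u16 wrap point: 65530 produced, 65500 consumed */
	uint16_t prod = 65530, cons = 65500;

	printf("in flight: %u\n", (uint16_t)(prod - cons));   /* 30 */
	printf("available: %u\n", inline_avail(prod, cons));  /* 34 */

	prod += 40;   /* wraps past 65535 */
	cons += 40;
	printf("after wrap, available: %u\n", inline_avail(prod, cons)); /* 34 */
	return 0;
}
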
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 26fe18bcfad8..2bcf78a4de7b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3764,6 +3764,17 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag)
}
/**
+ * skb_frag_phys - gets the physical address of the data in a paged fragment
+ * @frag: the paged fragment buffer
+ *
+ * Returns: the physical address of the data within @frag.
+ */
+static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag)
+{
+ return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag);
+}
+
+/**
* skb_frag_page_copy() - sets the page in a fragment from another fragment
* @fragto: skb fragment where page is set
* @fragfrom: skb fragment page is copied from
diff --git a/include/net/tso.h b/include/net/tso.h
index e7e157ae0526..da82aabd1d48 100644
--- a/include/net/tso.h
+++ b/include/net/tso.h
@@ -3,6 +3,7 @@
#define _TSO_H
#include <linux/skbuff.h>
+#include <linux/dma-mapping.h>
#include <net/ip.h>
#define TSO_HEADER_SIZE 256
@@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size);
int tso_start(struct sk_buff *skb, struct tso_t *tso);
+/**
+ * struct tso_dma_map - DMA mapping state for GSO payload
+ * @dev: device used for DMA mapping
+ * @skb: the GSO skb being mapped
+ * @hdr_len: per-segment header length
+ * @iova_state: DMA IOVA state (when IOMMU available)
+ * @iova_offset: global byte offset into IOVA range (IOVA path only)
+ * @total_len: total payload length
+ * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag)
+ * @offset: byte offset within current region
+ * @linear_dma: DMA address of the linear payload
+ * @linear_len: length of the linear payload
+ * @nr_frags: number of frags successfully DMA-mapped
+ * @frags: per-frag DMA address and length
+ *
+ * DMA-maps the payload regions of a GSO skb (linear data + frags).
+ * Prefers the DMA IOVA API for a single contiguous mapping with one
+ * IOTLB sync; falls back to per-region dma_map_phys() otherwise.
+ */
+struct tso_dma_map {
+ struct device *dev;
+ const struct sk_buff *skb;
+ unsigned int hdr_len;
+ /* IOVA path */
+ struct dma_iova_state iova_state;
+ size_t iova_offset;
+ size_t total_len;
+ /* Fallback path if IOVA path fails */
+ int frag_idx;
+ unsigned int offset;
+ dma_addr_t linear_dma;
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ struct {
+ dma_addr_t dma;
+ unsigned int len;
+ } frags[MAX_SKB_FRAGS];
+};
+
+/**
+ * struct tso_dma_map_completion_state - Completion-time cleanup state
+ * @iova_state: DMA IOVA state (when IOMMU available)
+ * @total_len: total payload length of the IOVA mapping
+ *
+ * Drivers store this on their SW ring at xmit time via
+ * tso_dma_map_completion_save(), then call tso_dma_map_complete() at
+ * completion time.
+ */
+struct tso_dma_map_completion_state {
+ struct dma_iova_state iova_state;
+ size_t total_len;
+};
+
+int tso_dma_map_init(struct tso_dma_map *map, struct device *dev,
+ const struct sk_buff *skb, unsigned int hdr_len);
+void tso_dma_map_cleanup(struct tso_dma_map *map);
+unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len);
+bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr,
+ unsigned int *chunk_len, unsigned int *mapping_len,
+ unsigned int seg_remaining);
+
+/**
+ * tso_dma_map_completion_save - save state needed for completion-time cleanup
+ * @map: the xmit-time DMA map
+ * @cstate: driver-owned storage that persists until completion
+ *
+ * Call this at xmit time to capture the cleanup state; @cstate is
+ * later passed to tso_dma_map_complete().
+ */
+static inline void
+tso_dma_map_completion_save(const struct tso_dma_map *map,
+ struct tso_dma_map_completion_state *cstate)
+{
+ cstate->iova_state = map->iova_state;
+ cstate->total_len = map->total_len;
+}
+
+/**
+ * tso_dma_map_complete - tear down mapping at completion time
+ * @dev: the device that owns the mapping
+ * @cstate: state saved by tso_dma_map_completion_save()
+ *
+ * Return: true if the IOVA path was used and the mapping has been
+ * destroyed; false if the fallback per-region path was used and the
+ * driver must unmap via its normal completion path.
+ */
+static inline bool
+tso_dma_map_complete(struct device *dev,
+ struct tso_dma_map_completion_state *cstate)
+{
+ if (dma_use_iova(&cstate->iova_state)) {
+ dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len,
+ DMA_TO_DEVICE, 0);
+ return true;
+ }
+
+ return false;
+}
+
#endif /* _TSO_H */
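
Putting the new helpers together, a driver's xmit path would follow roughly the shape below. This is a sketch only: all my_*() functions are hypothetical driver hooks, not real kernel APIs, and error handling is abbreviated.

#include <net/tso.h>

/* Hypothetical driver hooks, declared only so the sketch is complete. */
static char *my_next_header_slot(void);
static void my_post_header_bd(void *hdr, unsigned int hdr_len,
			      unsigned int total_bds);
static void my_post_payload_bd(dma_addr_t addr, unsigned int chunk_len,
			       unsigned int mapping_len);
static struct tso_dma_map_completion_state *my_completion_slot(void);

static netdev_tx_t my_uso_xmit(struct device *dev, struct sk_buff *skb)
{
	unsigned int hdr_len, mss, payload;
	struct tso_dma_map map;
	struct tso_t tso;

	hdr_len = tso_start(skb, &tso);
	mss = skb_shinfo(skb)->gso_size;
	payload = skb->len - hdr_len;

	if (tso_dma_map_init(&map, dev, skb, hdr_len)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	while (payload) {
		unsigned int seg = min(mss, payload);
		unsigned int chunk, maplen;
		dma_addr_t addr;
		char *hdr;

		payload -= seg;

		/* one fresh header copy per segment */
		hdr = my_next_header_slot();
		tso_build_hdr(skb, hdr, &tso, seg, payload == 0);
		my_post_header_bd(hdr, hdr_len,
				  2 + tso_dma_map_count(&map, seg));

		/* payload chunks for this segment */
		while (tso_dma_map_next(&map, &addr, &chunk, &maplen, seg)) {
			my_post_payload_bd(addr, chunk, maplen);
			seg -= chunk;
		}
	}

	/* stash IOVA state; TX completion calls tso_dma_map_complete() */
	tso_dma_map_completion_save(&map, my_completion_slot());
	return NETDEV_TX_OK;
}
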
diff --git a/net/core/tso.c b/net/core/tso.c
index 6df997b9076e..347b3856ddb9 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -3,6 +3,7 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/tso.h>
+#include <linux/dma-mapping.h>
#include <linux/unaligned.h>
void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
@@ -87,3 +88,271 @@ int tso_start(struct sk_buff *skb, struct tso_t *tso)
return hdr_len;
}
EXPORT_SYMBOL(tso_start);
+
+static int tso_dma_iova_try(struct device *dev, struct tso_dma_map *map,
+ phys_addr_t phys, size_t linear_len,
+ size_t total_len, size_t *offset)
+{
+ const struct sk_buff *skb;
+ unsigned int nr_frags;
+ int i;
+
+ if (!dma_iova_try_alloc(dev, &map->iova_state, phys, total_len))
+ return 1;
+
+ skb = map->skb;
+ nr_frags = skb_shinfo(skb)->nr_frags;
+
+ if (linear_len) {
+ if (dma_iova_link(dev, &map->iova_state,
+ phys, *offset, linear_len,
+ DMA_TO_DEVICE, 0))
+ goto iova_fail;
+ map->linear_len = linear_len;
+ *offset += linear_len;
+ }
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ unsigned int frag_len = skb_frag_size(frag);
+
+ if (dma_iova_link(dev, &map->iova_state,
+ skb_frag_phys(frag), *offset,
+ frag_len, DMA_TO_DEVICE, 0)) {
+ map->nr_frags = i;
+ goto iova_fail;
+ }
+ map->frags[i].len = frag_len;
+ *offset += frag_len;
+ map->nr_frags = i + 1;
+ }
+
+ if (dma_iova_sync(dev, &map->iova_state, 0, total_len))
+ goto iova_fail;
+
+ return 0;
+
+iova_fail:
+ dma_iova_destroy(dev, &map->iova_state, *offset,
+ DMA_TO_DEVICE, 0);
+ memset(&map->iova_state, 0, sizeof(map->iova_state));
+
+ /* reset map state */
+ map->frag_idx = -1;
+ map->offset = 0;
+ map->linear_len = 0;
+ map->nr_frags = 0;
+
+ return 1;
+}
+
+/**
+ * tso_dma_map_init - DMA-map GSO payload regions
+ * @map: map struct to initialize
+ * @dev: device for DMA mapping
+ * @skb: the GSO skb
+ * @hdr_len: per-segment header length in bytes
+ *
+ * DMA-maps the linear payload (after headers) and all frags.
+ * Prefers the DMA IOVA API (one contiguous mapping, one IOTLB sync);
+ * falls back to per-region dma_map_phys() when IOVA is not available.
+ * Positions the iterator at byte 0 of the payload.
+ *
+ * Return: 0 on success, -ENOMEM on DMA mapping failure (partial mappings
+ * are cleaned up internally).
+ */
+int tso_dma_map_init(struct tso_dma_map *map, struct device *dev,
+ const struct sk_buff *skb, unsigned int hdr_len)
+{
+ unsigned int linear_len = skb_headlen(skb) - hdr_len;
+ unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
+ size_t total_len = skb->len - hdr_len;
+ size_t offset = 0;
+ phys_addr_t phys;
+ int i;
+
+ map->dev = dev;
+ map->skb = skb;
+ map->hdr_len = hdr_len;
+ map->frag_idx = -1;
+ map->offset = 0;
+ map->iova_offset = 0;
+ map->total_len = total_len;
+ map->linear_len = 0;
+ map->nr_frags = 0;
+ memset(&map->iova_state, 0, sizeof(map->iova_state));
+
+ if (!total_len)
+ return 0;
+
+ if (linear_len)
+ phys = virt_to_phys(skb->data + hdr_len);
+ else
+ phys = skb_frag_phys(&skb_shinfo(skb)->frags[0]);
+
+ if (tso_dma_iova_try(dev, map, phys, linear_len, total_len, &offset)) {
+		/* IOVA path failed and the map state was reset; fall
+		 * back to per-region dma_map_phys().
+		 */
+ if (linear_len) {
+ map->linear_dma = dma_map_phys(dev, phys, linear_len,
+ DMA_TO_DEVICE, 0);
+ if (dma_mapping_error(dev, map->linear_dma))
+ return -ENOMEM;
+ map->linear_len = linear_len;
+ }
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ unsigned int frag_len = skb_frag_size(frag);
+
+ map->frags[i].len = frag_len;
+ map->frags[i].dma = dma_map_phys(dev, skb_frag_phys(frag),
+ frag_len, DMA_TO_DEVICE, 0);
+ if (dma_mapping_error(dev, map->frags[i].dma)) {
+ tso_dma_map_cleanup(map);
+ return -ENOMEM;
+ }
+ map->nr_frags = i + 1;
+ }
+ }
+
+ if (linear_len == 0 && nr_frags > 0)
+ map->frag_idx = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL(tso_dma_map_init);
+
+/**
+ * tso_dma_map_cleanup - unmap all DMA regions in a tso_dma_map
+ * @map: the map to clean up
+ *
+ * Handles both IOVA and fallback paths. For IOVA, calls
+ * dma_iova_destroy(). For fallback, unmaps each region individually.
+ */
+void tso_dma_map_cleanup(struct tso_dma_map *map)
+{
+ int i;
+
+ if (dma_use_iova(&map->iova_state)) {
+ dma_iova_destroy(map->dev, &map->iova_state, map->total_len,
+ DMA_TO_DEVICE, 0);
+ memset(&map->iova_state, 0, sizeof(map->iova_state));
+ } else {
+ if (map->linear_len)
+ dma_unmap_phys(map->dev, map->linear_dma,
+ map->linear_len, DMA_TO_DEVICE, 0);
+
+ for (i = 0; i < map->nr_frags; i++)
+ dma_unmap_phys(map->dev, map->frags[i].dma,
+ map->frags[i].len, DMA_TO_DEVICE, 0);
+ }
+
+ map->linear_len = 0;
+ map->nr_frags = 0;
+}
+EXPORT_SYMBOL(tso_dma_map_cleanup);
+
+/**
+ * tso_dma_map_count - count descriptors for a payload range
+ * @map: the payload map
+ * @len: number of payload bytes in this segment
+ *
+ * Counts how many contiguous DMA region chunks the next @len bytes
+ * will span, without advancing the iterator. On the IOVA path this
+ * is always 1 (contiguous). On the fallback path, uses region sizes
+ * from the current position.
+ *
+ * Return: the number of descriptors needed for @len bytes of payload.
+ */
+unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len)
+{
+ unsigned int offset = map->offset;
+ int idx = map->frag_idx;
+ unsigned int count = 0;
+
+ if (!len)
+ return 0;
+
+ if (dma_use_iova(&map->iova_state))
+ return 1;
+
+ while (len > 0) {
+ unsigned int region_len, chunk;
+
+ if (idx == -1)
+ region_len = map->linear_len;
+ else
+ region_len = map->frags[idx].len;
+
+ chunk = min(len, region_len - offset);
+ len -= chunk;
+ count++;
+ offset = 0;
+ idx++;
+ }
+
+ return count;
+}
+EXPORT_SYMBOL(tso_dma_map_count);
+
+/**
+ * tso_dma_map_next - yield the next DMA address range
+ * @map: the payload map
+ * @addr: output DMA address
+ * @chunk_len: output chunk length
+ * @mapping_len: full DMA mapping length when this chunk starts a new
+ * mapping region, or 0 when continuing a previous one.
+ * On the IOVA path this is always 0 (driver must not
+ *               do per-region unmaps; use tso_dma_map_cleanup() instead).
+ * @seg_remaining: bytes left in current segment
+ *
+ * Yields the next (dma_addr, chunk_len) pair and advances the iterator.
+ * On the IOVA path, the entire payload is contiguous so each segment
+ * is always a single chunk.
+ *
+ * Return: true if a chunk was yielded, false when @seg_remaining is 0.
+ */
+bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr,
+ unsigned int *chunk_len, unsigned int *mapping_len,
+ unsigned int seg_remaining)
+{
+ unsigned int region_len, chunk;
+
+ if (!seg_remaining)
+ return false;
+
+ /* IOVA path: contiguous DMA range, no region boundaries */
+ if (dma_use_iova(&map->iova_state)) {
+ *addr = map->iova_state.addr + map->iova_offset;
+ *chunk_len = seg_remaining;
+ *mapping_len = 0;
+ map->iova_offset += seg_remaining;
+ return true;
+ }
+
+ /* Fallback path: per-region iteration */
+
+ if (map->frag_idx == -1) {
+ region_len = map->linear_len;
+ chunk = min(seg_remaining, region_len - map->offset);
+ *addr = map->linear_dma + map->offset;
+ } else {
+ region_len = map->frags[map->frag_idx].len;
+ chunk = min(seg_remaining, region_len - map->offset);
+ *addr = map->frags[map->frag_idx].dma + map->offset;
+ }
+
+ *mapping_len = (map->offset == 0) ? region_len : 0;
+ *chunk_len = chunk;
+ map->offset += chunk;
+
+ if (map->offset >= region_len) {
+ map->frag_idx++;
+ map->offset = 0;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(tso_dma_map_next);
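
To see the fallback iterator's region walking in isolation, here is a stand-alone model of the tso_dma_map_next() bookkeeping. The region sizes and mss are made up; the real code keys the same logic off map->linear_len and map->frags[].

/* Model: one 1000B linear payload region followed by one 2000B frag. */
#include <stdio.h>

static unsigned int regions[] = { 1000, 2000 };
static unsigned int idx, offset;

static unsigned int next_chunk(unsigned int seg_remaining,
			       unsigned int *mapping_len)
{
	unsigned int region_len = regions[idx];
	unsigned int left = region_len - offset;
	unsigned int chunk = seg_remaining < left ? seg_remaining : left;

	*mapping_len = offset == 0 ? region_len : 0;  /* new region starts */
	offset += chunk;
	if (offset >= region_len) {   /* idx may run past the last region */
		idx++;
		offset = 0;
	}
	return chunk;
}

int main(void)
{
	unsigned int mss = 1400, payload = 3000, segno = 0;

	while (payload) {
		unsigned int seg = payload < mss ? payload : mss;

		payload -= seg;
		printf("seg %u:", segno++);
		while (seg) {
			unsigned int maplen, chunk = next_chunk(seg, &maplen);

			printf(" chunk=%u(map=%u)", chunk, maplen);
			seg -= chunk;
		}
		printf("\n");
	}
	/* prints: seg 0: chunk=1000(map=1000) chunk=400(map=2000)
	 *         seg 1: chunk=1400(map=0)
	 *         seg 2: chunk=200(map=0)
	 */
	return 0;
}
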
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index d84e955ac87b..85ca4d1ecf9e 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -45,6 +45,7 @@ TEST_PROGS = \
rss_input_xfrm.py \
toeplitz.py \
tso.py \
+ uso.py \
xdp_metadata.py \
xsk_reconfig.py \
#
diff --git a/tools/testing/selftests/drivers/net/hw/uso.py b/tools/testing/selftests/drivers/net/hw/uso.py
new file mode 100755
index 000000000000..6d61e56cab3c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/uso.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""Test USO
+
+Sends large UDP datagrams with UDP_SEGMENT and verifies that the peer
+receives the expected total payload and that the NIC transmitted at least
+the expected number of segments.
+"""
+import random
+import socket
+import string
+
+from lib.py import ksft_run, ksft_exit, KsftSkipEx
+from lib.py import ksft_eq, ksft_ge, ksft_variants, KsftNamedVariant
+from lib.py import NetDrvEpEnv
+from lib.py import bkg, defer, ethtool, ip, rand_port, wait_port_listen
+
+# Python doesn't expose this constant, so hardcode it to enable UDP
+# segmentation for large payloads.
+UDP_SEGMENT = 103
+
+
+def _send_uso(cfg, ipver, mss, total_payload, port):
+ if ipver == "4":
+ sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ dst = (cfg.remote_addr_v["4"], port)
+ else:
+ sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+ dst = (cfg.remote_addr_v["6"], port)
+
+ sock.setsockopt(socket.IPPROTO_UDP, UDP_SEGMENT, mss)
+ payload = ''.join(random.choice(string.ascii_lowercase)
+ for _ in range(total_payload))
+ sock.sendto(payload.encode(), dst)
+ sock.close()
+
+
+def _get_tx_packets(cfg):
+ stats = ip(f"-s link show dev {cfg.ifname}", json=True)[0]
+ return stats['stats64']['tx']['packets']
+
+
+def _test_uso(cfg, ipver, mss, total_payload):
+ cfg.require_ipver(ipver)
+ cfg.require_cmd("socat", remote=True)
+
+ features = ethtool(f"-k {cfg.ifname}", json=True)
+ uso_was_on = features[0]["tx-udp-segmentation"]["active"]
+
+ try:
+ ethtool(f"-K {cfg.ifname} tx-udp-segmentation on")
+ except Exception as exc:
+ raise KsftSkipEx(
+ "Device does not support tx-udp-segmentation") from exc
+ if not uso_was_on:
+ defer(ethtool, f"-K {cfg.ifname} tx-udp-segmentation off")
+
+ expected_segs = (total_payload + mss - 1) // mss
+
+ port = rand_port(stype=socket.SOCK_DGRAM)
+ rx_cmd = f"socat -{ipver} -T 2 -u UDP-LISTEN:{port},reuseport STDOUT"
+
+ tx_before = _get_tx_packets(cfg)
+
+ with bkg(rx_cmd, host=cfg.remote, exit_wait=True) as rx:
+ wait_port_listen(port, proto="udp", host=cfg.remote)
+ _send_uso(cfg, ipver, mss, total_payload, port)
+
+ ksft_eq(len(rx.stdout), total_payload,
+ comment=f"Received {len(rx.stdout)}B, expected {total_payload}B")
+
+ cfg.wait_hw_stats_settle()
+
+ tx_after = _get_tx_packets(cfg)
+ tx_delta = tx_after - tx_before
+
+ ksft_ge(tx_delta, expected_segs,
+ comment=f"Expected >= {expected_segs} tx packets, got {tx_delta}")
+
+
+def _uso_variants():
+ for ipver in ["4", "6"]:
+ yield KsftNamedVariant(f"v{ipver}_partial", ipver, 1400, 1400 * 10 + 500)
+ yield KsftNamedVariant(f"v{ipver}_exact", ipver, 1400, 1400 * 5)
+
+
+@ksft_variants(_uso_variants())
+def test_uso(cfg, ipver, mss, total_payload):
+ """Send a USO datagram and verify the peer receives the expected segments."""
+ _test_uso(cfg, ipver, mss, total_payload)
+
+
+def main() -> None:
+ """Run USO tests."""
+ with NetDrvEpEnv(__file__) as cfg:
+ ksft_run([test_uso],
+                 args=(cfg,))
+ ksft_exit()
+
+
+if __name__ == "__main__":
+ main()
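
For reference outside the test harness, the UDP_SEGMENT option that uso.py hardcodes looks like this from C; a minimal sender might be written as below (the peer address and port are placeholders, not values from the test):

/* Minimal USO sender: one 14000B datagram split into 1400B segments
 * by the kernel (and offloaded when the NIC supports it).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103   /* from linux/udp.h on older toolchains */
#endif

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(9000),          /* assumed port */
	};
	char payload[1400 * 10];
	int mss = 1400;
	int fd;

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);   /* assumed peer */
	memset(payload, 'a', sizeof(payload));

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
				 &mss, sizeof(mss)) < 0) {
		perror("socket/setsockopt");
		return 1;
	}
	if (sendto(fd, payload, sizeof(payload), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}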