summaryrefslogtreecommitdiff
path: root/net/xdp/xsk_buff_pool.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 16:20:58 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 16:20:58 +0300
commitfc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4 (patch)
tree93b16bc48fdc3be4a1adccbf4c7de92a5e8440e1 /net/xdp/xsk_buff_pool.c
parentbfc484fe6abba4b89ec9330e0e68778e2a9856b2 (diff)
parent84882cf72cd774cf16fd338bdbf00f69ac9f9194 (diff)
downloadlinux-fc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4.tar.xz
Merge tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Remove socket skb caches - Add a SO_RESERVE_MEM socket op to forward allocate buffer space and avoid memory accounting overhead on each message sent - Introduce managed neighbor entries - added by control plane and resolved by the kernel for use in acceleration paths (BPF / XDP right now, HW offload users will benefit as well) - Make neighbor eviction on link down controllable by userspace to work around WiFi networks with bad roaming implementations - vrf: Rework interaction with netfilter/conntrack - fq_codel: implement L4S style ce_threshold_ect1 marking - sch: Eliminate unnecessary RCU waits in mini_qdisc_pair_swap() BPF: - Add support for new btf kind BTF_KIND_TAG, arbitrary type tagging as implemented in LLVM14 - Introduce bpf_get_branch_snapshot() to capture Last Branch Records - Implement variadic trace_printk helper - Add a new Bloomfilter map type - Track <8-byte scalar spill and refill - Access hw timestamp through BPF's __sk_buff - Disallow unprivileged BPF by default - Document BPF licensing Netfilter: - Introduce egress hook for looking at raw outgoing packets - Allow matching on and modifying inner headers / payload data - Add NFT_META_IFTYPE to match on the interface type either from ingress or egress Protocols: - Multi-Path TCP: - increase default max additional subflows to 2 - rework forward memory allocation - add getsockopts: MPTCP_INFO, MPTCP_TCPINFO, MPTCP_SUBFLOW_ADDRS - MCTP flow support allowing lower layer drivers to configure msg muxing as needed - Automatic Multicast Tunneling (AMT) driver based on RFC7450 - HSR support the redbox supervision frames (IEC-62439-3:2018) - Support for the ip6ip6 encapsulation of IOAM - Netlink interface for CAN-FD's Transmitter Delay Compensation - Support SMC-Rv2 eliminating the current same-subnet restriction, by exploiting the UDP encapsulation feature of RoCE adapters - TLS: add SM4 GCM/CCM crypto support - Bluetooth: initial support for link quality and audio/codec offload Driver APIs: - Add a batched interface for RX buffer allocation in AF_XDP buffer pool - ethtool: Add ability to control transceiver modules' power mode - phy: Introduce supported interfaces bitmap to express MAC capabilities and simplify PHY code - Drop rtnl_lock from DSA .port_fdb_{add,del} callbacks New drivers: - WiFi driver for Realtek 8852AE 802.11ax devices (rtw89) - Ethernet driver for ASIX AX88796C SPI device (x88796c) Drivers: - Broadcom PHYs - support 72165, 7712 16nm PHYs - support IDDQ-SR for additional power savings - PHY support for QCA8081, QCA9561 PHYs - NXP DPAA2: support for IRQ coalescing - NXP Ethernet (enetc): support for software TCP segmentation - Renesas Ethernet (ravb) - support DMAC and EMAC blocks of Gigabit-capable IP found on RZ/G2L SoC - Intel 100G Ethernet - support for eswitch offload of TC/OvS flow API, including offload of GRE, VxLAN, Geneve tunneling - support application device queues - ability to assign Rx and Tx queues to application threads - PTP and PPS (pulse-per-second) extensions - Broadcom Ethernet (bnxt) - devlink health reporting and device reload extensions - Mellanox Ethernet (mlx5) - offload macvlan interfaces - support HW offload of TC rules involving OVS internal ports - support HW-GRO and header/data split - support application device queues - Marvell OcteonTx2: - add XDP support for PF - add PTP support for VF - Qualcomm Ethernet switch (qca8k): support for QCA8328 - Realtek Ethernet DSA switch (rtl8366rb) - support bridge offload - support STP, fast aging, disabling address learning - support for Realtek RTL8365MB-VC, a 4+1 port 10M/100M/1GE switch - Mellanox Ethernet/IB switch (mlxsw) - multi-level qdisc hierarchy offload (e.g. RED, prio and shaping) - offload root TBF qdisc as port shaper - support multiple routing interface MAC address prefixes - support for IP-in-IP with IPv6 underlay - MediaTek WiFi (mt76) - mt7921 - ASPM, 6GHz, SDIO and testmode support - mt7915 - LED and TWT support - Qualcomm WiFi (ath11k) - include channel rx and tx time in survey dump statistics - support for 80P80 and 160 MHz bandwidths - support channel 2 in 6 GHz band - spectral scan support for QCN9074 - support for rx decapsulation offload (data frames in 802.3 format) - Qualcomm phone SoC WiFi (wcn36xx) - enable Idle Mode Power Save (IMPS) to reduce power consumption during idle - Bluetooth driver support for MediaTek MT7922 and MT7921 - Enable support for AOSP Bluetooth extension in Qualcomm WCN399x and Realtek 8822C/8852A - Microsoft vNIC driver (mana) - support hibernation and kexec - Google vNIC driver (gve) - support for jumbo frames - implement Rx page reuse Refactor: - Make all writes to netdev->dev_addr go thru helpers, so that we can add this address to the address rbtree and handle the updates - Various TCP cleanups and optimizations including improvements to CPU cache use - Simplify the gnet_stats, Qdisc stats' handling and remove qdisc->running sequence counter - Driver changes and API updates to address devlink locking deficiencies" * tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2122 commits) Revert "net: avoid double accounting for pure zerocopy skbs" selftests: net: add arp_ndisc_evict_nocarrier net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter net: arp: introduce arp_evict_nocarrier sysctl parameter libbpf: Deprecate AF_XDP support kbuild: Unify options for BTF generation for vmlinux and modules selftests/bpf: Add a testcase for 64-bit bounds propagation issue. bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit. bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off. net: vmxnet3: remove multiple false checks in vmxnet3_ethtool.c net: avoid double accounting for pure zerocopy skbs tcp: rename sk_wmem_free_skb netdevsim: fix uninit value in nsim_drv_configure_vfs() selftests/bpf: Fix also no-alu32 strobemeta selftest bpf: Add missing map_delete_elem method to bloom filter map selftests/bpf: Add bloom map success test for userspace calls bpf: Add alignment padding for "map_extra" + consolidate holes bpf: Bloom filter map naming fixups selftests/bpf: Add test cases for struct_ops prog bpf: Add dummy BPF STRUCT_OPS for test purpose ...
Diffstat (limited to 'net/xdp/xsk_buff_pool.c')
-rw-r--r--net/xdp/xsk_buff_pool.c132
1 files changed, 115 insertions, 17 deletions
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 8de01aaac4a0..90c4e1e819d3 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -44,12 +44,13 @@ void xp_destroy(struct xsk_buff_pool *pool)
struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
struct xdp_umem *umem)
{
+ bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
struct xsk_buff_pool *pool;
struct xdp_buff_xsk *xskb;
- u32 i;
+ u32 i, entries;
- pool = kvzalloc(struct_size(pool, free_heads, umem->chunks),
- GFP_KERNEL);
+ entries = unaligned ? umem->chunks : 0;
+ pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL);
if (!pool)
goto out;
@@ -63,7 +64,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
pool->free_heads_cnt = umem->chunks;
pool->headroom = umem->headroom;
pool->chunk_size = umem->chunk_size;
- pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ pool->chunk_shift = ffs(umem->chunk_size) - 1;
+ pool->unaligned = unaligned;
pool->frame_len = umem->chunk_size - umem->headroom -
XDP_PACKET_HEADROOM;
pool->umem = umem;
@@ -81,7 +83,10 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
xskb = &pool->heads[i];
xskb->pool = pool;
xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
- pool->free_heads[i] = xskb;
+ if (pool->unaligned)
+ pool->free_heads[i] = xskb;
+ else
+ xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
}
return pool;
@@ -406,6 +411,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
if (pool->unaligned)
xp_check_dma_contiguity(dma_map);
+ else
+ for (i = 0; i < pool->heads_cnt; i++) {
+ struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+ xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
+ }
err = xp_init_dma_info(pool, dma_map);
if (err) {
@@ -448,12 +459,9 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
if (pool->free_heads_cnt == 0)
return NULL;
- xskb = pool->free_heads[--pool->free_heads_cnt];
-
for (;;) {
if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
pool->fq->queue_empty_descs++;
- xp_release(xskb);
return NULL;
}
@@ -466,17 +474,17 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
}
break;
}
- xskq_cons_release(pool->fq);
- xskb->orig_addr = addr;
- xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
- if (pool->dma_pages_cnt) {
- xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
- ~XSK_NEXT_PG_CONTIG_MASK) +
- (addr & ~PAGE_MASK);
- xskb->dma = xskb->frame_dma + pool->headroom +
- XDP_PACKET_HEADROOM;
+ if (pool->unaligned) {
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+ xp_init_xskb_addr(xskb, pool, addr);
+ if (pool->dma_pages_cnt)
+ xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+ } else {
+ xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
}
+
+ xskq_cons_release(pool->fq);
return xskb;
}
@@ -507,6 +515,96 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
}
EXPORT_SYMBOL(xp_alloc);
+static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+ u32 i, cached_cons, nb_entries;
+
+ if (max > pool->free_heads_cnt)
+ max = pool->free_heads_cnt;
+ max = xskq_cons_nb_entries(pool->fq, max);
+
+ cached_cons = pool->fq->cached_cons;
+ nb_entries = max;
+ i = max;
+ while (i--) {
+ struct xdp_buff_xsk *xskb;
+ u64 addr;
+ bool ok;
+
+ __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr);
+
+ ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
+ xp_check_aligned(pool, &addr);
+ if (unlikely(!ok)) {
+ pool->fq->invalid_descs++;
+ nb_entries--;
+ continue;
+ }
+
+ if (pool->unaligned) {
+ xskb = pool->free_heads[--pool->free_heads_cnt];
+ xp_init_xskb_addr(xskb, pool, addr);
+ if (pool->dma_pages_cnt)
+ xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
+ } else {
+ xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
+ }
+
+ *xdp = &xskb->xdp;
+ xdp++;
+ }
+
+ xskq_cons_release_n(pool->fq, max);
+ return nb_entries;
+}
+
+static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries)
+{
+ struct xdp_buff_xsk *xskb;
+ u32 i;
+
+ nb_entries = min_t(u32, nb_entries, pool->free_list_cnt);
+
+ i = nb_entries;
+ while (i--) {
+ xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node);
+ list_del(&xskb->free_list_node);
+
+ *xdp = &xskb->xdp;
+ xdp++;
+ }
+ pool->free_list_cnt -= nb_entries;
+
+ return nb_entries;
+}
+
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+ u32 nb_entries1 = 0, nb_entries2;
+
+ if (unlikely(pool->dma_need_sync)) {
+ /* Slow path */
+ *xdp = xp_alloc(pool);
+ return !!*xdp;
+ }
+
+ if (unlikely(pool->free_list_cnt)) {
+ nb_entries1 = xp_alloc_reused(pool, xdp, max);
+ if (nb_entries1 == max)
+ return nb_entries1;
+
+ max -= nb_entries1;
+ xdp += nb_entries1;
+ }
+
+ nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max);
+ if (!nb_entries2)
+ pool->fq->queue_empty_descs++;
+
+ return nb_entries1 + nb_entries2;
+}
+EXPORT_SYMBOL(xp_alloc_batch);
+
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
{
if (pool->free_list_cnt >= count)