summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jakub Kicinski <kuba@kernel.org> 2026-05-06 05:23:18 +0300
committer Jakub Kicinski <kuba@kernel.org> 2026-05-06 05:23:18 +0300
commit 7e0cccae6b45b12eaf71fc3ab8eb133bb50b28ad (patch)
tree da0d42f2115debfe217cb6edfe6c6dc501837ab7
parent 561e066284d14eb7a3ea31bfcb0cc599f6044739 (diff)
parent 3af0820c878e2bca77141981b808be9994341654 (diff)
download linux-7e0cccae6b45b12eaf71fc3ab8eb133bb50b28ad.tar.xz
Merge branch 'net-mana-avoid-queue-struct-allocation-failure-under-memory-fragmentation'
Aditya Garg says:

====================
net: mana: Avoid queue struct allocation failure under memory fragmentation

The MANA driver can fail to load on systems with high memory
utilization because several allocations in the queue setup paths
require large physically contiguous blocks via kmalloc. Under memory
fragmentation these high-order allocations may fail, preventing the
driver from creating queues when opening the interface or when
reconfiguring channels, ring parameters or MTU at runtime.

Allocation sizes that are problematic:

  mana_create_txq -> tx_qp flat array (sizeof(mana_tx_qp) = 35528):
    16 queues (default): 35528 * 16 = ~555 KB contiguous
    64 queues (max):     35528 * 64 = ~2220 KB contiguous

  mana_create_rxq -> rxq struct with flex array
  (sizeof(mana_rxq) = 35712, rx_oobs=296 per entry):
    depth 1024 (default): 35712 + 296 * 1024 = ~331 KB per queue
    depth 8192 (max):     35712 + 296 * 8192 = ~2403 KB per queue

  mana_pre_alloc_rxbufs -> rxbufs_pre and das_pre arrays:
    16 queues, depth 1024 (default): 16 * 1024 * 8 = 128 KB each
    64 queues, depth 8192 (max):     64 * 8192 * 8 = 4096 KB each

This series addresses the issue by:

  1. Converting the tx_qp flat array into an array of pointers with
     per-queue kvzalloc (~35 KB each), replacing a single contiguous
     allocation that can reach ~2.2 MB at 64 queues.

  2. Switching rxbufs_pre, das_pre, and rxq allocations to
     kvmalloc/kvzalloc so the allocator can fall back to vmalloc when
     contiguous memory is unavailable.

Throughput testing confirms no regression. Since kvmalloc falls back to
vmalloc under memory fragmentation, all kvmalloc calls were temporarily
replaced with vmalloc to simulate the fallback path (iperf3, GBits/sec):

               Physically contiguous    vmalloc region
  Connections     TX       RX            TX       RX
  --------------------------------------------------------------
   1             47.2     46.9          46.8     46.6
  16             181      181           181      181
  32             181      181           181      181
  64             181      181           181      181
====================

Link: https://patch.msgid.link/20260502074552.23857-1-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--  drivers/net/ethernet/microsoft/mana/mana_bpf.c      2
-rw-r--r--  drivers/net/ethernet/microsoft/mana/mana_en.c       61
-rw-r--r--  drivers/net/ethernet/microsoft/mana/mana_ethtool.c  2
-rw-r--r--  include/net/mana/mana.h                             2
4 files changed, 39 insertions, 28 deletions
diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
index 7697c9b52ed3..b5e9bb184a1d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
@@ -68,7 +68,7 @@ int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames,
count++;
}
- tx_stats = &apc->tx_qp[q_idx].txq.stats;
+ tx_stats = &apc->tx_qp[q_idx]->txq.stats;
u64_stats_update_begin(&tx_stats->syncp);
tx_stats->xdp_xmit += count;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 7c83e010a1e6..462a457e7d53 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -355,9 +355,9 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
if (skb_cow_head(skb, MANA_HEADROOM))
goto tx_drop_count;
- txq = &apc->tx_qp[txq_idx].txq;
+ txq = &apc->tx_qp[txq_idx]->txq;
gdma_sq = txq->gdma_sq;
- cq = &apc->tx_qp[txq_idx].tx_cq;
+ cq = &apc->tx_qp[txq_idx]->tx_cq;
tx_stats = &txq->stats;
BUILD_BUG_ON(MAX_TX_WQE_SGL_ENTRIES != MANA_MAX_TX_WQE_SGL_ENTRIES);
@@ -614,7 +614,7 @@ static void mana_get_stats64(struct net_device *ndev,
}
for (q = 0; q < num_queues; q++) {
- tx_stats = &apc->tx_qp[q].txq.stats;
+ tx_stats = &apc->tx_qp[q]->txq.stats;
do {
start = u64_stats_fetch_begin(&tx_stats->syncp);
@@ -685,11 +685,11 @@ void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
}
- kfree(mpc->das_pre);
+ kvfree(mpc->das_pre);
mpc->das_pre = NULL;
out2:
- kfree(mpc->rxbufs_pre);
+ kvfree(mpc->rxbufs_pre);
mpc->rxbufs_pre = NULL;
out1:
@@ -806,11 +806,11 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu
num_rxb = num_queues * mpc->rx_queue_size;
WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
- mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
+ mpc->rxbufs_pre = kvmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
if (!mpc->rxbufs_pre)
goto error;
- mpc->das_pre = kmalloc_objs(dma_addr_t, num_rxb);
+ mpc->das_pre = kvmalloc_objs(dma_addr_t, num_rxb);
if (!mpc->das_pre)
goto error;
@@ -2327,21 +2327,26 @@ static void mana_destroy_txq(struct mana_port_context *apc)
return;
for (i = 0; i < apc->num_queues; i++) {
- debugfs_remove_recursive(apc->tx_qp[i].mana_tx_debugfs);
- apc->tx_qp[i].mana_tx_debugfs = NULL;
+ if (!apc->tx_qp[i])
+ continue;
+
+ debugfs_remove_recursive(apc->tx_qp[i]->mana_tx_debugfs);
+ apc->tx_qp[i]->mana_tx_debugfs = NULL;
- napi = &apc->tx_qp[i].tx_cq.napi;
- if (apc->tx_qp[i].txq.napi_initialized) {
+ napi = &apc->tx_qp[i]->tx_cq.napi;
+ if (apc->tx_qp[i]->txq.napi_initialized) {
napi_synchronize(napi);
napi_disable_locked(napi);
netif_napi_del_locked(napi);
- apc->tx_qp[i].txq.napi_initialized = false;
+ apc->tx_qp[i]->txq.napi_initialized = false;
}
- mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
+ mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i]->tx_object);
- mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq);
+ mana_deinit_cq(apc, &apc->tx_qp[i]->tx_cq);
- mana_deinit_txq(apc, &apc->tx_qp[i].txq);
+ mana_deinit_txq(apc, &apc->tx_qp[i]->txq);
+
+ kvfree(apc->tx_qp[i]);
}
kfree(apc->tx_qp);
@@ -2350,7 +2355,7 @@ static void mana_destroy_txq(struct mana_port_context *apc)
static void mana_create_txq_debugfs(struct mana_port_context *apc, int idx)
{
- struct mana_tx_qp *tx_qp = &apc->tx_qp[idx];
+ struct mana_tx_qp *tx_qp = apc->tx_qp[idx];
char qnum[32];
sprintf(qnum, "TX-%d", idx);
@@ -2389,7 +2394,7 @@ static int mana_create_txq(struct mana_port_context *apc,
int err;
int i;
- apc->tx_qp = kzalloc_objs(struct mana_tx_qp, apc->num_queues);
+ apc->tx_qp = kzalloc_objs(struct mana_tx_qp *, apc->num_queues);
if (!apc->tx_qp)
return -ENOMEM;
@@ -2409,10 +2414,16 @@ static int mana_create_txq(struct mana_port_context *apc,
gc = gd->gdma_context;
for (i = 0; i < apc->num_queues; i++) {
- apc->tx_qp[i].tx_object = INVALID_MANA_HANDLE;
+ apc->tx_qp[i] = kvzalloc_obj(*apc->tx_qp[i]);
+ if (!apc->tx_qp[i]) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ apc->tx_qp[i]->tx_object = INVALID_MANA_HANDLE;
/* Create SQ */
- txq = &apc->tx_qp[i].txq;
+ txq = &apc->tx_qp[i]->txq;
u64_stats_init(&txq->stats.syncp);
txq->ndev = net;
@@ -2430,7 +2441,7 @@ static int mana_create_txq(struct mana_port_context *apc,
goto out;
/* Create SQ's CQ */
- cq = &apc->tx_qp[i].tx_cq;
+ cq = &apc->tx_qp[i]->tx_cq;
cq->type = MANA_CQ_TYPE_TX;
cq->txq = txq;
@@ -2459,7 +2470,7 @@ static int mana_create_txq(struct mana_port_context *apc,
err = mana_create_wq_obj(apc, apc->port_handle, GDMA_SQ,
&wq_spec, &cq_spec,
- &apc->tx_qp[i].tx_object);
+ &apc->tx_qp[i]->tx_object);
if (err)
goto out;
@@ -2559,7 +2570,7 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
if (rxq->gdma_rq)
mana_gd_destroy_queue(gc, rxq->gdma_rq);
- kfree(rxq);
+ kvfree(rxq);
}
static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
@@ -2699,7 +2710,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
gc = gd->gdma_context;
- rxq = kzalloc_flex(*rxq, rx_oobs, apc->rx_queue_size);
+ rxq = kvzalloc_flex(*rxq, rx_oobs, apc->rx_queue_size);
if (!rxq)
return NULL;
@@ -3294,7 +3305,7 @@ static int mana_dealloc_queues(struct net_device *ndev)
*/
for (i = 0; i < apc->num_queues; i++) {
- txq = &apc->tx_qp[i].txq;
+ txq = &apc->tx_qp[i]->txq;
tsleep = 1000;
while (atomic_read(&txq->pending_sends) > 0 &&
time_before(jiffies, timeout)) {
@@ -3313,7 +3324,7 @@ static int mana_dealloc_queues(struct net_device *ndev)
}
for (i = 0; i < apc->num_queues; i++) {
- txq = &apc->tx_qp[i].txq;
+ txq = &apc->tx_qp[i]->txq;
while ((skb = skb_dequeue(&txq->pending_skbs))) {
mana_unmap_skb(skb, apc);
dev_kfree_skb_any(skb);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 6a4b42fe0944..04350973e19e 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -260,7 +260,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
}
for (q = 0; q < num_queues; q++) {
- tx_stats = &apc->tx_qp[q].txq.stats;
+ tx_stats = &apc->tx_qp[q]->txq.stats;
do {
start = u64_stats_fetch_begin(&tx_stats->syncp);
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..aa90a858c8e3 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -507,7 +507,7 @@ struct mana_port_context {
bool tx_shortform_allowed;
u16 tx_vp_offset;
- struct mana_tx_qp *tx_qp;
+ struct mana_tx_qp **tx_qp;
/* Indirection Table for RX & TX. The values are queue indexes */
u32 *indir_table;