From 493740d790cce709d285cd1022d16d05439b7d5b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 6 Mar 2026 11:31:54 +0530 Subject: drm/buddy: Improve offset-aligned allocation handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large alignment requests previously forced the buddy allocator to search by alignment order, which often caused higher-order free blocks to be split even when a suitably aligned smaller region already existed within them. This led to excessive fragmentation, especially for workloads requesting small sizes with large alignment constraints. This change prioritizes the requested allocation size during the search and uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate free blocks that satisfy both size and offset-alignment requirements. As a result, the allocator can directly select an aligned sub-region without splitting larger blocks unnecessarily. A practical example is the VKCTS test dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations caused large blocks to be split aggressively, despite smaller aligned regions being sufficient. With this change, those aligned regions are reused directly, significantly reducing fragmentation. This improvement is visible in the amdgpu VRAM buddy allocator state (/sys/kernel/debug/dri/1/amdgpu_vram_mm). After the change, higher-order blocks are preserved and the number of low-order fragments is substantially reduced. Before: order- 5 free: 1936 MiB, blocks: 15490 order- 4 free: 967 MiB, blocks: 15486 order- 3 free: 483 MiB, blocks: 15485 order- 2 free: 241 MiB, blocks: 15486 order- 1 free: 241 MiB, blocks: 30948 After: order- 5 free: 493 MiB, blocks: 3941 order- 4 free: 246 MiB, blocks: 3943 order- 3 free: 123 MiB, blocks: 4101 order- 2 free: 61 MiB, blocks: 4101 order- 1 free: 61 MiB, blocks: 8018 By avoiding unnecessary splits, this change improves allocator efficiency and helps maintain larger contiguous free regions under heavy offset-aligned allocation workloads. v2:(Matthew) - Update augmented information along the path to the inserted node. v3: - Move the patch to gpu/buddy.c file. v4:(Matthew) - Use the helper instead of calling _ffs directly - Remove gpu_buddy_block_order(block) >= order check and drop order - Drop !node check as all callers handle this already - Return larger than any other possible alignment for __ffs64(0) - Replace __ffs with __ffs64 v5:(Matthew) - Drop subtree_max_alignment initialization at gpu_block_alloc() Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com --- include/linux/gpu_buddy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index f1fb6eff604a..5fa917ba5450 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range @@ -128,6 +129,7 @@ struct gpu_buddy_block { }; /* private: */ struct list_head tmp_link; + unsigned int subtree_max_alignment; }; /* Order-zero must be at least SZ_4K */ -- cgit v1.2.3 From f66d6cc6891e41be96380261943837b1909107b3 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 11 Mar 2026 10:18:42 -0700 Subject: accel/amdxdna: Support sensors for column utilization The AMD PMF driver provides realtime column utilization (npu_busy) metrics for the NPU. Extend the DRM_IOCTL_AMDXDNA_GET_INFO sensor query to expose these metrics to userspace. Add AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION to the sensor type enum and update aie2_get_sensors() to return both the total power and up to 8 column utilization sensors if the user buffer permits. Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Lizhi Hou [lizhi: support legacy tool which uses small buffer. checkpatch cleanup] Signed-off-by: Lizhi Hou Link: https://patch.msgid.link/20260311171842.473453-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_pci.c | 34 +++++++++++++++++++++++++++++----- drivers/accel/amdxdna/aie2_pci.h | 8 ++++++++ include/uapi/drm/amdxdna_accel.h | 3 ++- 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index a2e586512e26..c57c785a2d15 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -787,16 +787,18 @@ static int aie2_get_clock_metadata(struct amdxdna_client *client, static int aie2_get_sensors(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) { + struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle; struct amdxdna_drm_query_sensor sensor = {}; + struct amd_pmf_npu_metrics npu_metrics; + u32 sensors_count = 0, i; int ret; - if (args->buffer_size < sizeof(sensor)) - return -EINVAL; - - ret = AIE2_GET_PMF_NPU_DATA(npu_power, sensor.input); + ret = AIE2_GET_PMF_NPU_METRICS(&npu_metrics); if (ret) return ret; + sensor.type = AMDXDNA_SENSOR_TYPE_POWER; + sensor.input = npu_metrics.npu_power; sensor.unitm = -3; scnprintf(sensor.label, sizeof(sensor.label), "Total Power"); scnprintf(sensor.units, sizeof(sensor.units), "mW"); @@ -804,7 +806,29 @@ static int aie2_get_sensors(struct amdxdna_client *client, if (copy_to_user(u64_to_user_ptr(args->buffer), &sensor, sizeof(sensor))) return -EFAULT; - args->buffer_size = sizeof(sensor); + sensors_count++; + if (args->buffer_size <= sensors_count * sizeof(sensor)) + goto out; + + for (i = 0; i < min_t(u32, ndev->total_col, 8); i++) { + memset(&sensor, 0, sizeof(sensor)); + sensor.input = npu_metrics.npu_busy[i]; + sensor.type = AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION; + sensor.unitm = 0; + scnprintf(sensor.label, sizeof(sensor.label), "Column %d Utilization", i); + scnprintf(sensor.units, sizeof(sensor.units), "%%"); + + if (copy_to_user(u64_to_user_ptr(args->buffer) + sensors_count * sizeof(sensor), + &sensor, sizeof(sensor))) + return -EFAULT; + + sensors_count++; + if (args->buffer_size <= sensors_count * sizeof(sensor)) + goto out; + } + +out: + args->buffer_size = sensors_count * sizeof(sensor); return 0; } diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h index 1bb88711bedb..0ae174862592 100644 --- a/drivers/accel/amdxdna/aie2_pci.h +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -48,6 +48,7 @@ }) #if IS_ENABLED(CONFIG_AMD_PMF) +#define AIE2_GET_PMF_NPU_METRICS(metrics) amd_pmf_get_npu_data(metrics) #define AIE2_GET_PMF_NPU_DATA(field, val) \ ({ \ struct amd_pmf_npu_metrics _npu_metrics; \ @@ -58,6 +59,13 @@ (_ret); \ }) #else +#define AIE2_GET_PMF_NPU_METRICS(metrics) \ +({ \ + typeof(metrics) _m = metrics; \ + memset(_m, 0xff, sizeof(*_m)); \ + (-EOPNOTSUPP); \ +}) + #define SENSOR_DEFAULT_npu_power U32_MAX #define AIE2_GET_PMF_NPU_DATA(field, val) \ ({ \ diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h index 9c44db2b3dcd..5bd13f4435f5 100644 --- a/include/uapi/drm/amdxdna_accel.h +++ b/include/uapi/drm/amdxdna_accel.h @@ -353,7 +353,8 @@ struct amdxdna_drm_query_clock_metadata { }; enum amdxdna_sensor_type { - AMDXDNA_SENSOR_TYPE_POWER + AMDXDNA_SENSOR_TYPE_POWER, + AMDXDNA_SENSOR_TYPE_COLUMN_UTILIZATION }; /** -- cgit v1.2.3