From 493740d790cce709d285cd1022d16d05439b7d5b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 6 Mar 2026 11:31:54 +0530 Subject: drm/buddy: Improve offset-aligned allocation handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large alignment requests previously forced the buddy allocator to search by alignment order, which often caused higher-order free blocks to be split even when a suitably aligned smaller region already existed within them. This led to excessive fragmentation, especially for workloads requesting small sizes with large alignment constraints. This change prioritizes the requested allocation size during the search and uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate free blocks that satisfy both size and offset-alignment requirements. As a result, the allocator can directly select an aligned sub-region without splitting larger blocks unnecessarily. A practical example is the VKCTS test dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations caused large blocks to be split aggressively, despite smaller aligned regions being sufficient. With this change, those aligned regions are reused directly, significantly reducing fragmentation. This improvement is visible in the amdgpu VRAM buddy allocator state (/sys/kernel/debug/dri/1/amdgpu_vram_mm). After the change, higher-order blocks are preserved and the number of low-order fragments is substantially reduced. Before: order- 5 free: 1936 MiB, blocks: 15490 order- 4 free: 967 MiB, blocks: 15486 order- 3 free: 483 MiB, blocks: 15485 order- 2 free: 241 MiB, blocks: 15486 order- 1 free: 241 MiB, blocks: 30948 After: order- 5 free: 493 MiB, blocks: 3941 order- 4 free: 246 MiB, blocks: 3943 order- 3 free: 123 MiB, blocks: 4101 order- 2 free: 61 MiB, blocks: 4101 order- 1 free: 61 MiB, blocks: 8018 By avoiding unnecessary splits, this change improves allocator efficiency and helps maintain larger contiguous free regions under heavy offset-aligned allocation workloads. v2:(Matthew) - Update augmented information along the path to the inserted node. v3: - Move the patch to gpu/buddy.c file. v4:(Matthew) - Use the helper instead of calling _ffs directly - Remove gpu_buddy_block_order(block) >= order check and drop order - Drop !node check as all callers handle this already - Return larger than any other possible alignment for __ffs64(0) - Replace __ffs with __ffs64 v5:(Matthew) - Drop subtree_max_alignment initialization at gpu_block_alloc() Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com --- include/linux/gpu_buddy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index f1fb6eff604a..5fa917ba5450 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range @@ -128,6 +129,7 @@ struct gpu_buddy_block { }; /* private: */ struct list_head tmp_link; + unsigned int subtree_max_alignment; }; /* Order-zero must be at least SZ_4K */ -- cgit v1.2.3 From 878004e2852bc22ce0687c5597d6fe3909fb59f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Mar 2026 14:10:32 -0800 Subject: iopoll: fix function parameter names in read_poll_timeout_atomic() Correct the function parameter names to avoid kernel-doc warnings and to emphasize this function is atomic (non-sleeping). Warning: include/linux/iopoll.h:169 function parameter 'sleep_us' not described in 'read_poll_timeout_atomic' Warning: ../include/linux/iopoll.h:169 function parameter 'sleep_before_read' not described in 'read_poll_timeout_atomic' Fixes: 9df8043a546d ("iopoll: Generalize read_poll_timeout() into poll_timeout_us()") Signed-off-by: Randy Dunlap Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260306221033.2357305-1-rdunlap@infradead.org Signed-off-by: Jani Nikula --- include/linux/iopoll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index bdd2e0652bc3..53edd69acb9b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -159,7 +159,7 @@ * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock - * time, which is rather minimal with a non-zero delay_us. + * time, which is rather minimal with a non-zero @delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. @@ -167,9 +167,9 @@ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. */ -#define read_poll_timeout_atomic(op, val, cond, sleep_us, timeout_us, \ - sleep_before_read, args...) \ - poll_timeout_us_atomic((val) = op(args), cond, sleep_us, timeout_us, sleep_before_read) +#define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ + delay_before_read, args...) \ + poll_timeout_us_atomic((val) = op(args), cond, delay_us, timeout_us, delay_before_read) /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs -- cgit v1.2.3