From 62a9f5a85b98d6d2d9b5e0d67b2d4e5903bc53ec Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:03 -0800 Subject: mm: introduce clear_pages() and clear_user_pages() Introduce clear_pages(), to be overridden by architectures that support more efficient clearing of consecutive pages. Also introduce clear_user_pages(); however, we do not expect this function to be overridden anytime soon. As we do for clear_user_page(), define clear_user_pages() only if the architecture does not define clear_user_highpage(). That is because if the architecture does define clear_user_highpage(), then it likely needs some flushing magic when clearing user pages or highpages. This means we can get away without defining clear_user_pages(), since, much like its single-page sibling, its only potential user is the generic clear_user_highpages(), which should instead be using clear_user_highpage(). Link: https://lkml.kernel.org/r/20260107072009.1615991-3-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzeszutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f0d5be9dc736..d78e294698b0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4198,6 +4198,26 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifndef clear_pages +/** + * clear_pages() - clear a page range for kernel-internal use. + * @addr: start address + * @npages: number of pages + * + * Use clear_user_pages() instead when clearing a page range to be + * mapped to user space. + * + * Does absolutely no exception handling. + */ +static inline void clear_pages(void *addr, unsigned int npages) +{ + do { + clear_page(addr); + addr += PAGE_SIZE; + } while (--npages); +} +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); -- cgit v1.2.3 From 94962b2628e6af2c48be6ebdf9f76add28d60ecc Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Tue, 6 Jan 2026 23:20:08 -0800 Subject: mm: folio_zero_user: clear page ranges Use batch clearing in clear_contig_highpages() instead of clearing a single page at a time. Exposing larger ranges enables the processor to optimize based on extent. To do this, we just switch to using clear_user_highpages(), which in turn uses clear_user_pages() or clear_pages(). Batched clearing, however, has latency considerations when running under non-preemptible models. In particular, we need periodic invocations of cond_resched() to keep to reasonable preemption latencies. This is a problem because the clearing primitives do not, or might not be able to, call cond_resched() to check if preemption is needed. So, limit the worst-case preemption latency by doing the clearing in units of no more than PROCESS_PAGES_NON_PREEMPT_BATCH pages. (Preemptible models already define away most of cond_resched(), so the batch size is ignored when running under those.)
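To make the override point concrete: on an architecture with a long-running string-store primitive, clear_pages() can reduce the whole extent to a single instruction, which is exactly the kind of primitive that cannot check for rescheduling midway. A minimal sketch of such an override, hypothetical and x86-flavoured (it is not part of this series; the trailing #define follows the kernel convention telling the generic header not to install its fallback):

  /*
   * Hypothetical arch override (sketch, not from this series): clear the
   * whole extent with a single "rep stosb" so the CPU can optimize based
   * on the full length. Note the absence of cond_resched() -- hence the
   * batching discussed below.
   */
  static inline void clear_pages(void *addr, unsigned int npages)
  {
          unsigned long nbytes = (unsigned long)npages << PAGE_SHIFT;

          asm volatile("rep stosb"
                       : "+D" (addr), "+c" (nbytes)
                       : "a" (0)
                       : "memory");
  }
  #define clear_pages clear_pages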
PROCESS_PAGES_NON_PREEMPT_BATCH: for architectures with "fast" clear-pages (ones that define clear_pages()), we define it as 32MB worth of pages. This is meant to be large enough to allow the processor to optimize the operation and yet small enough that we see reasonable preemption latency when this optimization is not possible (e.g. slow microarchitectures, memory bandwidth saturation). This specific value also allows for a cacheline allocation elision optimization (which might help unrelated applications by not evicting potentially useful cache lines) that kicks in on recent generations of AMD Zen processors at around LLC size (32MB is a typical size). At the same time, 32MB is small enough that even with poor clearing bandwidth (say ~10GBps), the time to clear 32MB should be well below the scheduler's default warning threshold (sysctl_resched_latency_warn_ms=100). "Slow" architectures (those that do not define clear_pages()) will continue to use the base value (single page).

Performance
==

Testing a demand-fault workload shows a decent improvement in bandwidth with pg-sz=1GB. Bandwidth with pg-sz=2MB stays flat.

  $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5

               contiguous-pages      batched-pages
               (GBps +- %stdev)      (GBps +- %stdev)

   pg-sz=2MB   23.58 +- 1.95%        25.34 +- 1.18%        +  7.50%   preempt=*
   pg-sz=1GB   25.09 +- 0.79%        39.22 +- 2.32%        + 56.31%   preempt=none|voluntary
   pg-sz=1GB   25.71 +- 0.03%        52.73 +- 0.20% [#]    +110.16%   preempt=full|lazy

[#] We perform much better with preempt=full|lazy because, not needing explicit invocations of cond_resched(), we can clear the full extent (pg-sz=1GB) as a single unit, which the processor can optimize for.

(Unless otherwise noted, all numbers are on AMD Genoa (EPYC 9J13); region-size=64GB, local node; 2.56 GHz, boost=0.)

Analysis
==

pg-sz=1GB: the improvement we see falls into two buckets depending on the batch size in use. For batch-size=32MB the number of cachelines allocated (L1-dcache-loads) -- which stays relatively flat for smaller batches -- starts to drop off because cacheline allocation elision kicks in. And as can be seen below, at batch-size=1GB, we stop allocating cachelines almost entirely. (Not visible here, but from testing with intermediate sizes, the allocation change kicks in only at batch-size=32MB and ramps up from there.)

  contiguous-pages
   6,949,417,798   L1-dcache-loads        # 883.599 M/sec  ( +- 0.01% )  (35.75%)
   3,226,709,573   L1-dcache-load-misses  # 46.43% of all L1-dcache accesses  ( +- 0.05% )  (35.75%)
  batched,32MB
   2,290,365,772   L1-dcache-loads        # 471.171 M/sec  ( +- 0.36% )  (35.72%)
   1,144,426,272   L1-dcache-load-misses  # 49.97% of all L1-dcache accesses  ( +- 0.58% )  (35.70%)
  batched,1GB
      63,914,157   L1-dcache-loads        # 17.464 M/sec  ( +- 8.08% )  (35.73%)
      22,074,367   L1-dcache-load-misses  # 34.54% of all L1-dcache accesses  ( +- 16.70% )  (35.70%)

The dropoff is also visible in L2 prefetch hits (the miss numbers are on similar lines):

  contiguous-pages
   3,464,861,312   l2_pf_hit_l2.all       # 437.722 M/sec  ( +- 0.74% )  (15.69%)
  batched,32MB
     883,750,087   l2_pf_hit_l2.all       # 181.223 M/sec  ( +- 1.18% )  (15.71%)
  batched,1GB
       8,967,943   l2_pf_hit_l2.all       # 2.450 M/sec  ( +- 17.92% )  (15.77%)

This largely decouples the frontend from the backend since the clearing operation does not need to wait on loads from memory (we still need cacheline ownership but that's a shorter path). This is most visible if we rerun the test above with (boost=1, 3.66 GHz).
  $ perf bench mem mmap -p $pg-sz -f demand -s 64GB -l 5

               contiguous-pages      batched-pages
               (GBps +- %stdev)      (GBps +- %stdev)

   pg-sz=2MB   26.08 +- 1.72%        26.13 +- 0.92%        -          preempt=*
   pg-sz=1GB   26.99 +- 0.62%        48.85 +- 2.19%        + 80.99%   preempt=none|voluntary
   pg-sz=1GB   27.69 +- 0.18%        75.18 +- 0.25%        +171.50%   preempt=full|lazy

Comparing these batched-pages numbers with the boost=0 ones above: for a clock-speed gain of 42% we gain 24.5% for batch-size=32MB and 42.5% for batch-size=1GB. In comparison, the baseline contiguous-pages case and both the pg-sz=2MB ones are largely backend bound, so they gain no more than ~10%. Other platforms tested, Intel Icelakex (Oracle X9) and ARM64 Neoverse-N1 (Ampere Altra), both show an improvement of ~35% for pg-sz=2MB|1GB. The first goes from around 8 GBps to 11 GBps and the second from 32 GBps to 44 GBps. [ankur.a.arora@oracle.com: move the unit computation and make it a const] Link: https://lkml.kernel.org/r/20260108060406.1693853-1-ankur.a.arora@oracle.com Link: https://lkml.kernel.org/r/20260107072009.1615991-8-ankur.a.arora@oracle.com Signed-off-by: Ankur Arora Acked-by: David Hildenbrand (Red Hat) Cc: Andy Lutomirski Cc: "Borislav Petkov (AMD)" Cc: Boris Ostrovsky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzeszutek Wilk Cc: Lance Yang Cc: "Liam R. Howlett" Cc: Li Zhe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 35 +++++++++++++++++++++++++++++++++++ mm/memory.c | 18 +++++++++++++++--- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d78e294698b0..ab2e7e30aef9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4208,6 +4208,15 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * mapped to user space. * * Does absolutely no exception handling. + * + * Note that even though the clearing operation is preemptible, clear_pages() + * does not (and on architectures where it reduces to a few long-running + * instructions, might not be able to) call cond_resched() to check if + * rescheduling is required. + * + * When running under preemptible models this is not a problem. Under + * cooperatively scheduled models, however, the caller is expected to + * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. */ static inline void clear_pages(void *addr, unsigned int npages) { @@ -4218,6 +4227,32 @@ static inline void clear_pages(void *addr, unsigned int npages) } #endif +#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH +#ifdef clear_pages +/* + * The architecture defines clear_pages(), and we assume that it is + * generally "fast". So choose a batch size large enough to allow the processor + * headroom for optimizing the operation and yet small enough that we see + * reasonable preemption latency for when this optimization is not possible + * (ex. slow microarchitectures, memory bandwidth saturation.) + * + * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should + * result in worst case preemption latency of around 3ms when clearing pages. + * + * (See comment above clear_pages() for why preemption latency is a concern + * here.) + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) +#else /* !clear_pages */ +/* + * The architecture does not provide a clear_pages() implementation.
Assume + * that clear_page() -- which clear_pages() will fall back to -- is relatively + * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. + */ +#define PROCESS_PAGES_NON_PREEMPT_BATCH 1 +#endif +#endif + #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); diff --git a/mm/memory.c b/mm/memory.c index 74d663943ecb..3f6ec897c9a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7243,13 +7243,25 @@ static inline int process_huge_page( static void clear_contig_highpages(struct page *page, unsigned long addr, unsigned int nr_pages) { - unsigned int i; + unsigned int i, count; + /* + * When clearing we want to operate on the largest extent possible to + * allow for architecture-specific, extent-based optimizations. + * + * However, since clear_user_highpages() (and the primitives clear_user_pages() + * and clear_pages()) do not call cond_resched(), limit the unit size when + * running under non-preemptible scheduling models. + */ + const unsigned int unit = preempt_model_preemptible() ? + nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH; might_sleep(); - for (i = 0; i < nr_pages; i++) { + + for (i = 0; i < nr_pages; i += count) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + count = min(unit, nr_pages - i); + clear_user_highpages(page + i, addr + i * PAGE_SIZE, count); } } -- cgit v1.2.3 From ba1c86874e25e95de9b253570bb50cc3b5df542e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:35 +0200 Subject: alpha: introduce arch_zone_limits_init() Patch series "arch, mm: consolidate hugetlb early reservation", v3. The order in which early memory reservation for hugetlb happens depends on the architecture, on configuration options and on command line parameters. Some architectures rely on the core MM to call hugetlb_bootmem_alloc() while others call it very early to allow pre-allocation of HVO-style vmemmap. When hugetlb_cma is supported by an architecture it is initialized during setup_arch(), and then later the hugetlb_init code needs to know whether that happened or not. To make everything consistent and unified, both reservation of hugetlb memory from bootmem and creation of CMA areas for hugetlb must be called from core MM initialization; by itself that would have been a simple change. However, HVO-style pre-initialization ordering requirements slightly complicate things: for HVO pre-init to work, sparse and the memory map must be initialized after hugetlb reservations. This requires pulling the call to free_area_init() out of the setup_arch() path and moving it to MM initialization, and this is what the first 23 patches do. These changes are deliberately split into per-arch patches that change how the zone limits are calculated for each architecture, and patches 22 and 23 just remove the calls to free_area_init() and sparse_init() from arch/*. Patch 24 is a simple cleanup for MIPS. Patches 25 and 26 actually consolidate hugetlb reservations and patches 27 and 28 perform some aftermath cleanups. This patch (of 29): Move calculations of zone limits to a dedicated arch_zone_limits_init() function. Later the MM core will use this function as an architecture-specific callback during node and zone initialization, and thus there won't be a need to call free_area_init() from every architecture.
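To illustrate the shape of the new hook: each architecture only reports its zone ceilings, and the core MM keeps ownership of zeroing the array and sizing the zones. A minimal sketch with invented zone cutoffs (the callback name and signature are the ones introduced by this series; the body does not correspond to any particular architecture, and MAX_DMA_PFN stands in for an arch-specific DMA limit):

  /*
   * Illustrative per-arch callback: record the highest PFN of each zone
   * this architecture uses; zones it does not use simply stay at zero
   * in the caller-provided array.
   */
  void __init arch_zone_limits_init(unsigned long *max_zone_pfn)
  {
          max_zone_pfn[ZONE_DMA] = min(max_pfn, MAX_DMA_PFN);  /* hypothetical cutoff */
          max_zone_pfn[ZONE_NORMAL] = max_pfn;
  }

The core MM can then zero the array once, invoke the callback, and hand the result to free_area_init() in a single place, which is what the later patches in this series do.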
Link: https://lkml.kernel.org/r/20260111082105.290734-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260111082105.290734-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Magnus Lindholm Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 15 ++++++++++----- include/linux/mm.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 4c5ab9cd8a0a..cd0cb1abde5f 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -208,12 +208,8 @@ callback_init(void * kernel_end) return kernel_end; } -/* - * paging_init() sets up the memory map. - */ -void __init paging_init(void) +void __init arch_zone_limits_init(unsigned long *max_zone_pfn) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; unsigned long dma_pfn; dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; @@ -221,8 +217,17 @@ void __init paging_init(void) max_zone_pfn[ZONE_DMA] = dma_pfn; max_zone_pfn[ZONE_NORMAL] = max_pfn; +} + +/* + * paging_init() sets up the memory map. + */ +void __init paging_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; /* Initialize mem_map[]. */ + arch_zone_limits_init(max_zone_pfn); free_area_init(max_zone_pfn); /* Initialize the kernel's ZERO_PGE. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ab2e7e30aef9..477339b7a032 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3556,6 +3556,7 @@ static inline unsigned long get_num_physpages(void) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); +void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); -- cgit v1.2.3 From d49004c5f0c140bb83c87fab46dcf449cf00eb24 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 11 Jan 2026 10:20:57 +0200 Subject: arch, mm: consolidate initialization of nodes, zones and memory map To initialize node, zone and memory map data structures, every architecture calls free_area_init() during setup_arch() and passes it an array of zone limits. Besides the code duplication, this creates "interesting" ordering cases between allocation and initialization of hugetlb and the memory map. Some architectures allocate hugetlb pages very early in setup_arch() in certain cases, some only create hugetlb CMA areas in setup_arch(), and sometimes hugetlb allocations happen in mm_core_init(). With the arch_zone_limits_init() helper now available on all architectures, it is no longer necessary to call free_area_init() from architecture setup code. Rather, core MM initialization can call arch_zone_limits_init() in a single place.
This allows unifying the ordering of hugetlb vs memory map allocation and initialization. Remove the call to free_area_init() from architecture specific code and place it in a new mm_core_init_early() function that is called immediately after setup_arch(). After this refactoring it is possible to consolidate hugetlb allocations and eliminate differences in ordering of hugetlb and memory map initialization among different architectures. As the first step of this consolidation, move hugetlb_bootmem_alloc() to mm_core_init_early(). Link: https://lkml.kernel.org/r/20260111082105.290734-24-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alex Shi Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David Hildenbrand Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Pratyush Yadav Cc: Richard Weinberger Cc: "Ritesh Harjani (IBM)" Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 9 +-------- arch/arc/mm/init.c | 5 ----- arch/arm/mm/init.c | 16 ---------------- arch/arm64/mm/init.c | 4 ---- arch/csky/kernel/setup.c | 4 ---- arch/hexagon/mm/init.c | 12 ------------ arch/loongarch/include/asm/pgtable.h | 2 -- arch/loongarch/kernel/setup.c | 2 -- arch/loongarch/mm/init.c | 8 -------- arch/m68k/mm/init.c | 3 --- arch/m68k/mm/mcfmmu.c | 3 --- arch/m68k/mm/motorola.c | 6 +----- arch/m68k/mm/sun3mmu.c | 9 --------- arch/microblaze/mm/init.c | 7 ------- arch/mips/loongson64/numa.c | 4 ---- arch/mips/mm/init.c | 5 ----- arch/mips/sgi-ip27/ip27-memory.c | 4 ---- arch/nios2/mm/init.c | 6 ------ arch/openrisc/mm/init.c | 10 ---------- arch/parisc/mm/init.c | 9 --------- arch/powerpc/mm/mem.c | 4 ---- arch/riscv/mm/init.c | 9 --------- arch/s390/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/sparc/mm/init_64.c | 11 ----------- arch/sparc/mm/srmmu.c | 7 ------- arch/um/kernel/mem.c | 5 ----- arch/x86/mm/init.c | 10 ---------- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- arch/x86/mm/mm_internal.h | 1 - arch/xtensa/mm/init.c | 4 ---- include/linux/mm.h | 4 ++-- init/main.c | 1 + mm/mm_init.c | 18 ++++++++++-------- 35 files changed, 15 insertions(+), 200 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index cd0cb1abde5f..9531cbc761c0 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -220,17 +220,10 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) } /* - * paging_init() sets up the memory map. + * paging_init() initializes the kernel's ZERO_PGE. */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - /* Initialize mem_map[]. */ - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - - /* Initialize the kernel's ZERO_PGE.
*/ memset(absolute_pointer(ZERO_PGE), 0, PAGE_SIZE); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index ff7974d38011..a5e92f46e5d1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -102,8 +102,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) */ void __init setup_arch_memory(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - setup_initial_init_mm(_text, _etext, _edata, _end); /* first page of system - kernel .vector starts here */ @@ -158,9 +156,6 @@ void __init setup_arch_memory(void) arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); #endif /* CONFIG_HIGHMEM */ - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init arch_mm_preinit(void) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index bdcc3639681f..a8f7b4084715 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -118,15 +118,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfn) #endif } -static void __init zone_sizes_init(unsigned long min, unsigned long max_low, - unsigned long max_high) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { @@ -222,13 +213,6 @@ void __init bootmem_init(void) * done after the fixed reservations */ sparse_init(); - - /* - * Now free the memory - free_area_init needs - * the sparse mem_map arrays initialized by sparse_init() - * for memmap_init_zone(), otherwise all PFNs are invalid. - */ - zone_sizes_init(min_low_pfn, max_low_pfn, max_pfn); } /* diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 06815d34cc11..3641e88ea871 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -134,7 +134,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) static void __init dma_limits_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; phys_addr_t __maybe_unused acpi_zone_dma_limit; phys_addr_t __maybe_unused dt_zone_dma_limit; phys_addr_t __maybe_unused dma32_phys_limit = @@ -160,9 +159,6 @@ static void __init dma_limits_init(void) #endif if (!arm64_dma_phys_limit) arm64_dma_phys_limit = PHYS_MASK + 1; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } int pfn_is_map_memory(unsigned long pfn) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 8968815d93e6..4bf3c01ead3a 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -63,7 +63,6 @@ static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET); - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; signed long size; memblock_reserve(__pa(_start), _end - _start); @@ -101,9 +100,6 @@ static void __init csky_memblock_init(void) memblock_set_current_limit(PFN_PHYS(max_low_pfn)); dma_contiguous_reserve(0); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } void __init setup_arch(char **cmdline_p) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index e2c9487d8d34..07086dbd33fd 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -66,20 +66,8 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -/* - * In order to set up page allocator "nodes", - * somebody has to call free_area_init() for UMA. 
- * - * In this mode, we only have one pg_data_t - * structure: contig_mem_data. - */ static void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ - /* * Set the init_mm descriptors "context" value to point to the * initial kernel segment table's physical address. diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index f41a648a3d9e..c33b3bcb733e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -353,8 +353,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } -extern void paging_init(void); - #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) #define pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_no_exec(pte) (pte_val(pte) & _PAGE_NO_EXEC) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 20cb6f306456..708ac025db71 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -621,8 +621,6 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #endif - paging_init(); - #ifdef CONFIG_KASAN kasan_init(); #endif diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 17235f87eafb..c331bf69d2ec 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -68,14 +68,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - void __ref free_initmem(void) { free_initmem_default(POISON_FREE_INITMEM); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 6b1d9d2434b5..53b71f786c27 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -69,13 +69,10 @@ void __init paging_init(void) * page_alloc get different views of the world. 
*/ unsigned long end_mem = memory_end & PAGE_MASK; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; high_memory = (void *) end_mem; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 24a6f7bbd1ce..3418fd864237 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -39,7 +39,6 @@ void __init paging_init(void) pte_t *pg_table; unsigned long address, size; unsigned long next_pgtable; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -73,8 +72,6 @@ void __init paging_init(void) } current->mm = NULL; - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } int cf_tlb_miss(struct pt_regs *regs, int write, int dtlb, int extension_word) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d6ccd23caf61..127a3fa69f4c 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -429,7 +429,6 @@ DECLARE_VM_GET_PAGE_PROT */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long min_addr, max_addr; unsigned long addr; int i; @@ -511,12 +510,9 @@ void __init paging_init(void) set_fc(USER_DATA); #ifdef DEBUG - printk ("before free_area_init\n"); + printk ("before node_set_state\n"); #endif for (i = 0; i < m68k_num_memory; i++) if (node_present_pages(i)) node_set_state(i, N_NORMAL_MEMORY); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index fdd69cc4240c..c801677f7df8 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -41,7 +41,6 @@ void __init paging_init(void) unsigned long address; unsigned long next_pgtable; unsigned long bootmem_end; - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); @@ -80,14 +79,6 @@ void __init paging_init(void) mmu_emu_init(bootmem_end); current->mm = NULL; - - /* memory sizing is a hack stolen from motorola.c.. hope it works for us */ - arch_zone_limits_init(max_zone_pfn); - - /* I really wish I knew why the following change made things better... 
-- Sam */ - free_area_init(max_zone_pfn); - - } static const pgprot_t protection_map[16] = { diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 54da60b81094..848cdee1380c 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -69,22 +69,15 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ static void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES]; int idx; /* Setup fixmaps */ for (idx = 0; idx < __end_of_fixed_addresses; idx++) clear_fixmap(idx); - /* Clean every zones */ - memset(zones_size, 0, sizeof(zones_size)); - #ifdef CONFIG_HIGHMEM highmem_init(); #endif - arch_zone_limits_init(zones_size); - /* We don't have holes in memory map */ - free_area_init(zones_size); } void __init setup_memory(void) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index f72a58f87878..2cd95020df08 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -162,11 +162,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } /* All PCI device belongs to logical Node-0 */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 269bf6335ac4..2575cba856d3 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -417,12 +417,7 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - pagetable_init(); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } #ifdef CONFIG_64BIT diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index babeb0e07687..082651facf4f 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -413,9 +413,5 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - pagetable_init(); - arch_zone_limits_init(zones_size); - free_area_init(zones_size); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 2cb666a65d9e..6b22f1995c16 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -51,15 +51,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - pagetable_init(); pgd_current = swapper_pg_dir; - arch_zone_limits_init(max_zone_pfn); - /* pass the memory from the bootmem allocator to the main allocator */ - free_area_init(max_zone_pfn); - flush_dcache_range((unsigned long)empty_zero_page, (unsigned long)empty_zero_page + PAGE_SIZE); } diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 67de93e7a685..78fb0734cdbc 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -47,14 +47,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - extern const char _s_kernel_ro[], _e_kernel_ro[]; /* @@ -145,8 +137,6 @@ void __init paging_init(void) map_ram(); - zone_sizes_init(); - /* self modifying code ;) */ /* Since the old TLB miss handler has been running up until now, * the kernel pages are still all RW, so we can still modify the diff --git 
a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index dc5bd3efe738..ce6f09ab7a90 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -698,14 +698,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = PFN_DOWN(memblock_end_of_DRAM()); } -static void __init parisc_bootmem_free(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); -} - void __init paging_init(void) { setup_bootmem(); @@ -716,7 +708,6 @@ void __init paging_init(void) flush_tlb_all_local(NULL); sparse_init(); - parisc_bootmem_free(); } static void alloc_btlb(unsigned long start, unsigned long end, int *slot, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 03c05ec56041..b716c9cd141c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -237,7 +237,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 }; unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); int zone_dma_bits; @@ -269,9 +268,6 @@ void __init paging_init(void) zone_dma_limit = DMA_BIT_MASK(zone_dma_bits); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - mark_nonram_nosave(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 97e8661fbcff..79b4792578c4 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -87,14 +87,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; } -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - #if defined(CONFIG_MMU) && defined(CONFIG_DEBUG_VM) #define LOG2_SZ_1K ilog2(SZ_1K) @@ -1443,7 +1435,6 @@ void __init misc_mem_init(void) /* The entire VMEMMAP region has been populated. 
Flush TLB for this region */ local_flush_tlb_kernel_range(VMEMMAP_START, VMEMMAP_END); #endif - zone_sizes_init(); arch_reserve_crashkernel(); memblock_dump_all(); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1c11ad84dddb..9ec608b5cbb1 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -97,14 +97,9 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) */ void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - vmem_map_init(); sparse_init(); zone_dma_limit = DMA_BIT_MASK(31); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } void mark_rodata_ro(void) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 5e7e63642611..3edee854b755 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -271,7 +271,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; unsigned long vaddr, end; sh_mv.mv_mem_init(); @@ -325,10 +324,6 @@ void __init paging_init(void) page_table_range_init(vaddr, end, swapper_pg_dir); kmap_coherent_init(); - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); } unsigned int mem_init_done = 0; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index fbaad449dfc9..931f872ce84a 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2459,17 +2459,6 @@ void __init paging_init(void) kernel_physical_mapping_init(); - { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); - } - printk("Booting Linux...\n"); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 81e90151db90..1b24c5e8d73d 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -970,13 +970,6 @@ void __init srmmu_paging_init(void) flush_tlb_all(); sparc_context_init(num_contexts); - - { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); - } } void mmu_info(struct seq_file *m) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2ac4e9debedd..89c8c8b94a79 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -91,16 +91,11 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init paging_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e7ef605a18d6..e52a262d3207 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1011,16 +1011,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) #endif } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - - arch_zone_limits_init(max_zone_pfns); - free_area_init(max_zone_pfns); -} - __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a34fff6ab2b..b55172118c91 100644 --- a/arch/x86/mm/init_32.c +++ 
b/arch/x86/mm/init_32.c @@ -655,7 +655,6 @@ void __init paging_init(void) */ olpc_dt_build_devicetree(); sparse_init(); - zone_sizes_init(); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9983017ecbe0..4daa40071c9f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -843,8 +843,6 @@ void __init paging_init(void) */ node_clear_state(0, N_MEMORY); node_clear_state(0, N_NORMAL_MEMORY); - - zone_sizes_init(); } #define PAGE_UNUSED 0xFD diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 097aadc250f7..7c4a41235323 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -17,7 +17,6 @@ unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long kernel_physical_mapping_change(unsigned long start, unsigned long end, unsigned long page_size_mask); -void zone_sizes_init(void); extern int after_bootmem; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 60299f359a3c..fe83a68335da 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -126,10 +126,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) void __init zones_init(void) { - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; - - arch_zone_limits_init(max_zone_pfn); - free_area_init(max_zone_pfn); print_vm_layout(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 477339b7a032..aacabf8a0b58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ struct pt_regs; struct folio_batch; void arch_mm_preinit(void); +void mm_core_init_early(void); void mm_core_init(void); void init_mm_internals(void); @@ -3540,7 +3541,7 @@ static inline unsigned long get_num_physpages(void) } /* - * Using memblock node mappings, an architecture may initialise its + * FIXME: Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * @@ -3555,7 +3556,6 @@ static inline unsigned long get_num_physpages(void) * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ -void free_area_init(unsigned long *max_zone_pfn); void arch_zone_limits_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, diff --git a/init/main.c b/init/main.c index b84818ad9685..445b5643ecec 100644 --- a/init/main.c +++ b/init/main.c @@ -1025,6 +1025,7 @@ void start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); + mm_core_init_early(); /* Static keys and static calls are needed by LSMs */ jump_label_init(); static_call_init(); diff --git a/mm/mm_init.c b/mm/mm_init.c index 0927bedb1254..6fb4415c0d1c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1807,7 +1807,6 @@ static void __init set_high_memory(void) /** * free_area_init - Initialise all pg_data_t and zone data - * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each @@ -1818,17 +1817,14 @@ static void __init set_high_memory(void) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
*/ -void __init free_area_init(unsigned long *max_zone_pfn) +static void __init free_area_init(void) { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; unsigned long start_pfn, end_pfn; int i, nid, zone; bool descending; - /* Record where the zone boundaries are */ - memset(arch_zone_lowest_possible_pfn, 0, - sizeof(arch_zone_lowest_possible_pfn)); - memset(arch_zone_highest_possible_pfn, 0, - sizeof(arch_zone_highest_possible_pfn)); + arch_zone_limits_init(max_zone_pfn); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); @@ -2678,13 +2674,19 @@ void __init __weak mem_init(void) { } +void __init mm_core_init_early(void) +{ + hugetlb_bootmem_alloc(); + + free_area_init(); +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { arch_mm_preinit(); - hugetlb_bootmem_alloc(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); -- cgit v1.2.3 From 5898aa8f9a0b42fe1f65c7364010ab15ec5c38bf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 14 Jan 2026 09:36:42 -0500 Subject: mm: fix OOM killer inaccuracy on large many-core systems Use the precise, albeit slower, RSS counter sums for the OOM killer task selection and console dumps. The approximated value is too imprecise on large many-core systems. The following rss tracking issues were noted by Sweet Tea Dorminy [1]; they led to picking the wrong tasks as OOM kill targets: Recently, several internal services had an RSS usage regression as part of a kernel upgrade. Previously, they were on a pre-6.2 kernel and were able to read RSS statistics in a backup watchdog process to monitor and decide if they'd overrun their memory budget. Now, however, a representative service with five threads, expected to use about a hundred MB of memory, on a 250-cpu machine had memory usage tens of megabytes different from the expected amount -- this constituted a significant percentage of inaccuracy, causing the watchdog to act. This was a result of commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") [1]. Previously, the memory error was bounded by 64*nr_threads pages, a very livable megabyte. Now, however, as a result of scheduler decisions moving the threads around the CPUs, the memory error could be as large as a gigabyte. This is a really tremendous inaccuracy for any few-threaded program on a large machine and impedes monitoring significantly. These stat counters are also used to make OOM killing decisions, so this additional inaccuracy could make a big difference in OOM situations -- either resulting in the wrong process being killed, or in less memory being returned from an OOM-kill than expected. Here is a (possibly incomplete) list of the prior approaches that were used or proposed, along with their downsides: 1) Per-thread rss tracking: large error on many-thread processes. 2) Per-CPU counters: up to 12% slower for short-lived processes and 9% increased system time in make test workloads [1]. Moreover, the inaccuracy increases as O(n^2) with the number of CPUs. 3) Per-NUMA-node counters: requires atomics on the fast path (overhead); the error is high on systems that have lots of NUMA nodes (32 times the number of NUMA nodes). commit 82241a83cd15 ("mm: fix the inaccurate memory statistics issue for users") introduced get_mm_counter_sum() for precise memory status queries from some proc files. The simple fix proposed here is to do the precise per-CPU counter sum every time a counter value needs to be read.
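For reference, here is the difference between the approximate and the precise read, sketched from the existing definitions in include/linux/mm.h (the bodies below paraphrase them; rss_stat is the per-mm array of percpu counters):

  /*
   * Approximate read: per-CPU deltas below the percpu_counter batch size
   * are not folded in yet, so the error grows with the number of CPUs.
   */
  static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
  {
          return percpu_counter_read_positive(&mm->rss_stat[member]);
  }

  /*
   * Precise read: folds in every CPU's delta under the counter lock --
   * O(nr_cpus) per read, which is the latency cost measured below.
   */
  static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
  {
          return percpu_counter_sum_positive(&mm->rss_stat[member]);
  }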
This applies to the OOM killer task selection and to the OOM task console dumps (printk). This change increases the latency of OOM killer execution in exchange for a more precise OOM target task selection. Effectively, the OOM killer iterates over all tasks and, for each relevant page type, the precise sum iterates over all possible CPUs. As a reference, here is the execution time of the OOM killer before/after the change, on an AMD EPYC 9654 96-Core (2 sockets), within a KVM guest configured with 256 logical CPUs:

                       |  before  |  after   |
  ---------------------|----------|----------|
   nr_processes=40     |  0.3 ms  |  0.5 ms  |
   nr_processes=10000  |  3.0 ms  | 80.0 ms  |

Link: https://lkml.kernel.org/r/20260114143642.47333-1-mathieu.desnoyers@efficios.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Link: https://lore.kernel.org/lkml/20250331223516.7810-2-sweettea-kernel@dorminy.me/ # [1] Signed-off-by: Mathieu Desnoyers Suggested-by: Michal Hocko Acked-by: Michal Hocko Reviewed-by: Baolin Wang Acked-by: Vlastimil Babka Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Martin Liu Cc: David Rientjes Cc: Shakeel Butt Cc: SeongJae Park Cc: Michal Hocko Cc: Johannes Weiner Cc: Sweet Tea Dorminy Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Christian Brauner Cc: Wei Yang Cc: David Hildenbrand Cc: Miaohe Lin Cc: Al Viro Cc: Yu Zhao Cc: Roman Gushchin Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Aboorva Devarajan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/oom_kill.c | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index aacabf8a0b58..aa90719234f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2906,6 +2906,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm) get_mm_counter(mm, MM_SHMEMPAGES); } +static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) +{ + return get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_ANONPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); +} + static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 94066316e3ec..5c6c95c169ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use.
*/ - points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); @@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg) pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), - get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), + task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm), + get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES), + get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), + get_mm_counter_sum(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES))); + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES))); out_finish: trace_finish_task_reaping(tsk->pid); out_unlock: @@ -960,9 +960,9 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) mark_oom_victim(victim); pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES)), + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); -- cgit v1.2.3 From 17fd82c3abe03c1e202959bb1a7c4ab448b36bef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 23 Jan 2026 20:12:20 +0000 Subject: mm/vma: add and use vma_assert_stabilised() Sometimes we wish to assert that a VMA is stable, that is - the VMA cannot be changed underneath us. This will be the case if EITHER the VMA lock or the mmap lock is held. In order to do so, we introduce a new assert vma_assert_stabilised() - this will make a lockdep assert if lockdep is enabled AND the VMA is read-locked. Currently lockdep tracking for VMA write locks is not implemented, so it suffices to check in this case that we have either an mmap read or write semaphore held. Note that because the VMA lock uses the non-standard vmlock_dep_map naming convention, we cannot use lockdep_assert_is_write_held() so have to open code this ourselves via lockdep-asserting that lock_is_held_type(&vma->vmlock_dep_map, 0). We have to be careful here - for instance when merging a VMA, we use the mmap write lock to stabilise the examination of adjacent VMAs which might be simultaneously VMA read-locked whilst being faulted in. If we were to assert VMA read lock using lockdep we would encounter an incorrect lockdep assert. 
Also, we have to be careful about asserting that mmap locks are held - if we try to address the above issue by first checking whether the mmap lock is held and, if so, asserting it via lockdep, we may find that we were raced by another thread acquiring an mmap read lock simultaneously, one that either we don't own (and thus can be released at any time - so we are not stable) or that was indeed released since we last checked. So to deal with these complexities we end up with either a precise (if lockdep is enabled) or imprecise (if not) approach - in the first instance we assert that the lock is held using lockdep and thus know whether we own it. If we do own it, then the check is complete, otherwise we must check for the VMA read lock being held (a VMA write lock implies the mmap write lock, so the mmap lock check suffices for this). If lockdep is not enabled we simply check if the mmap lock is held and risk a false negative (i.e. not asserting when we should do). There are a couple of places in the kernel where we already do this stabilisation check - the anon_vma_name() helper in mm/madvise.c and vma_flag_set_atomic() in include/linux/mm.h, which we update to use vma_assert_stabilised(). This change abstracts these into vma_assert_stabilised(), uses lockdep if possible, and avoids a duplicate check of whether the mmap lock is held. This is also self-documenting and lays the foundations for further VMA stability checks in the code. The only functional change here is adding the lockdep check. Link: https://lkml.kernel.org/r/6c9e64bb2b56ddb6f806fde9237f8a00cb3a776b.1769198904.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Boqun Feng Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shakeel Butt Cc: Waiman Long Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +---- include/linux/mmap_lock.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 4 +--- 3 files changed, 54 insertions(+), 7 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index aa90719234f1..2c6c6d00ed73 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1008,10 +1008,7 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, { unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); - /* mmap read lock/VMA read lock must be held. */ - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); if (__vma_flag_atomic_valid(vma, bit)) set_bit((__force int)bit, bitmap); } diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 90fc32b683dd..93eca48bc443 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -374,6 +374,52 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +/** + * vma_assert_stabilised() - assert that this VMA cannot be changed from + * underneath us either by having a VMA or mmap lock held. + * @vma: The VMA whose stability we wish to assess. + * + * If lockdep is enabled we can precisely ensure stability via either an mmap + * lock owned by us or a specific VMA lock. + * + * With lockdep disabled we may sometimes race with other threads acquiring the + * mmap read lock simultaneously with our VMA read lock. + */ +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* + * If another thread owns an mmap lock, it may go away at any time, and + * thus is no guarantee of stability.
+ * + * If lockdep is enabled we can accurately determine if an mmap lock is + * held and owned by us. Otherwise we must approximate. + * + * It doesn't necessarily mean we are not stabilised however, as we may + * hold a VMA read lock (not a write lock as this would require an owned + * mmap lock). + * + * If (assuming lockdep is not enabled) we were to assert a VMA read + * lock first we may also run into issues, as other threads can hold VMA + * read locks simultaneously with us. + * + * Therefore if lockdep is not enabled we risk a false negative (i.e. no + * assert fired). If accurate checking is required, enable lockdep. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (lockdep_is_held(&vma->vm_mm->mmap_lock)) + return; + } else { + if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) + return; + } + + /* + * We're not stabilised by the mmap lock, so assert that we're + * stabilised by a VMA lock. + */ + vma_assert_locked(vma); +} + static inline bool vma_is_attached(struct vm_area_struct *vma) { return refcount_read(&vma->vm_refcnt); @@ -476,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); } +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* If no VMA locks, then either mmap lock suffices to stabilise. */ + mmap_assert_locked(vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) diff --git a/mm/madvise.c b/mm/madvise.c index 863d55b8a658..19cf480eed49 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -109,9 +109,7 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); return vma->anon_name; } -- cgit v1.2.3
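For illustration, a reader of VMA fields can now document its locking contract with the new assert instead of open-coding the "mmap lock held OR VMA lock held" check. The helper below is invented for the example; vma_assert_stabilised() is the API added by this commit:

  /*
   * Hypothetical caller: the VMA only needs to not change underneath us,
   * which either the mmap lock or the VMA lock guarantees.
   */
  static unsigned long vma_span(struct vm_area_struct *vma)
  {
          vma_assert_stabilised(vma);
          return vma->vm_end - vma->vm_start;
  }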