From dcdfdd40fa82b6704d2841938e5c8ec3051eb0d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 6 Jun 2023 17:26:29 +0300 Subject: mm: Add support for unaccepted memory

UEFI Specification version 2.9 introduces the concept of memory acceptance. Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, require memory to be accepted before it can be used by the guest. Acceptance happens via a protocol specific to the Virtual Machine platform.

There are several ways the kernel can deal with unaccepted memory:

1. Accept all the memory during boot. It is easy to implement and has no runtime cost once the system is booted. The downside is a very long boot time. Acceptance can be parallelized across multiple CPUs to keep it manageable (e.g. via DEFERRED_STRUCT_PAGE_INIT), but it tends to saturate memory bandwidth and does not scale beyond that point.

2. Accept a block of memory on first use. It requires more infrastructure and changes in the page allocator to make it work, but it provides a good boot time. On-demand memory acceptance means latency spikes every time the kernel steps onto a new memory block. The spikes will go away once the workload's data set size stabilizes or all memory gets accepted.

3. Accept all memory in the background. Introduce a thread (or several) that accepts memory proactively. This minimizes the time the system experiences latency spikes on memory allocation while keeping boot time low. The approach cannot function on its own: it is an extension of #2, since background acceptance requires a functional scheduler, but the page allocator may need to tap into unaccepted memory before that. The downside is that these threads also steal CPU cycles and memory bandwidth from the user's workload and may hurt the user experience.

Implement #1 and #2 for now. #2 is the default. Some workloads may want to use #1 with accept_memory=eager on the kernel command line. #3 can be implemented later based on users' demands.

Support of unaccepted memory requires a few changes in core-mm code:

- memblock accepts memory on allocation. It serves early boot memory allocations and doesn't limit them to the pre-accepted pool of memory.

- the page allocator accepts memory on the first allocation of a page. When the kernel runs out of accepted memory, it accepts memory until the high watermark is reached. This helps to minimize fragmentation.

EFI code will provide two helpers if the platform supports unaccepted memory (see the usage sketch below):

- accept_memory() makes a range of physical addresses accepted.

- range_contains_unaccepted_memory() checks whether anything within the range of physical addresses requires acceptance.

Signed-off-by: Kirill A. Shutemov Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Vlastimil Babka Acked-by: Mike Rapoport # memblock Link: https://lore.kernel.org/r/20230606142637.5171-2-kirill.shutemov@linux.intel.com
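An illustrative aside (not part of the patch): a minimal sketch of how a caller could combine the two EFI-backed helpers to accept a physical range lazily before handing it out. The wrapper name is hypothetical; only accept_memory() and range_contains_unaccepted_memory() come from this series.

	/* Hypothetical wrapper, for illustration only: accept a range on demand. */
	static void ensure_range_accepted(phys_addr_t start, phys_addr_t end)
	{
		if (range_contains_unaccepted_memory(start, end))
			accept_memory(start, end);
	}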
--- mm/mm_init.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677854..1cfc08e25f93 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone) INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; } + +#ifdef CONFIG_UNACCEPTED_MEMORY + INIT_LIST_HEAD(&zone->unaccepted_pages); +#endif } void __meminit init_currently_empty_zone(struct zone *zone, @@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn, return; } + /* Accept chunks smaller than MAX_ORDER upfront */ + accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); -- cgit v1.2.3 From 072ba380cefc7722c9442cc14a9c2810898c13ac Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:09 +0800 Subject: mm: page_alloc: move mirrored_kernelcore into mm_init.c Patch series "mm: page_alloc: misc cleanup and refactor", v2. This series aims to further shrink page_alloc.c and do some cleanup; no functional changes intended. This patch (of 13): Since commit 9420f89db2dd ("mm: move most of core MM initialization to mm/mm_init.c"), mirrored_kernelcore should be moved into mm_init.c, as most of the related code is already there. Link: https://lkml.kernel.org/r/20230516063821.121844-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230516063821.121844-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 ++ mm/page_alloc.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677854..da162b7a044c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core, return 0; } +bool mirrored_kernelcore __initdata_memblock; + /* * kernelcore=size sets the amount of memory for use for allocations that * cannot be reclaimed or migrated. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af9c995d3c1e..d1086aeca8f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -374,8 +373,6 @@ int user_min_free_kbytes = -1; int watermark_boost_factor __read_mostly = 15000; int watermark_scale_factor = 10; -bool mirrored_kernelcore __initdata_memblock; - /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -- cgit v1.2.3 From 5e7d5da2f41c1d762cd1dbdd97758be6c414ea29 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:10 +0800 Subject: mm: page_alloc: move init_on_alloc/free() into mm_init.c Since commit f2fc4b44ec2b ("mm: move init_mem_debugging_and_hardening() to mm/mm_init.c"), the init_on_alloc and init_on_free definitions are better moved there too.
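For context, an illustrative sketch (not something this patch adds): init_on_alloc and init_on_free are static keys that hot allocation/free paths consult through small inline helpers, which is why the keys are exported; the in-tree helper for the allocation side is of roughly this shape.

	static inline bool want_init_on_alloc(gfp_t flags)
	{
		if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
					&init_on_alloc))
			return true;
		return flags & __GFP_ZERO;
	}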
Link: https://lkml.kernel.org/r/20230516063821.121844-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 ++++++ mm/page_alloc.c | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index da162b7a044c..15201887f8e0 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2543,6 +2543,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, __free_pages_core(page, order); } +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); +EXPORT_SYMBOL(init_on_alloc); + +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); +EXPORT_SYMBOL(init_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1086aeca8f2..4f094ba7c8fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -233,11 +233,6 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); -EXPORT_SYMBOL(init_on_alloc); - -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); -EXPORT_SYMBOL(init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is -- cgit v1.2.3 From 904d58578fce531be07619a2bc2cdc16c9fd49b6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:11 +0800 Subject: mm: page_alloc: move set_zone_contiguous() into mm_init.c set_zone_contiguous() is only used in mm init/hotplug, and clear_zone_contiguous() only used in hotplug, move them from page_alloc.c to the more appropriate file. Link: https://lkml.kernel.org/r/20230516063821.121844-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 3 --- mm/internal.h | 7 +++++++ mm/mm_init.c | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 --------------------------- 4 files changed, 29 insertions(+), 30 deletions(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 9fcbf5706595..04bc286eed42 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -326,9 +326,6 @@ static inline int remove_memory(u64 start, u64 size) static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..c99da2cfac71 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -371,6 +371,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } +void set_zone_contiguous(struct zone *zone); + +static inline void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); diff --git a/mm/mm_init.c b/mm/mm_init.c index 15201887f8e0..0fd4ddfdfb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2330,6 +2330,28 @@ void __init init_cma_reserved_pageblock(struct page *page) } #endif +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = pageblock_end_pfn(block_start_pfn); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + cond_resched(); + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + void __init page_alloc_init_late(void) { struct zone *zone; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f094ba7c8fb..7bb0d6abfe3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1532,33 +1532,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, return start_page; } -void set_zone_contiguous(struct zone *zone) -{ - unsigned long block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - -void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - /* * The order of subdivision here is critical for the IO subsystem. 
* Please do not alter this order without good reasons and regression -- cgit v1.2.3 From e95d372c4cd46b6ec4eeacc07adcb7260ab4cfa0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:20 +0800 Subject: mm: page_alloc: move sysctls into their own file This moves all page alloc related sysctls to their own file, as part of the kernel/sysctl.c spring cleaning, and also moves some function declarations from mm.h into internal.h. Link: https://lkml.kernel.org/r/20230516063821.121844-13-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ------ include/linux/mmzone.h | 21 ---------- kernel/sysctl.c | 67 -------------------------------- mm/internal.h | 11 ++++++ mm/mm_init.c | 2 + mm/page_alloc.c | 103 ++++++++++++++++++++++++++++++++++++++++++------- 6 files changed, 102 insertions(+), 113 deletions(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2382eaf6fd81..6d7e03d83da7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2994,12 +2994,6 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_range(unsigned long, int, unsigned long, - unsigned long, unsigned long, enum meminit_context, - struct vmem_altmap *, int migratetype); -extern void setup_per_zone_wmarks(void); -extern void calculate_min_free_kbytes(void); -extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); @@ -3020,11 +3014,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -/* page_alloc.c */ -extern int min_free_kbytes; -extern int watermark_boost_factor; -extern int watermark_scale_factor; - /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a4889c9d4055..3a68326c9989 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1512,27 +1512,6 @@ static inline bool has_managed_dma(void) } #endif -/* These two functions are used to setup the per zone pages min values */ -struct ctl_table; - -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int numa_zonelist_order_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -extern int percpu_pagelist_high_fraction; -extern char numa_zonelist_order[]; -#define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NUMA diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..a57de67f032f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2119,13 +2119,6 @@ static struct ctl_table vm_table[] = { .extra2 =
SYSCTL_ONE, }, #endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, { .procname = "drop_caches", .data = &sysctl_drop_caches, @@ -2135,39 +2128,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_THREE_THOUSAND, - }, - { - .procname = "percpu_pagelist_high_fraction", - .data = &percpu_pagelist_high_fraction, - .maxlen = sizeof(percpu_pagelist_high_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, @@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, #endif #ifdef CONFIG_SMP { @@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = { .proc_handler = mmap_min_addr_handler, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/mm/internal.h b/mm/internal.h index c99da2cfac71..66d7ddf7e211 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -213,6 +213,13 @@ static inline bool is_check_pages_enabled(void) return static_branch_unlikely(&check_pages_enabled); } +extern int min_free_kbytes; + +void setup_per_zone_wmarks(void); +void calculate_min_free_kbytes(void); +int __meminit init_per_zone_wmark_min(void); +void page_alloc_sysctl_init(void); + /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* @@ -423,6 +430,10 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); +void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + + int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); diff --git a/mm/mm_init.c b/mm/mm_init.c index 0fd4ddfdfb2e..10bf560302c4 100644 --- a/mm/mm_init.c +++ 
b/mm/mm_init.c @@ -2392,6 +2392,8 @@ void __init page_alloc_init_late(void) /* Initialize page ext after all struct pages are initialized. */ if (deferred_struct_pages) page_ext_init(); + + page_alloc_sysctl_init(); } #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 005aa0202ae0..d19a05264125 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -206,7 +206,6 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; /* @@ -302,8 +301,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; -int watermark_scale_factor = 10; +static int watermark_boost_factor __read_mostly = 15000; +static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4880,12 +4879,12 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -char numa_zonelist_order[] = "Node"; - +static char numa_zonelist_order[] = "Node"; +#define NUMA_ZONELIST_ORDER_LEN 16 /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(struct ctl_table *table, int write, +static int numa_zonelist_order_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -4893,7 +4892,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, return proc_dostring(table, write, buffer, length, ppos); } - static int node_load[MAX_NUMNODES]; /** @@ -5296,6 +5294,7 @@ static int zone_batchsize(struct zone *zone) #endif } +static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU @@ -5825,7 +5824,7 @@ postcore_initcall(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, +static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5841,7 +5840,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, +static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5871,7 +5870,7 @@ static void setup_min_unmapped_ratio(void) } -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5898,7 +5897,7 @@ static void setup_min_slab_ratio(void) sysctl_min_slab_ratio) / 100; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5922,8 +5921,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. 
*/ -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) +static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) { int i; @@ -5943,7 +5942,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, +static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5976,6 +5975,82 @@ out: return ret; } +static struct ctl_table page_alloc_sysctl_table[] = { + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE_THOUSAND, + }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif + {} +}; + +void __init page_alloc_sysctl_init(void) +{ + register_sysctl_init("vm", page_alloc_sysctl_table); +} + #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) -- cgit v1.2.3 From ba1b67c79cb3c5f5d11cb475bb7045929b235538 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Fri, 26 May 2023 08:52:50 +0000 Subject: mm/mm_init.c: introduce reset_memoryless_node_totalpages() Currently, no matter whether a node actually has memory or not, calculate_node_totalpages() is used to account number of pages in zone/node. However, for node without memory, these unnecessary calculations can be skipped. All the zone/node page counts can be set to 0 directly. So introduce reset_memoryless_node_totalpages() to perform this action. 
Furthermore, calculate_node_totalpages() only gets called for nodes with memory. Link: https://lkml.kernel.org/r/20230526085251.1977-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/mm_init.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 10bf560302c4..6f7da396b67b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1176,10 +1176,6 @@ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_start_pfn, zone_end_pfn; unsigned long nr_absent; - /* When hotadd a new node from cpu_up(), the node should be empty */ - if (!node_start_pfn && !node_end_pfn) - return 0; - zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -1229,9 +1225,6 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - /* When hotadd a new node from cpu_up(), the node should be empty */ - if (!node_start_pfn && !node_end_pfn) - return 0; /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); @@ -1252,6 +1245,24 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, return *zone_end_pfn - *zone_start_pfn; } +static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) { + z->zone_start_pfn = 0; + z->spanned_pages = 0; + z->present_pages = 0; +#if defined(CONFIG_MEMORY_HOTPLUG) + z->present_early_pages = 0; +#endif + } + + pgdat->node_spanned_pages = 0; + pgdat->node_present_pages = 0; + pr_debug("On node %d totalpages: 0\n", pgdat->node_id); +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) @@ -1704,11 +1715,13 @@ static void __init free_area_init_node(int nid) pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + + calculate_node_totalpages(pgdat, start_pfn, end_pfn); } else { pr_info("Initmem setup node %d as memoryless\n", nid); - } - calculate_node_totalpages(pgdat, start_pfn, end_pfn); + reset_memoryless_node_totalpages(pgdat); + } alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); -- cgit v1.2.3 From 1c2d252f5b4289e1c6840bcf394157b70c639d6e Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Fri, 26 May 2023 08:52:51 +0000 Subject: mm/mm_init.c: do not calculate zone_start_pfn/zone_end_pfn in zone_absent_pages_in_node() In calculate_node_totalpages(), zone_start_pfn/zone_end_pfn are already calculated in zone_spanned_pages_in_node(), so use them as parameters instead of node_start_pfn/node_end_pfn, and the duplicated calculation can be dropped.
Link: https://lkml.kernel.org/r/20230526085251.1977-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: Haifeng Xu Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/mm_init.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 6f7da396b67b..19652c1b3426 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1168,20 +1168,15 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn) + unsigned long zone_start_pfn, + unsigned long zone_end_pfn) { - unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; - unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long zone_start_pfn, zone_end_pfn; unsigned long nr_absent; - zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); - zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); + /* zone is empty, we don't have any absent pages */ + if (zone_start_pfn == zone_end_pfn) + return 0; - adjust_zone_range_for_zone_movable(nid, zone_type, - node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* @@ -1274,7 +1269,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; unsigned long spanned, absent; - unsigned long size, real_size; + unsigned long real_size; spanned = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, @@ -1282,23 +1277,22 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, &zone_start_pfn, &zone_end_pfn); absent = zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn); + zone_start_pfn, + zone_end_pfn); - size = spanned; - real_size = size - absent; + real_size = spanned - absent; - if (size) + if (spanned) zone->zone_start_pfn = zone_start_pfn; else zone->zone_start_pfn = 0; - zone->spanned_pages = size; + zone->spanned_pages = spanned; zone->present_pages = real_size; #if defined(CONFIG_MEMORY_HOTPLUG) zone->present_early_pages = real_size; #endif - totalpages += size; + totalpages += spanned; realtotalpages += real_size; } -- cgit v1.2.3 From 837c2ba56d6fd1ecf7a1c5aa0cdc872f3b74185b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Sun, 28 May 2023 04:57:20 +0000 Subject: mm/mm_init.c: remove free_area_init_memoryless_node() free_area_init_memoryless_node() is just a wrapper of free_area_init_node(), remove it to clean up. Link: https://lkml.kernel.org/r/20230528045720.4835-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/mm_init.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 19652c1b3426..3ddd18a89b66 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1754,11 +1754,6 @@ void __init setup_nr_node_ids(void) } #endif -static void __init free_area_init_memoryless_node(int nid) -{ - free_area_init_node(nid); -} - /* * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. 
For * such cases we allow max_zone_pfn sorted in the descending order @@ -1869,7 +1864,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); - free_area_init_memoryless_node(nid); + free_area_init_node(nid); /* * We do not want to confuse userspace by sysfs -- cgit v1.2.3 From e3d9b45fb17cfddb1c414b5981743d4245fcf486 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 1 Jun 2023 06:35:35 +0000 Subject: mm/mm_init.c: move set_pageblock_order() to free_area_init() pageblock_order only needs to be set once, there is no need to initialize it in every zone/node. Link: https://lkml.kernel.org/r/20230601063536.26882-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: Mike Rapoport (IBM) Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/mm_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 3ddd18a89b66..015355dfdc0b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1585,7 +1585,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) if (!size) continue; - set_pageblock_order(); setup_usemap(zone); init_currently_empty_zone(zone, zone->zone_start_pfn, size); } @@ -1852,6 +1851,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); + set_pageblock_order(); + for_each_node(nid) { pg_data_t *pgdat; -- cgit v1.2.3 From 91ff4d754a1895feb4216e94028edd76cbbc0770 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Wed, 7 Jun 2023 03:24:02 +0000 Subject: mm/mm_init.c: drop 'nid' parameter from check_for_memory() The node_id in pgdat has already been set in free_area_init_node(), so use it internally instead of passing a redundant parameter. Link: https://lkml.kernel.org/r/20230607032402.4679-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: Mike Rapoport (IBM) Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/mm_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 015355dfdc0b..25a585cb0c0a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1724,7 +1724,7 @@ static void __init free_area_init_node(int nid) } /* Any regular or high memory on that node ? */ -static void check_for_memory(pg_data_t *pgdat, int nid) +static void check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; @@ -1732,9 +1732,9 @@ static void check_for_memory(pg_data_t *pgdat, int nid) struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { if (IS_ENABLED(CONFIG_HIGHMEM)) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(pgdat->node_id, N_HIGH_MEMORY); if (zone_type <= ZONE_NORMAL) - node_set_state(nid, N_NORMAL_MEMORY); + node_set_state(pgdat->node_id, N_NORMAL_MEMORY); break; } } @@ -1886,7 +1886,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Any memory on that node */ if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); - check_for_memory(pgdat, nid); + check_for_memory(pgdat); } memmap_init(); -- cgit v1.2.3 From 32b6a4a1745a46918f748f6fb7641e588fbec6f2 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Wed, 7 Jun 2023 02:50:56 +0000 Subject: mm/mm_init.c: remove reset_node_present_pages() reset_node_present_pages() only get called in hotadd_init_pgdat(), move the action that clear present pages to free_area_init_core_hotplug(), so the helper can be removed. 
Link: https://lkml.kernel.org/r/20230607025056.1348-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 18 ------------------ mm/mm_init.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5248323fc0f7..35db4108bb15 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1156,16 +1156,6 @@ failed_addition: return ret; } -static void reset_node_present_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->present_pages = 0; - - pgdat->node_present_pages = 0; -} - /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_init_pgdat(int nid) { @@ -1188,14 +1178,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) */ build_all_zonelists(pgdat); - /* - * When memory is hot-added, all the memory is in offline state. So - * clear all zones' present_pages because they will be updated in - * online_pages() and offline_pages(). - * TODO: should be in free_area_init_core_hotplug? - */ - reset_node_present_pages(pgdat); - return pgdat; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 25a585cb0c0a..122e9bf3fa73 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1509,6 +1509,8 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) pgdat->kswapd_order = 0; pgdat->kswapd_highest_zoneidx = 0; pgdat->node_start_pfn = 0; + pgdat->node_present_pages = 0; + for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -1516,8 +1518,17 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) memset(p, 0, sizeof(*p)); } - for (z = 0; z < MAX_NR_ZONES; z++) - zone_init_internals(&pgdat->node_zones[z], z, nid, 0); + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages and managed_pages because they will + * be updated in online_pages() and offline_pages(). + */ + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + zone->present_pages = 0; + zone_init_internals(zone, z, nid, 0); + } } #endif -- cgit v1.2.3 From 61167ad5fecdeaa037f3df1ba354dddd5f66a1ed Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Mon, 19 Jun 2023 10:34:06 +0800 Subject: mm: pass nid to reserve_bootmem_region() early_pfn_to_nid() is called frequently in init_reserved_page(), it returns the node id of the PFN. These PFN are probably from the same memory region, they have the same node id. It's not necessary to call early_pfn_to_nid() for each PFN. Pass nid to reserve_bootmem_region() and drop the call to early_pfn_to_nid() in init_reserved_page(). Also, set nid on all reserved pages before doing this, as some reserved memory regions may not be set nid. The most beneficial function is memmap_init_reserved_pages() if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. The following data was tested on an x86 machine with 190GB of RAM. 
before: memmap_init_reserved_pages() 67ms after: memmap_init_reserved_pages() 20ms Link: https://lkml.kernel.org/r/20230619023406.424298-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- mm/memblock.c | 31 +++++++++++++++++++++---------- mm/mm_init.c | 30 +++++++++++++++++------------- 3 files changed, 40 insertions(+), 24 deletions(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf43deb25553..9ecb8b9c07f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2940,7 +2940,8 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void adjust_managed_page_count(struct page *page, long count); -extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); +extern void reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void free_reserved_page(struct page *page) diff --git a/mm/memblock.c b/mm/memblock.c index da4264528e1e..46739551d4d1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2082,19 +2082,30 @@ static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; phys_addr_t start, end; - u64 i; + int nid; + + /* + * set nid on all reserved pages and also treat struct + * pages for the NOMAP regions as PageReserved + */ + for_each_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; + + if (memblock_is_nomap(region)) + reserve_bootmem_region(start, end, nid); + + memblock_set_node(start, end, &memblock.reserved, nid); + } /* initialize struct pages for the reserved regions */ - for_each_reserved_mem_range(i, &start, &end) - reserve_bootmem_region(start, end); + for_each_reserved_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; - /* and also treat struct pages for the NOMAP regions as PageReserved */ - for_each_mem_region(region) { - if (memblock_is_nomap(region)) { - start = region->base; - end = start + region->size; - reserve_bootmem_region(start, end); - } + reserve_bootmem_region(start, end, nid); } } diff --git a/mm/mm_init.c b/mm/mm_init.c index 122e9bf3fa73..7ffa609673ea 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -646,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) } /* Returns true if the struct page for the pfn is initialised */ -static inline bool __meminit early_page_initialised(unsigned long pfn) +static inline bool __meminit early_page_initialised(unsigned long pfn, int nid) { - int nid = early_pfn_to_nid(pfn); - if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return false; @@ -695,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_reserved_page(unsigned long pfn) +static void __meminit init_reserved_page(unsigned long pfn, int nid) { pg_data_t *pgdat; - int nid, zid; + int zid; - if (early_page_initialised(pfn)) + if (early_page_initialised(pfn, nid)) return; - nid = early_pfn_to_nid(pfn); pgdat = NODE_DATA(nid); for (zid = 0; zid < MAX_NR_ZONES; zid++) { @@ -717,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn) #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} -static inline bool early_page_initialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn, int nid) { 
return true; } @@ -727,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_reserved_page(unsigned long pfn) +static inline void init_reserved_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -738,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) +void __meminit reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -747,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) if (pfn_valid(start_pfn)) { struct page *page = pfn_to_page(start_pfn); - init_reserved_page(start_pfn); + init_reserved_page(start_pfn, nid); /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); @@ -2572,8 +2570,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (!early_page_initialised(pfn)) - return; + + if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { + int nid = early_pfn_to_nid(pfn); + + if (!early_page_initialised(pfn, nid)) + return; + } + if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ return; -- cgit v1.2.3