From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..51c17bf7b127 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask) return true; } -void __fs_reclaim_acquire(void) +void __fs_reclaim_acquire(unsigned long ip) { - lock_map_acquire(&__fs_reclaim_map); + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); } -void __fs_reclaim_release(void) +void __fs_reclaim_release(unsigned long ip) { - lock_map_release(&__fs_reclaim_map); + lock_release(&__fs_reclaim_map, ip); } void fs_reclaim_acquire(gfp_t gfp_mask) @@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_RET_IP_); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); @@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_release(); + __fs_reclaim_release(_RET_IP_); } } EXPORT_SYMBOL_GPL(fs_reclaim_release); -- cgit v1.2.3 From eb2169cee36fc492407a2d483c286b977a46288a Mon Sep 17 00:00:00 2001 From: liuhailong Date: Thu, 2 Sep 2021 14:53:01 -0700 Subject: mm: add kernel_misc_reclaimable in show_free_areas Print NR_KERNEL_MISC_RECLAIMABLE stat from show_free_areas() so users can check whether the shrinker is working correctly and to show the current memory usage. 
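As a hedged illustration (not part of this patch): a driver that keeps a shrinker-reclaimable page cache typically feeds this counter through mod_node_page_state(), so the value printed above tracks the cache size. The helper name below is made up; NR_KERNEL_MISC_RECLAIMABLE and mod_node_page_state() are the existing interfaces.

#include <linux/mm.h>
#include <linux/vmstat.h>

/*
 * Account pages added to (nr_pages > 0) or released from (nr_pages < 0)
 * a driver-owned reclaimable cache, so they show up in show_free_areas().
 */
static void my_cache_account(struct page *page, long nr_pages)
{
        mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
                            nr_pages);
}

The driver's shrinker would call the same helper with a negative delta as it frees pages, which is what lets users correlate the printed value with shrinker activity.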
Link: https://lkml.kernel.org/r/20210813104725.4562-1-liuhailong@oppo.com Signed-off-by: liuhailong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51c17bf7b127..60a867a549bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), @@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); -- cgit v1.2.3 From c3ab6baf6a004eab7344a1d8880a971f2414e1b6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:57:56 -0700 Subject: mm/page_alloc: always initialize memory map for the holes Patch series "mm: ensure consistency of memory map poisoning". Currently memory map allocation for FLATMEM case does not poison the struct pages regardless of CONFIG_PAGE_POISON setting. This happens because allocation of the memory map for FLATMEM and SPARSMEM use different memblock functions and those that are used for SPARSMEM case (namely memblock_alloc_try_nid_raw() and memblock_alloc_exact_nid_raw()) implicitly poison the allocated memory. Another side effect of this implicit poisoning is that early setup code that uses the same functions to allocate memory burns cycles for the memory poisoning even if it was not intended. These patches introduce memmap_alloc() wrapper that ensure that the memory map allocation is consistent for different memory models. This patch (of 4): Currently memory map for the holes is initialized only when SPARSEMEM memory model is used. Yet, even with FLATMEM there could be holes in the physical memory layout that have memory map entries. For instance, the memory reserved using e820 API on i386 or "reserved-memory" nodes in device tree would not appear in memblock.memory and hence the struct pages for such holes will be skipped during memory map initialization. These struct pages will be zeroed because the memory map for FLATMEM systems is allocated with memblock_alloc_node() that clears the allocated memory. While zeroed struct pages do not cause immediate problems, the correct behaviour is to initialize every page using __init_single_page(). Besides, enabling page poison for FLATMEM case will trigger PF_POISONED_CHECK() unless the memory map is properly initialized. Make sure init_unavailable_range() is called for both SPARSEMEM and FLATMEM so that struct pages representing memory holes would appear as PG_Reserved with any memory layout. 
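As a hedged sketch of what this buys callers (the walker and its start_pfn/end_pfn arguments are illustrative, not from the patch): a pfn walker on a FLATMEM kernel can now treat struct pages inside physical holes the same way it already could on SPARSEMEM -- they are fully initialized and marked PG_reserved instead of being all zeroes.

#include <linux/mm.h>
#include <linux/page-flags.h>

static void walk_pfn_range(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                struct page *page = pfn_to_page(pfn);

                /* Holes and firmware-reserved ranges are PG_reserved now. */
                if (PageReserved(page))
                        continue;

                /* Safe to use page_zone()/page_to_nid() from here on. */
        }
}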
[rppt@kernel.org: fix microblaze] Link: https://lkml.kernel.org/r/YQWW3RCE4eWBuMu/@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Tested-by: Guenter Roeck Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/page.h | 3 +-- mm/page_alloc.c | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index ce550978f4fc..4b8b2fa78fc5 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) - +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60a867a549bd..bd2efd6287ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6642,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6687,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn, -- cgit v1.2.3 From c803b3c8b3b70f306ee6300bf8acdd70ffd1441a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:02 -0700 Subject: mm: introduce memmap_alloc() to unify memory map allocation There are several places that allocate memory for the memory map: alloc_node_mem_map() for FLATMEM, sparse_buffer_init() and __populate_section_memmap() for SPARSEMEM. The memory allocated in the FLATMEM case is zeroed and it is never poisoned, regardless of CONFIG_PAGE_POISON setting. The memory allocated in the SPARSEMEM cases is not zeroed and it is implicitly poisoned inside memblock if CONFIG_PAGE_POISON is set. Introduce memmap_alloc() wrapper for memblock allocators that will be used for both FLATMEM and SPARSEMEM cases and will makei memory map zeroing and poisoning consistent for different memory models. 
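A hedged usage sketch (nid and nr_pages are placeholders; the wrapper itself is added in the page_alloc.c hunk that follows): any new memory-map allocation is expected to go through memmap_alloc() so both memory models behave identically.

static void __init alloc_example_memmap(int nid, unsigned long nr_pages)
{
        struct page *map;

        map = memmap_alloc(array_size(sizeof(struct page), nr_pages),
                           SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, nid, false);
        if (!map)
                panic("Failed to allocate memory map for node %d\n", nid);

        /*
         * The map is not zeroed; with page poisoning enabled it is filled
         * with PAGE_POISON_PATTERN, so each struct page still has to be
         * set up via __init_single_page() before first use.
         */
}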
Link: https://lkml.kernel.org/r/20210714123739.16493-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++++ mm/page_alloc.c | 24 ++++++++++++++++++++++-- mm/sparse.c | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 31ff935b2547..57e28261a3b1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd2efd6287ce..71ad97c96075 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6748,6 +6748,26 @@ static void __init memmap_init(void) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7519,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); diff --git a/mm/sparse.c b/mm/sparse.c index fd29b3abffa0..120bc8ea5293 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -436,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -464,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; } -- cgit v1.2.3 From b346075fcf5dda7f9e9ae671703aae60e8a94564 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 2 Sep 2021 14:58:08 -0700 Subject: mm/page_alloc.c: fix 'zone_id' may be used uninitialized in this function warning When compiling with -Werror, cc1 will warn that 'zone_id' may be used uninitialized in this function warning. Initialize the zone_id as 0. Its safe to assume that if the code reaches this point it has at least one numa node with memory, so no need for an assertion before init_unavilable_range. 
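For reference, a reduced illustration of the pattern -Wmaybe-uninitialized trips over here; nr_ranges and pick_zone_id() are placeholders rather than the real memmap_init() internals:

        int zone_id;                            /* no initializer */
        int i;

        for (i = 0; i < nr_ranges; i++)         /* compiler cannot prove this loop runs */
                zone_id = pick_zone_id(i);

        init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);        /* warning fires here */

Initializing zone_id to 0 silences the warning, and as argued above any system reaching this code has at least one node with memory, so the loop always executes and the 0 is never actually consumed.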
Link: https://lkml.kernel.org/r/20210716210336.1114114-1-npache@redhat.com Fixes: 122e093c1734 ("mm/page_alloc: fix memory map initialization for descending nodes") Signed-off-by: Nico Pache Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71ad97c96075..04021e37120e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6715,7 +6715,7 @@ static void __init memmap_init(void) { unsigned long start_pfn, end_pfn; unsigned long hole_pfn = 0; - int i, j, zone_id, nid; + int i, j, zone_id = 0, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { struct pglist_data *node = NODE_DATA(nid); -- cgit v1.2.3 From 3b446da6be7a722d769e23f68dbaf4ebb2eda542 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:10 -0700 Subject: mm/page_alloc: make alloc_node_mem_map() __init rather than __ref alloc_node_mem_map() is only called from free_area_init_node(), which is an __init function. Make the actual alloc_node_mem_map() also __init and its stub version static inline. Link: https://lkml.kernel.org/r/20210716064124.31865-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04021e37120e..05079b4ae7bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7515,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } #ifdef CONFIG_FLATMEM -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -7561,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif } #else -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -- cgit v1.2.3 From 88dc6f208829cfdbc0b96495c5c73a6af0559300 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:58:13 -0700 Subject: mm/page_alloc.c: use in_task() The obsolete in_interrupt() check also covers task context with BH disabled, so it is better to use in_task() instead. Link: https://lkml.kernel.org/r/877caa99-1994-5545-92d2-d0bb2e394182@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05079b4ae7bc..eaa936efad7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; show_mem(filter, nodemask); @@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * comment for __cpuset_node_allowed().
*/ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_HARDER; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. */ - if (!in_interrupt() && !ac->nodemask) + if (in_task() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; -- cgit v1.2.3 From 79c28a41672278283fa72e03d0bf80e6644d4ac4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:06 -0700 Subject: mm/numa: automatically generate node migration order Patch series "Migrate Pages in lieu of discard", v11. We're starting to see systems with more and more kinds of memory such as Intel's implementation of persistent memory. Let's say you have a system with some DRAM and some persistent memory. Today, once DRAM fills up, reclaim will start and some of the DRAM contents will be thrown out. Allocations will, at some point, start falling over to the slower persistent memory. That has two nasty properties. First, the newer allocations can end up in the slower persistent memory. Second, reclaimed data in DRAM are just discarded even if there are gobs of space in persistent memory that could be used. This patchset implements a solution to these problems. At the end of the reclaim process in shrink_page_list() just before the last page refcount is dropped, the page is migrated to persistent memory instead of being dropped. While I've talked about a DRAM/PMEM pairing, this approach would function in any environment where memory tiers exist. This is not perfect. It "strands" pages in slower memory and never brings them back to fast DRAM. Huang Ying has follow-on work which repurposes NUMA balancing to promote hot pages back to DRAM. This is also all based on an upstream mechanism that allows persistent memory to be onlined and used as if it were volatile: http://lkml.kernel.org/r/20190124231441.37A4A305@viggo.jf.intel.com With that, the DRAM and PMEM in each socket will be represented as 2 separate NUMA nodes, with the CPUs sit in the DRAM node. So the general inter-NUMA demotion mechanism introduced in the patchset can migrate the cold DRAM pages to the PMEM node. We have tested the patchset with the postgresql and pgbench. On a 2-socket server machine with DRAM and PMEM, the kernel with the patchset can improve the score of pgbench up to 22.1% compared with that of the DRAM only + disk case. This comes from the reduced disk read throughput (which reduces up to 70.8%). == Open Issues == * Memory policies and cpusets that, for instance, restrict allocations to DRAM can be demoted to PMEM whenever they opt in to this new mechanism. A cgroup-level API to opt-in or opt-out of these migrations will likely be required as a follow-on. * Could be more aggressive about where anon LRU scanning occurs since it no longer necessarily involves I/O. get_scan_count() for instance says: "If we have no swap space, do not bother scanning anon pages" This patch (of 9): Prepare for the kernel to auto-migrate pages to other memory nodes with a node migration table. This allows creating single migration target for each NUMA node to enable the kernel to do NUMA page migrations instead of simply discarding colder pages. 
A node with no target is a "terminal node", so reclaim acts normally there. The migration target does not fundamentally _need_ to be a single node, but this implementation starts there to limit complexity. When memory fills up on a node, memory contents can be automatically migrated to another node. The biggest problems are knowing when to migrate and to where the migration should be targeted. The most straightforward way to generate the "to where" list would be to follow the page allocator fallback lists. Those lists already tell us if memory is full where to look next. It would also be logical to move memory in that order. But, the allocator fallback lists have a fatal flaw: most nodes appear in all the lists. This would potentially lead to migration cycles (A->B, B->A, A->B, ...). Instead of using the allocator fallback lists directly, keep a separate node migration ordering. But, reuse the same data used to generate page allocator fallback in the first place: find_next_best_node(). This means that the firmware data used to populate node distances essentially dictates the ordering for now. It should also be architecture-neutral since all NUMA architectures have a working find_next_best_node(). RCU is used to allow lock-less read of node_demotion[] and prevent demotion cycles been observed. If multiple reads of node_demotion[] are performed, a single rcu_read_lock() must be held over all reads to ensure no cycles are observed. Details are as follows. === What does RCU provide? === Imagine a simple loop which walks down the demotion path looking for the last node: terminal_node = start_node; while (node_demotion[terminal_node] != NUMA_NO_NODE) { terminal_node = node_demotion[terminal_node]; } The initial values are: node_demotion[0] = 1; node_demotion[1] = NUMA_NO_NODE; and are updated to: node_demotion[0] = NUMA_NO_NODE; node_demotion[1] = 0; What guarantees that the cycle is not observed: node_demotion[0] = 1; node_demotion[1] = 0; and would loop forever? With RCU, a rcu_read_lock/unlock() can be placed around the loop. Since the write side does a synchronize_rcu(), the loop that observed the old contents is known to be complete before the synchronize_rcu() has completed. RCU, combined with disable_all_migrate_targets(), ensures that the old migration state is not visible by the time __set_migration_target_nodes() is called. === What does READ_ONCE() provide? === READ_ONCE() forbids the compiler from merging or reordering successive reads of node_demotion[]. This ensures that any updates are *eventually* observed. Consider the above loop again. The compiler could theoretically read the entirety of node_demotion[] into local storage (registers) and never go back to memory, and *permanently* observe bad values for node_demotion[]. Note: RCU does not provide any universal compiler-ordering guarantees: https://lore.kernel.org/lkml/20150921204327.GH4029@linux.vnet.ibm.com/ This code is unused for now. It will be called later in the series. 
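As a hedged sketch (not part of this patch) of the usage rule stated above: a caller that needs more than one hop of the demotion path, for example the terminal node, must do all of its node_demotion[] reads inside a single RCU read-side critical section so a concurrent update can never be observed as a cycle.

static int demotion_terminal_node(int node)
{
        int target;

        rcu_read_lock();
        for (;;) {
                target = READ_ONCE(node_demotion[node]);
                if (target == NUMA_NO_NODE)
                        break;
                node = target;
        }
        rcu_read_unlock();

        return node;
}

The function name is illustrative; the loop mirrors the example walk in the changelog, with rcu_read_lock()/rcu_read_unlock() spanning every READ_ONCE() of the array.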
Link: https://lkml.kernel.org/r/20210721063926.3024591-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-2-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++ mm/migrate.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 222 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 57e28261a3b1..cf3cb933eba3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -543,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} #endif extern int hwpoison_filter(struct page *p); diff --git a/mm/migrate.c b/mm/migrate.c index 7e240437e7d9..57aeb9b491da 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1099,6 +1099,80 @@ out: return rc; } + +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { 1, // Node 0 migrates to 1 + * 2, // Node 1 migrates to 2 + * -1, // Node 2 does not migrate + * 4, // Node 3 migrates to 4 + * 5, // Node 4 migrates to 5 + * -1} // Node 5 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +static int node_demotion[MAX_NUMNODES] __read_mostly = + {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * @returns: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + int target; + + /* + * node_demotion[] is updated without excluding this + * function from running. 
RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target = READ_ONCE(node_demotion[node]); + rcu_read_unlock(); + + return target; +} + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -2982,3 +3056,145 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ + +/* Disable reclaim-based migration. */ +static void __disable_all_migrate_targets(void) +{ + int node; + + for_each_online_node(node) + node_demotion[node] = NUMA_NO_NODE; +} + +static void disable_all_migrate_targets(void) +{ + __disable_all_migrate_targets(); + + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + * + * The before+after state together might have cycles and + * could cause readers to do things like loop until this + * function finishes. This ensures they can only see a + * single "bad" read and would, for instance, only loop + * once. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for 'node'. + * Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static int establish_migrate_target(int node, nodemask_t *used) +{ + int migration_target; + + /* + * Can not set a migration target on a + * node with it already set. + * + * No need for READ_ONCE() here since this + * in the write path for node_demotion[]. + * This should be the only thread writing. + */ + if (node_demotion[node] != NUMA_NO_NODE) + return NUMA_NO_NODE; + + migration_target = find_next_best_node(node, used); + if (migration_target == NUMA_NO_NODE) + return NUMA_NO_NODE; + + node_demotion[node] = migration_target; + + return migration_target; +} + +/* + * When memory fills up on a node, memory contents can be + * automatically migrated to another node instead of + * discarded at reclaim. + * + * Establish a "migration path" which will start at nodes + * with CPUs and will follow the priorities used to build the + * page allocator zonelists. + * + * The difference here is that cycles must be avoided. If + * node0 migrates to node1, then neither node1, nor anything + * node1 migrates to can migrate to node0. + * + * This function can run simultaneously with readers of + * node_demotion[]. However, it can not run simultaneously + * with itself. Exclusion is provided by memory hotplug events + * being single-threaded. + */ +static void __set_migration_target_nodes(void) +{ + nodemask_t next_pass = NODE_MASK_NONE; + nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t used_targets = NODE_MASK_NONE; + int node; + + /* + * Avoid any oddities like cycles that could occur + * from changes in the topology. This will leave + * a momentary gap when migration is disabled. + */ + disable_all_migrate_targets(); + + /* + * Allocations go close to CPUs, first. Assume that + * the migration path starts at the nodes with CPUs. + */ + next_pass = node_states[N_CPU]; +again: + this_pass = next_pass; + next_pass = NODE_MASK_NONE; + /* + * To avoid cycles in the migration "graph", ensure + * that migration sources are not future targets by + * setting them in 'used_targets'. 
Do this only + * once per pass so that multiple source nodes can + * share a target node. + * + * 'used_targets' will become unavailable in future + * passes. This limits some opportunities for + * multiple source nodes to share a destination. + */ + nodes_or(used_targets, used_targets, this_pass); + for_each_node_mask(node, this_pass) { + int target_node = establish_migrate_target(node, &used_targets); + + if (target_node == NUMA_NO_NODE) + continue; + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } + /* + * 'next_pass' contains nodes which became migration + * targets in this pass. Make additional passes until + * no more migrations targets are available. + */ + if (!nodes_empty(next_pass)) + goto again; +} + +/* + * For callers that do not hold get_online_mems() already. + */ +__maybe_unused // <- temporay to prevent warnings during bisects +static void set_migration_target_nodes(void) +{ + get_online_mems(); + __set_migration_target_nodes(); + put_online_mems(); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa936efad7e..cafdca874e0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6157,7 +6157,7 @@ static int node_load[MAX_NUMNODES]; * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; -- cgit v1.2.3 From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. 
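A hedged sketch of how a caller consumes the new out-parameter; demote_pages, alloc_demote_page, mtc and MR_DEMOTION stand in for what later patches in this series add and are not defined here:

        unsigned int nr_succeeded = 0;
        int err;

        err = migrate_pages(&demote_pages, alloc_demote_page, NULL,
                            (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
                            &nr_succeeded);

        /*
         * Even when err is non-zero, nr_succeeded reports how many pages
         * actually moved, so reclaim can credit partial demotion instead
         * of treating the whole batch as failed.
         */

Callers that do not care simply pass NULL, as the tree-wide updates in this patch do.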
Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 11 ++++++++--- mm/page_alloc.c | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..61fb64f47a06 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); diff --git a/mm/gup.c b/mm/gup.c index 1c7f4ec6990b..9935a4480710 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f925615e573..517789b03961 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..4c527a80b6c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, 
unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e90274..939eabcaf488 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 0c12af203b68..ae923e9b8874 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. @@ -1439,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1594,6 +1596,9 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1663,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -2178,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cafdca874e0d..f95e1d2386a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8990,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. 
It is pointless -- cgit v1.2.3 From 859a85ddf90e714092dea71a0e54c7b9896621be Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 7 Sep 2021 19:54:52 -0700 Subject: mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock. This makes pfn_valid_within() check and CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone() that had pfn_valid_within() surrounded by more logic than simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock and so nothing can enable CONFIG_HOLES_IN_ZONE which guards non trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 -- include/linux/mmzone.h | 12 ------------ mm/Kconfig | 3 --- mm/compaction.c | 20 +++++++------------- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 24 ++---------------------- mm/page_isolation.c | 7 +------ mm/page_owner.c | 14 +------------- 8 files changed, 11 insertions(+), 75 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 4a4ae868ad9f..8ec6b7dfbb0f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -768,8 +768,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __ref get_nid_for_pfn(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..ee3a86830519 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. - */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 40a9bfcd5062..14d5d2837737 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,9 +96,6 @@ config HAVE_FAST_GUP depends on MMU bool -config HOLES_IN_ZONE - bool - # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. 
diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..ed37e1cb4369 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, * is necessary for the block to be a migration source/target. */ do { - if (pfn_valid_within(pfn)) { - if (check_source && PageLRU(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } - if (check_target && PageBuddy(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; } page += (1 << PAGE_ALLOC_COSTLY_ORDER); @@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; nr_scanned++; - if (!pfn_valid_within(blockpfn)) - goto isolate_fail; /* * For compound pages such as THP and hugetlbfs, we can save @@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, cond_resched(); } - if (!pfn_valid_within(low_pfn)) - goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..8d3376f66f01 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1308,10 +1308,6 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && - !pfn_valid_within(pfn + i)) - i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; /* Check if we got outside of the zone */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..79a2fc5b6c6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { - if (!pfn_valid_within(page_to_pfn(page))) - return 0; if (zone != page_zone(page)) return 0; @@ -1025,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, if (order >= MAX_ORDER - 2) return false; - if (!pfn_valid_within(buddy_pfn)) - return false; - combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - return pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1); + return page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -1095,8 +1089,6 @@ continue_merging: buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (!pfn_valid_within(buddy_pfn)) - goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -1754,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. + * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. 
* @@ -1872,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; return true; @@ -2520,11 +2508,6 @@ static int move_freepages(struct zone *zone, int pages_moved = 0; for (pfn = start_pfn; pfn <= end_pfn;) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } - page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* @@ -8814,9 +8797,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, } for (; iter < pageblock_nr_pages - offset; iter++) { - if (!pfn_valid_within(pfn + iter)) - continue; - page = pfn_to_page(pfn + iter); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bf..471e3a13b541 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(buddy_pfn) && - !is_migrate_isolate_page(buddy)) { + if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; } @@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, struct page *page; while (pfn < end_pfn) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); if (PageBuddy(page)) /* diff --git a/mm/page_owner.c b/mm/page_owner.c index f51a57e92aa3..62402d22539b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); @@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - /* Check for holes within a MAX_ORDER area */ - if (!pfn_valid_within(pfn)) - continue; - page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); @@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) continue; -- cgit v1.2.3 From 4b0970024408afb17886e0c76e9761c4264db2a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:19 -0700 Subject: mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 
2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is reqired for fast memory hotplug for drivers that add individual memory blocks, like virito-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the coniguration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy. II. Approach This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion). 
For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM. III. Target Usage The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enabled auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online") IV. Example For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-79: Movable (DIMM 0) Memory block 80-111: Movable (DIMM 1) Memory block 112-143: Movable (DIMM 2) Memory block 144-275: Normal (DIMM 3) Memory block 176-207: Normal (DIMM 4) ... all Normal (-> hotplugged Normal memory does not allow for more Movable memory) For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-143: Movable (virtio-mem, first 12 GiB) Memory block 144: Normal (virtio-mem, next 128 MiB) Memory block 145-147: Movable (virtio-mem, next 384 MiB) Memory block 148: Normal (virtio-mem, next 128 MiB) Memory block 149-151: Movable (virtio-mem, next 384 MiB) ... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device) Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps. V. Doc Update I'll update the memory-hotplug.rst documentation, once the overhaul [1] is usptream. Until then, details can be found in patch #2. VI. Future Work 1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really rrequired. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar as adjust_managed_page_count() and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. 
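A hedged sketch (not in this patch; the helper name and exact formula are illustrative) of how a ratio-based onlining policy can consume the new counter: hotplugged MOVABLE memory is compared against the KERNEL memory that was present at boot, which is exactly what present_pages alone could not express.

#ifdef CONFIG_MEMORY_HOTPLUG
static bool ratio_allows_movable(struct zone *kernel_zone,
                                 struct zone *movable_zone,
                                 unsigned long nr_new_pages,
                                 unsigned long ratio_percent)
{
        unsigned long kernel_early = kernel_zone->present_early_pages;
        unsigned long movable = movable_zone->present_pages + nr_new_pages;

        /* e.g. ratio_percent == 301: ~3 MOVABLE pages per early KERNEL page */
        return movable * 100 <= kernel_early * ratio_percent;
}
#endif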
Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 14 +++++++------- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ mm/memory_hotplug.c | 14 +++++++++++--- mm/page_alloc.c | 3 +++ 5 files changed, 29 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7..86ec2dc82fc2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -205,7 +205,8 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } @@ -215,24 +216,23 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; - struct zone *zone; int ret; /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ - if (nr_vmemmap_pages) { - zone = page_zone(pfn_to_page(start_pfn)); - adjust_present_page_count(zone, -nr_vmemmap_pages); - } + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), + -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf4690..39b04e99a30e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ee3a86830519..1c0e3bf42521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. 
+ * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6ea62efe2a8f..8a99fa6d096c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -724,8 +724,16 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct zone *zone, long nr_pages) +void adjust_present_page_count(struct page *page, long nr_pages) { + struct zone *zone = page_zone(page); + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; } @@ -826,7 +834,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(zone, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1697,7 +1705,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(zone, -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79a2fc5b6c6f..9353418892a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7240,6 +7240,9 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; +#endif totalpages += size; realtotalpages += real_size; -- cgit v1.2.3 From 053cfda102306a3394012f9fe2594811c34925e4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 8 Sep 2021 18:10:11 -0700 Subject: mm/page_alloc.c: avoid accessing uninitialized pcp page migratetype If it's not prepared to free unref page, the pcp page migratetype is unset. Thus we will get rubbish from get_pcppage_migratetype() and might list_del(&page->lru) again after it's already deleted from the list leading to grumble about data corruption. 
Link: https://lkml.kernel.org/r/20210902115447.57050-1-linmiaohe@huawei.com Fixes: df1acc856923 ("mm/page_alloc: avoid conflating IRQs disabled with zone->lock") Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de309a1dfe65..b37435c274cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,8 +3428,10 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); + continue; + } /* * Free isolated pages directly to the allocator, see -- cgit v1.2.3
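To make the failure mode concrete, an annotated sketch of the pre-fix control flow in free_unref_page_list(), paraphrased from the surrounding code rather than quoted exactly:

/*
 *      if (!free_unref_page_prepare(page, pfn, 0))
 *              list_del(&page->lru);                   <- page is already off the list
 *      migratetype = get_pcppage_migratetype(page);    <- never set: rubbish value
 *      if (unlikely(is_migrate_isolate(migratetype)))
 *              list_del(&page->lru);                   <- second del on a deleted entry
 *
 * With CONFIG_DEBUG_LIST the second list_del() hits the poisoned list
 * pointers and reports "list_del corruption"; the added continue skips
 * straight to the next page instead.
 */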