From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..51c17bf7b127 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask) return true; } -void __fs_reclaim_acquire(void) +void __fs_reclaim_acquire(unsigned long ip) { - lock_map_acquire(&__fs_reclaim_map); + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); } -void __fs_reclaim_release(void) +void __fs_reclaim_release(unsigned long ip) { - lock_map_release(&__fs_reclaim_map); + lock_release(&__fs_reclaim_map, ip); } void fs_reclaim_acquire(gfp_t gfp_mask) @@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_RET_IP_); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); @@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_release(); + __fs_reclaim_release(_RET_IP_); } } EXPORT_SYMBOL_GPL(fs_reclaim_release); -- cgit v1.2.3 From eb2169cee36fc492407a2d483c286b977a46288a Mon Sep 17 00:00:00 2001 From: liuhailong Date: Thu, 2 Sep 2021 14:53:01 -0700 Subject: mm: add kernel_misc_reclaimable in show_free_areas Print NR_KERNEL_MISC_RECLAIMABLE stat from show_free_areas() so users can check whether the shrinker is working correctly and to show the current memory usage. 
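As a hedged illustration (not part of this patch): a driver that keeps a shrinker-reclaimable page cache typically feeds this counter through mod_node_page_state(), so the value printed above tracks the cache size. The helper name below is made up; NR_KERNEL_MISC_RECLAIMABLE and mod_node_page_state() are the existing interfaces.

#include <linux/mm.h>
#include <linux/vmstat.h>

/*
 * Account pages added to (nr_pages > 0) or released from (nr_pages < 0)
 * a driver-owned reclaimable cache, so they show up in show_free_areas().
 */
static void my_cache_account(struct page *page, long nr_pages)
{
        mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
                            nr_pages);
}

The driver's shrinker would call the same helper with a negative delta as it frees pages, which is what lets users correlate the printed value with shrinker activity.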
Link: https://lkml.kernel.org/r/20210813104725.4562-1-liuhailong@oppo.com Signed-off-by: liuhailong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51c17bf7b127..60a867a549bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), @@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); -- cgit v1.2.3 From c3ab6baf6a004eab7344a1d8880a971f2414e1b6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:57:56 -0700 Subject: mm/page_alloc: always initialize memory map for the holes Patch series "mm: ensure consistency of memory map poisoning". Currently memory map allocation for FLATMEM case does not poison the struct pages regardless of CONFIG_PAGE_POISON setting. This happens because allocation of the memory map for FLATMEM and SPARSMEM use different memblock functions and those that are used for SPARSMEM case (namely memblock_alloc_try_nid_raw() and memblock_alloc_exact_nid_raw()) implicitly poison the allocated memory. Another side effect of this implicit poisoning is that early setup code that uses the same functions to allocate memory burns cycles for the memory poisoning even if it was not intended. These patches introduce memmap_alloc() wrapper that ensure that the memory map allocation is consistent for different memory models. This patch (of 4): Currently memory map for the holes is initialized only when SPARSEMEM memory model is used. Yet, even with FLATMEM there could be holes in the physical memory layout that have memory map entries. For instance, the memory reserved using e820 API on i386 or "reserved-memory" nodes in device tree would not appear in memblock.memory and hence the struct pages for such holes will be skipped during memory map initialization. These struct pages will be zeroed because the memory map for FLATMEM systems is allocated with memblock_alloc_node() that clears the allocated memory. While zeroed struct pages do not cause immediate problems, the correct behaviour is to initialize every page using __init_single_page(). Besides, enabling page poison for FLATMEM case will trigger PF_POISONED_CHECK() unless the memory map is properly initialized. Make sure init_unavailable_range() is called for both SPARSEMEM and FLATMEM so that struct pages representing memory holes would appear as PG_Reserved with any memory layout. 
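As a hedged sketch of what this buys callers (the walker and its start_pfn/end_pfn arguments are illustrative, not from the patch): a pfn walker on a FLATMEM kernel can now treat struct pages inside physical holes the same way it already could on SPARSEMEM -- they are fully initialized and marked PG_reserved instead of being all zeroes.

#include <linux/mm.h>
#include <linux/page-flags.h>

static void walk_pfn_range(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                struct page *page = pfn_to_page(pfn);

                /* Holes and firmware-reserved ranges are PG_reserved now. */
                if (PageReserved(page))
                        continue;

                /* Safe to use page_zone()/page_to_nid() from here on. */
        }
}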
[rppt@kernel.org: fix microblaze] Link: https://lkml.kernel.org/r/YQWW3RCE4eWBuMu/@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Tested-by: Guenter Roeck Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/page.h | 3 +-- mm/page_alloc.c | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index ce550978f4fc..4b8b2fa78fc5 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) - +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60a867a549bd..bd2efd6287ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6642,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6687,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn, -- cgit v1.2.3 From c803b3c8b3b70f306ee6300bf8acdd70ffd1441a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:02 -0700 Subject: mm: introduce memmap_alloc() to unify memory map allocation There are several places that allocate memory for the memory map: alloc_node_mem_map() for FLATMEM, sparse_buffer_init() and __populate_section_memmap() for SPARSEMEM. The memory allocated in the FLATMEM case is zeroed and it is never poisoned, regardless of CONFIG_PAGE_POISON setting. The memory allocated in the SPARSEMEM cases is not zeroed and it is implicitly poisoned inside memblock if CONFIG_PAGE_POISON is set. Introduce memmap_alloc() wrapper for memblock allocators that will be used for both FLATMEM and SPARSEMEM cases and will makei memory map zeroing and poisoning consistent for different memory models. 
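A hedged usage sketch (nid and nr_pages are placeholders; the wrapper itself is added in the page_alloc.c hunk that follows): any new memory-map allocation is expected to go through memmap_alloc() so both memory models behave identically.

static void __init alloc_example_memmap(int nid, unsigned long nr_pages)
{
        struct page *map;

        map = memmap_alloc(array_size(sizeof(struct page), nr_pages),
                           SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, nid, false);
        if (!map)
                panic("Failed to allocate memory map for node %d\n", nid);

        /*
         * The map is not zeroed; with page poisoning enabled it is filled
         * with PAGE_POISON_PATTERN, so each struct page still has to be
         * set up via __init_single_page() before first use.
         */
}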
Link: https://lkml.kernel.org/r/20210714123739.16493-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++++ mm/page_alloc.c | 24 ++++++++++++++++++++++-- mm/sparse.c | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 31ff935b2547..57e28261a3b1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd2efd6287ce..71ad97c96075 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6748,6 +6748,26 @@ static void __init memmap_init(void) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7519,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); diff --git a/mm/sparse.c b/mm/sparse.c index fd29b3abffa0..120bc8ea5293 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -436,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -464,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; } -- cgit v1.2.3 From b346075fcf5dda7f9e9ae671703aae60e8a94564 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 2 Sep 2021 14:58:08 -0700 Subject: mm/page_alloc.c: fix 'zone_id' may be used uninitialized in this function warning When compiling with -Werror, cc1 will warn that 'zone_id' may be used uninitialized in this function warning. Initialize the zone_id as 0. Its safe to assume that if the code reaches this point it has at least one numa node with memory, so no need for an assertion before init_unavilable_range. 
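For reference, a reduced illustration of the pattern -Wmaybe-uninitialized trips over here; nr_ranges and pick_zone_id() are placeholders rather than the real memmap_init() internals:

        int zone_id;                            /* no initializer */
        int i;

        for (i = 0; i < nr_ranges; i++)         /* compiler cannot prove this loop runs */
                zone_id = pick_zone_id(i);

        init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);        /* warning fires here */

Initializing zone_id to 0 silences the warning, and as argued above any system reaching this code has at least one node with memory, so the loop always executes and the 0 is never actually consumed.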
Link: https://lkml.kernel.org/r/20210716210336.1114114-1-npache@redhat.com Fixes: 122e093c1734 ("mm/page_alloc: fix memory map initialization for descending nodes") Signed-off-by: Nico Pache Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71ad97c96075..04021e37120e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6715,7 +6715,7 @@ static void __init memmap_init(void) { unsigned long start_pfn, end_pfn; unsigned long hole_pfn = 0; - int i, j, zone_id, nid; + int i, j, zone_id = 0, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { struct pglist_data *node = NODE_DATA(nid); -- cgit v1.2.3 From 3b446da6be7a722d769e23f68dbaf4ebb2eda542 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:10 -0700 Subject: mm/page_alloc: make alloc_node_mem_map() __init rather than __ref alloc_node_mem_map() is only called from free_area_init_node(), which is an __init function. Make the actual alloc_node_mem_map() also __init and its stub version static inline. Link: https://lkml.kernel.org/r/20210716064124.31865-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04021e37120e..05079b4ae7bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7515,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } #ifdef CONFIG_FLATMEM -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -7561,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif } #else -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -- cgit v1.2.3 From 88dc6f208829cfdbc0b96495c5c73a6af0559300 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:58:13 -0700 Subject: mm/page_alloc.c: use in_task() The obsolete in_interrupt() check also covers task context with BH disabled, so it is better to use in_task() instead. Link: https://lkml.kernel.org/r/877caa99-1994-5545-92d2-d0bb2e394182@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05079b4ae7bc..eaa936efad7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; show_mem(filter, nodemask); @@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * comment for __cpuset_node_allowed().
*/ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_HARDER; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. */ - if (!in_interrupt() && !ac->nodemask) + if (in_task() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; -- cgit v1.2.3 From 79c28a41672278283fa72e03d0bf80e6644d4ac4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:06 -0700 Subject: mm/numa: automatically generate node migration order Patch series "Migrate Pages in lieu of discard", v11. We're starting to see systems with more and more kinds of memory such as Intel's implementation of persistent memory. Let's say you have a system with some DRAM and some persistent memory. Today, once DRAM fills up, reclaim will start and some of the DRAM contents will be thrown out. Allocations will, at some point, start falling over to the slower persistent memory. That has two nasty properties. First, the newer allocations can end up in the slower persistent memory. Second, reclaimed data in DRAM are just discarded even if there are gobs of space in persistent memory that could be used. This patchset implements a solution to these problems. At the end of the reclaim process in shrink_page_list() just before the last page refcount is dropped, the page is migrated to persistent memory instead of being dropped. While I've talked about a DRAM/PMEM pairing, this approach would function in any environment where memory tiers exist. This is not perfect. It "strands" pages in slower memory and never brings them back to fast DRAM. Huang Ying has follow-on work which repurposes NUMA balancing to promote hot pages back to DRAM. This is also all based on an upstream mechanism that allows persistent memory to be onlined and used as if it were volatile: http://lkml.kernel.org/r/20190124231441.37A4A305@viggo.jf.intel.com With that, the DRAM and PMEM in each socket will be represented as 2 separate NUMA nodes, with the CPUs sit in the DRAM node. So the general inter-NUMA demotion mechanism introduced in the patchset can migrate the cold DRAM pages to the PMEM node. We have tested the patchset with the postgresql and pgbench. On a 2-socket server machine with DRAM and PMEM, the kernel with the patchset can improve the score of pgbench up to 22.1% compared with that of the DRAM only + disk case. This comes from the reduced disk read throughput (which reduces up to 70.8%). == Open Issues == * Memory policies and cpusets that, for instance, restrict allocations to DRAM can be demoted to PMEM whenever they opt in to this new mechanism. A cgroup-level API to opt-in or opt-out of these migrations will likely be required as a follow-on. * Could be more aggressive about where anon LRU scanning occurs since it no longer necessarily involves I/O. get_scan_count() for instance says: "If we have no swap space, do not bother scanning anon pages" This patch (of 9): Prepare for the kernel to auto-migrate pages to other memory nodes with a node migration table. This allows creating single migration target for each NUMA node to enable the kernel to do NUMA page migrations instead of simply discarding colder pages. 
A node with no target is a "terminal node", so reclaim acts normally there. The migration target does not fundamentally _need_ to be a single node, but this implementation starts there to limit complexity. When memory fills up on a node, memory contents can be automatically migrated to another node. The biggest problems are knowing when to migrate and to where the migration should be targeted. The most straightforward way to generate the "to where" list would be to follow the page allocator fallback lists. Those lists already tell us if memory is full where to look next. It would also be logical to move memory in that order. But, the allocator fallback lists have a fatal flaw: most nodes appear in all the lists. This would potentially lead to migration cycles (A->B, B->A, A->B, ...). Instead of using the allocator fallback lists directly, keep a separate node migration ordering. But, reuse the same data used to generate page allocator fallback in the first place: find_next_best_node(). This means that the firmware data used to populate node distances essentially dictates the ordering for now. It should also be architecture-neutral since all NUMA architectures have a working find_next_best_node(). RCU is used to allow lock-less read of node_demotion[] and prevent demotion cycles been observed. If multiple reads of node_demotion[] are performed, a single rcu_read_lock() must be held over all reads to ensure no cycles are observed. Details are as follows. === What does RCU provide? === Imagine a simple loop which walks down the demotion path looking for the last node: terminal_node = start_node; while (node_demotion[terminal_node] != NUMA_NO_NODE) { terminal_node = node_demotion[terminal_node]; } The initial values are: node_demotion[0] = 1; node_demotion[1] = NUMA_NO_NODE; and are updated to: node_demotion[0] = NUMA_NO_NODE; node_demotion[1] = 0; What guarantees that the cycle is not observed: node_demotion[0] = 1; node_demotion[1] = 0; and would loop forever? With RCU, a rcu_read_lock/unlock() can be placed around the loop. Since the write side does a synchronize_rcu(), the loop that observed the old contents is known to be complete before the synchronize_rcu() has completed. RCU, combined with disable_all_migrate_targets(), ensures that the old migration state is not visible by the time __set_migration_target_nodes() is called. === What does READ_ONCE() provide? === READ_ONCE() forbids the compiler from merging or reordering successive reads of node_demotion[]. This ensures that any updates are *eventually* observed. Consider the above loop again. The compiler could theoretically read the entirety of node_demotion[] into local storage (registers) and never go back to memory, and *permanently* observe bad values for node_demotion[]. Note: RCU does not provide any universal compiler-ordering guarantees: https://lore.kernel.org/lkml/20150921204327.GH4029@linux.vnet.ibm.com/ This code is unused for now. It will be called later in the series. 
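As a hedged sketch (not part of this patch) of the usage rule stated above: a caller that needs more than one hop of the demotion path, for example the terminal node, must do all of its node_demotion[] reads inside a single RCU read-side critical section so a concurrent update can never be observed as a cycle.

static int demotion_terminal_node(int node)
{
        int target;

        rcu_read_lock();
        for (;;) {
                target = READ_ONCE(node_demotion[node]);
                if (target == NUMA_NO_NODE)
                        break;
                node = target;
        }
        rcu_read_unlock();

        return node;
}

The function name is illustrative; the loop mirrors the example walk in the changelog, with rcu_read_lock()/rcu_read_unlock() spanning every READ_ONCE() of the array.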
Link: https://lkml.kernel.org/r/20210721063926.3024591-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-2-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++ mm/migrate.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 222 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 57e28261a3b1..cf3cb933eba3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -543,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} #endif extern int hwpoison_filter(struct page *p); diff --git a/mm/migrate.c b/mm/migrate.c index 7e240437e7d9..57aeb9b491da 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1099,6 +1099,80 @@ out: return rc; } + +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { 1, // Node 0 migrates to 1 + * 2, // Node 1 migrates to 2 + * -1, // Node 2 does not migrate + * 4, // Node 3 migrates to 4 + * 5, // Node 4 migrates to 5 + * -1} // Node 5 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +static int node_demotion[MAX_NUMNODES] __read_mostly = + {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * @returns: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + int target; + + /* + * node_demotion[] is updated without excluding this + * function from running. 
RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target = READ_ONCE(node_demotion[node]); + rcu_read_unlock(); + + return target; +} + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -2982,3 +3056,145 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ + +/* Disable reclaim-based migration. */ +static void __disable_all_migrate_targets(void) +{ + int node; + + for_each_online_node(node) + node_demotion[node] = NUMA_NO_NODE; +} + +static void disable_all_migrate_targets(void) +{ + __disable_all_migrate_targets(); + + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + * + * The before+after state together might have cycles and + * could cause readers to do things like loop until this + * function finishes. This ensures they can only see a + * single "bad" read and would, for instance, only loop + * once. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for 'node'. + * Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static int establish_migrate_target(int node, nodemask_t *used) +{ + int migration_target; + + /* + * Can not set a migration target on a + * node with it already set. + * + * No need for READ_ONCE() here since this + * in the write path for node_demotion[]. + * This should be the only thread writing. + */ + if (node_demotion[node] != NUMA_NO_NODE) + return NUMA_NO_NODE; + + migration_target = find_next_best_node(node, used); + if (migration_target == NUMA_NO_NODE) + return NUMA_NO_NODE; + + node_demotion[node] = migration_target; + + return migration_target; +} + +/* + * When memory fills up on a node, memory contents can be + * automatically migrated to another node instead of + * discarded at reclaim. + * + * Establish a "migration path" which will start at nodes + * with CPUs and will follow the priorities used to build the + * page allocator zonelists. + * + * The difference here is that cycles must be avoided. If + * node0 migrates to node1, then neither node1, nor anything + * node1 migrates to can migrate to node0. + * + * This function can run simultaneously with readers of + * node_demotion[]. However, it can not run simultaneously + * with itself. Exclusion is provided by memory hotplug events + * being single-threaded. + */ +static void __set_migration_target_nodes(void) +{ + nodemask_t next_pass = NODE_MASK_NONE; + nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t used_targets = NODE_MASK_NONE; + int node; + + /* + * Avoid any oddities like cycles that could occur + * from changes in the topology. This will leave + * a momentary gap when migration is disabled. + */ + disable_all_migrate_targets(); + + /* + * Allocations go close to CPUs, first. Assume that + * the migration path starts at the nodes with CPUs. + */ + next_pass = node_states[N_CPU]; +again: + this_pass = next_pass; + next_pass = NODE_MASK_NONE; + /* + * To avoid cycles in the migration "graph", ensure + * that migration sources are not future targets by + * setting them in 'used_targets'. 
Do this only + * once per pass so that multiple source nodes can + * share a target node. + * + * 'used_targets' will become unavailable in future + * passes. This limits some opportunities for + * multiple source nodes to share a destination. + */ + nodes_or(used_targets, used_targets, this_pass); + for_each_node_mask(node, this_pass) { + int target_node = establish_migrate_target(node, &used_targets); + + if (target_node == NUMA_NO_NODE) + continue; + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } + /* + * 'next_pass' contains nodes which became migration + * targets in this pass. Make additional passes until + * no more migrations targets are available. + */ + if (!nodes_empty(next_pass)) + goto again; +} + +/* + * For callers that do not hold get_online_mems() already. + */ +__maybe_unused // <- temporay to prevent warnings during bisects +static void set_migration_target_nodes(void) +{ + get_online_mems(); + __set_migration_target_nodes(); + put_online_mems(); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa936efad7e..cafdca874e0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6157,7 +6157,7 @@ static int node_load[MAX_NUMNODES]; * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; -- cgit v1.2.3 From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. 
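A hedged sketch of how a caller consumes the new out-parameter; demote_pages, alloc_demote_page, mtc and MR_DEMOTION stand in for what later patches in this series add and are not defined here:

        unsigned int nr_succeeded = 0;
        int err;

        err = migrate_pages(&demote_pages, alloc_demote_page, NULL,
                            (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
                            &nr_succeeded);

        /*
         * Even when err is non-zero, nr_succeeded reports how many pages
         * actually moved, so reclaim can credit partial demotion instead
         * of treating the whole batch as failed.
         */

Callers that do not care simply pass NULL, as the tree-wide updates in this patch do.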
Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 11 ++++++++--- mm/page_alloc.c | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..61fb64f47a06 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); diff --git a/mm/gup.c b/mm/gup.c index 1c7f4ec6990b..9935a4480710 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f925615e573..517789b03961 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..4c527a80b6c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, 
unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e90274..939eabcaf488 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 0c12af203b68..ae923e9b8874 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. @@ -1439,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1594,6 +1596,9 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1663,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -2178,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cafdca874e0d..f95e1d2386a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8990,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. 
It is pointless -- cgit v1.2.3 From 859a85ddf90e714092dea71a0e54c7b9896621be Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 7 Sep 2021 19:54:52 -0700 Subject: mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock. This makes pfn_valid_within() check and CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone() that had pfn_valid_within() surrounded by more logic than simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock and so nothing can enable CONFIG_HOLES_IN_ZONE which guards non trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 -- include/linux/mmzone.h | 12 ------------ mm/Kconfig | 3 --- mm/compaction.c | 20 +++++++------------- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 24 ++---------------------- mm/page_isolation.c | 7 +------ mm/page_owner.c | 14 +------------- 8 files changed, 11 insertions(+), 75 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 4a4ae868ad9f..8ec6b7dfbb0f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -768,8 +768,6 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __ref get_nid_for_pfn(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..ee3a86830519 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. - */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 40a9bfcd5062..14d5d2837737 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -96,9 +96,6 @@ config HAVE_FAST_GUP depends on MMU bool -config HOLES_IN_ZONE - bool - # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. 
diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..ed37e1cb4369 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -306,16 +306,14 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, * is necessary for the block to be a migration source/target. */ do { - if (pfn_valid_within(pfn)) { - if (check_source && PageLRU(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } - if (check_target && PageBuddy(page)) { - clear_pageblock_skip(page); - return true; - } + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; } page += (1 << PAGE_ALLOC_COSTLY_ORDER); @@ -585,8 +583,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; nr_scanned++; - if (!pfn_valid_within(blockpfn)) - goto isolate_fail; /* * For compound pages such as THP and hugetlbfs, we can save @@ -885,8 +881,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, cond_resched(); } - if (!pfn_valid_within(low_pfn)) - goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..8d3376f66f01 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1308,10 +1308,6 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && - !pfn_valid_within(pfn + i)) - i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; /* Check if we got outside of the zone */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..79a2fc5b6c6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -594,8 +594,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { - if (!pfn_valid_within(page_to_pfn(page))) - return 0; if (zone != page_zone(page)) return 0; @@ -1025,16 +1023,12 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, if (order >= MAX_ORDER - 2) return false; - if (!pfn_valid_within(buddy_pfn)) - return false; - combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - return pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1); + return page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -1095,8 +1089,6 @@ continue_merging: buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (!pfn_valid_within(buddy_pfn)) - goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -1754,9 +1746,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. + * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. 
* @@ -1872,8 +1862,6 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!pfn_valid_within(pfn)) - return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; return true; @@ -2520,11 +2508,6 @@ static int move_freepages(struct zone *zone, int pages_moved = 0; for (pfn = start_pfn; pfn <= end_pfn;) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } - page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* @@ -8814,9 +8797,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, } for (; iter < pageblock_nr_pages - offset; iter++) { - if (!pfn_valid_within(pfn + iter)) - continue; - page = pfn_to_page(pfn + iter); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bf..471e3a13b541 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -93,8 +93,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(buddy_pfn) && - !is_migrate_isolate_page(buddy)) { + if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; } @@ -250,10 +249,6 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, struct page *page; while (pfn < end_pfn) { - if (!pfn_valid_within(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); if (PageBuddy(page)) /* diff --git a/mm/page_owner.c b/mm/page_owner.c index f51a57e92aa3..62402d22539b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -276,9 +276,6 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); @@ -479,10 +476,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - /* Check for holes within a MAX_ORDER area */ - if (!pfn_valid_within(pfn)) - continue; - page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); @@ -560,14 +553,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { - struct page *page; + struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) continue; -- cgit v1.2.3 From 4b0970024408afb17886e0c76e9761c4264db2a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:19 -0700 Subject: mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 
2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is reqired for fast memory hotplug for drivers that add individual memory blocks, like virito-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the coniguration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy. II. Approach This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion). 
For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM. III. Target Usage The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enabled auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online") IV. Example For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-79: Movable (DIMM 0) Memory block 80-111: Movable (DIMM 1) Memory block 112-143: Movable (DIMM 2) Memory block 144-275: Normal (DIMM 3) Memory block 176-207: Normal (DIMM 4) ... all Normal (-> hotplugged Normal memory does not allow for more Movable memory) For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout: Memory block 0-15: DMA32 (early) Memory block 32-47: Normal (early) Memory block 48-143: Movable (virtio-mem, first 12 GiB) Memory block 144: Normal (virtio-mem, next 128 MiB) Memory block 145-147: Movable (virtio-mem, next 384 MiB) Memory block 148: Normal (virtio-mem, next 128 MiB) Memory block 149-151: Movable (virtio-mem, next 384 MiB) ... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device) Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps. V. Doc Update I'll update the memory-hotplug.rst documentation, once the overhaul [1] is usptream. Until then, details can be found in patch #2. VI. Future Work 1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really rrequired. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar as adjust_managed_page_count() and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. 
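A hedged sketch (not in this patch; the helper name and exact formula are illustrative) of how a ratio-based onlining policy can consume the new counter: hotplugged MOVABLE memory is compared against the KERNEL memory that was present at boot, which is exactly what present_pages alone could not express.

#ifdef CONFIG_MEMORY_HOTPLUG
static bool ratio_allows_movable(struct zone *kernel_zone,
                                 struct zone *movable_zone,
                                 unsigned long nr_new_pages,
                                 unsigned long ratio_percent)
{
        unsigned long kernel_early = kernel_zone->present_early_pages;
        unsigned long movable = movable_zone->present_pages + nr_new_pages;

        /* e.g. ratio_percent == 301: ~3 MOVABLE pages per early KERNEL page */
        return movable * 100 <= kernel_early * ratio_percent;
}
#endif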
Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 14 +++++++------- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ mm/memory_hotplug.c | 14 +++++++++++--- mm/page_alloc.c | 3 +++ 5 files changed, 29 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7..86ec2dc82fc2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -205,7 +205,8 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } @@ -215,24 +216,23 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; - struct zone *zone; int ret; /* * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ - if (nr_vmemmap_pages) { - zone = page_zone(pfn_to_page(start_pfn)); - adjust_present_page_count(zone, -nr_vmemmap_pages); - } + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), + -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, nr_pages - nr_vmemmap_pages); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) - adjust_present_page_count(zone, nr_vmemmap_pages); + adjust_present_page_count(pfn_to_page(start_pfn), + nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf4690..39b04e99a30e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ee3a86830519..1c0e3bf42521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. 
+ * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6ea62efe2a8f..8a99fa6d096c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -724,8 +724,16 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct zone *zone, long nr_pages) +void adjust_present_page_count(struct page *page, long nr_pages) { + struct zone *zone = page_zone(page); + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; } @@ -826,7 +834,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(zone, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1697,7 +1705,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(zone, -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79a2fc5b6c6f..9353418892a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7240,6 +7240,9 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; +#endif totalpages += size; realtotalpages += real_size; -- cgit v1.2.3 From 053cfda102306a3394012f9fe2594811c34925e4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 8 Sep 2021 18:10:11 -0700 Subject: mm/page_alloc.c: avoid accessing uninitialized pcp page migratetype If it's not prepared to free unref page, the pcp page migratetype is unset. Thus we will get rubbish from get_pcppage_migratetype() and might list_del(&page->lru) again after it's already deleted from the list leading to grumble about data corruption. 
Link: https://lkml.kernel.org/r/20210902115447.57050-1-linmiaohe@huawei.com Fixes: df1acc856923 ("mm/page_alloc: avoid conflating IRQs disabled with zone->lock") Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de309a1dfe65..b37435c274cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,8 +3428,10 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); + continue; + } /* * Free isolated pages directly to the allocator, see -- cgit v1.2.3
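To make the failure mode concrete, an annotated sketch of the pre-fix control flow in free_unref_page_list(), paraphrased from the surrounding code rather than quoted exactly:

/*
 *      if (!free_unref_page_prepare(page, pfn, 0))
 *              list_del(&page->lru);                   <- page is already off the list
 *      migratetype = get_pcppage_migratetype(page);    <- never set: rubbish value
 *      if (unlikely(is_migrate_isolate(migratetype)))
 *              list_del(&page->lru);                   <- second del on a deleted entry
 *
 * With CONFIG_DEBUG_LIST the second list_del() hits the poisoned list
 * pointers and reports "list_del corruption"; the added continue skips
 * straight to the next page instead.
 */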