Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            5
-rw-r--r--  mm/memory_hotplug.c   161
-rw-r--r--  mm/sparse.c           2
3 files changed, 157 insertions, 11 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fe4897c3c81b..02d44e3420f5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -188,6 +188,11 @@ config MEMORY_HOTREMOVE
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
+config MHP_MEMMAP_ON_MEMORY
+ def_bool y
+ depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+ depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
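Note that MHP_MEMMAP_ON_MEMORY is not user-selectable: it turns on automatically once an architecture advertises ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE. A hedged sketch of the arch-side opt-in, modeled on the x86 patch elsewhere in this series (the exact hunk is assumed, not shown here):

	# arch/x86/Kconfig (sketch, not part of this diff)
	config X86
		...
		select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE	if X86_64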
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 04f01fabc150..0b3157836814 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,6 +42,8 @@
#include "internal.h"
#include "shuffle.h"
+static bool memmap_on_memory;
+
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +650,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* decide to not expose all pages to the buddy (e.g., expose them
* later). We account all pages as being online and belonging to this
* zone ("present").
+ * When using memmap_on_memory, the range might not be MAX_ORDER_NR_PAGES
+ * aligned, but only pageblock aligned. __ffs() will detect this and the
+ * first chunk to online will be pageblock_nr_pages.
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
- (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+ for (pfn = start_pfn; pfn < end_pfn;) {
+ int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ pfn += (1UL << order);
+ }
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
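To see the chunking concretely: with memmap_on_memory, onlining starts at block_start + nr_vmemmap_pages, which is pageblock aligned but not MAX_ORDER aligned, so the first chunk is small and the rest use the maximum order. A minimal standalone sketch of the loop above, assuming x86-64 defaults (MAX_ORDER - 1 = 10, pageblock_nr_pages = 512, a 128 MiB block whose first 512 pages host its vmemmap); __builtin_ctzl() stands in for __ffs():

	#include <stdio.h>

	int main(void)
	{
		const unsigned long max_order_m1 = 10;	/* assumed x86-64 MAX_ORDER - 1 */
		unsigned long pfn = 512;		/* block start + 512 vmemmap pages */
		const unsigned long end_pfn = 32768;	/* 128 MiB block, 4 KiB pages */

		while (pfn < end_pfn) {
			unsigned long ffs = (unsigned long)__builtin_ctzl(pfn); /* __ffs() */
			unsigned long order = ffs < max_order_m1 ? ffs : max_order_m1;

			printf("online pfn %6lu order %2lu (%4lu pages)\n",
			       pfn, order, 1UL << order);
			pfn += 1UL << order;
		}
		return 0;
	}

The first iteration onlines a single pageblock (order 9); every later pfn is MAX_ORDER aligned, so the remaining 31 chunks are order 10.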
@@ -829,7 +838,11 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-static void adjust_present_page_count(struct zone *zone, long nr_pages)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
{
unsigned long flags;
@@ -839,12 +852,54 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid)
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+ int ret;
+
+ ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+ if (ret)
+ return ret;
+
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections online here as otherwise they will be
+ * left offline.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections offline here as otherwise they will be
+ * left online.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ /*
+ * The pages associated with this vmemmap have been offlined, so
+ * we can reset its state here.
+ */
+ remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+ kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
{
unsigned long flags;
- struct zone *zone;
int need_zonelists_rebuild = 0;
+ const int nid = zone_to_nid(zone);
int ret;
struct memory_notify arg;
@@ -863,7 +918,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
mem_hotplug_begin();
/* associate pfn range with the zone */
- zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
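With the zone now chosen by the caller rather than by online_pages() itself, onlining a block that hosts its own memmap splits into three steps: initialize the vmemmap pages, online the remainder of the block, then account the vmemmap pages as present. A hedged sketch of the expected caller, modeled on the companion drivers/base/memory.c change in this series (memory_block_online() and mem->nr_vmemmap_pages come from that patch, so treat the details as assumptions):

	static int memory_block_online(struct memory_block *mem)
	{
		unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
		unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
		unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
		struct zone *zone;
		int ret;

		zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);

		/* The vmemmap pages must be initialized before the pages they describe. */
		if (nr_vmemmap_pages) {
			ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
			if (ret)
				return ret;
		}

		ret = online_pages(start_pfn + nr_vmemmap_pages,
				   nr_pages - nr_vmemmap_pages, zone);
		if (ret) {
			if (nr_vmemmap_pages)
				mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
			return ret;
		}

		/* Account the vmemmap pages as present in the hosting zone, too. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(zone, nr_vmemmap_pages);

		return ret;
	}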
@@ -1077,6 +1131,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+ unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+ unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+ unsigned long remaining_size = size - vmemmap_size;
+
+ /*
+ * Besides having arch support and the feature enabled at runtime, we
+ * need a few more assumptions to hold true:
+ *
+ * a) We span a single memory block: memory onlining/offlining happens
+ *    at memory block granularity. We don't want the vmemmap of online
+ * memory blocks to reside on offline memory blocks. In the future,
+ * we might want to support variable-sized memory blocks to make the
+ * feature more versatile.
+ *
+ * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * to populate memory from the altmap for unrelated parts (i.e.,
+ * other memory blocks)
+ *
+ * c) The vmemmap pages (and thereby the pages that will be exposed to
+ *    the buddy) have to cover full pageblocks: memory onlining/offlining
+ *    code requires applicable ranges to be pageblock-aligned, for
+ *    example, to set the migratetypes properly.
+ *
+ * TODO: Although we have a check here to make sure that vmemmap pages
+ *       fully populate a PMD, this is not the right place to check it.
+ *       A much better solution involves improving the vmemmap code to
+ *       fall back to base pages when an altmap-backed range does not
+ *       exactly populate a single PMD.
+ */
+ return memmap_on_memory &&
+ IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+ size == memory_block_size_bytes() &&
+ IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+ IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
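The alignment checks above reduce to plain arithmetic. A small standalone sketch, assuming x86-64 values (4 KiB pages, a 64-byte struct page, 2 MiB PMDs, 512-page pageblocks) and omitting the runtime flag and the memory_block_size_bytes() comparison:

	#include <stdio.h>
	#include <stdbool.h>

	#define PAGE_SIZE		4096UL
	#define STRUCT_PAGE_SIZE	64UL			/* assumed typical value */
	#define PMD_SIZE		(2UL << 20)
	#define PAGEBLOCK_BYTES		(512UL * PAGE_SIZE)

	static bool vmemmap_geometry_ok(unsigned long size)
	{
		unsigned long vmemmap_size = (size / PAGE_SIZE) * STRUCT_PAGE_SIZE;
		unsigned long remaining_size = size - vmemmap_size;

		return vmemmap_size % PMD_SIZE == 0 &&
		       remaining_size % PAGEBLOCK_BYTES == 0;
	}

	int main(void)
	{
		unsigned long block = 128UL << 20;	/* one 128 MiB memory block */
		unsigned long vmemmap = (block / PAGE_SIZE) * STRUCT_PAGE_SIZE;

		printf("vmemmap %lu MiB, usable %lu MiB, ok=%d\n",
		       vmemmap >> 20, (block - vmemmap) >> 20,
		       vmemmap_geometry_ok(block));
		return 0;
	}

For a 128 MiB block this gives a 2 MiB vmemmap (exactly one PMD) and a 126 MiB remainder, both properly aligned, so the feature applies.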
@@ -1086,6 +1179,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ struct vmem_altmap mhp_altmap = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1112,13 +1206,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
new_node = ret;
+ /*
+ * Self-hosted memmap array
+ */
+ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+ if (!mhp_supports_memmap_on_memory(size)) {
+ ret = -EINVAL;
+ goto error;
+ }
+ mhp_altmap.free = PHYS_PFN(size);
+ mhp_altmap.base_pfn = PHYS_PFN(start);
+ params.altmap = &mhp_altmap;
+ }
+
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size);
+ ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
if (ret) {
arch_remove_memory(nid, start, size, NULL);
goto error;
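For orientation: a hot-add path opts in by passing the new flag once it knows the range qualifies. A hedged sketch modeled on the ACPI driver change in the same series (the exact call site and variable names are assumptions):

	/* e.g. in the acpi_memhotplug hot-add path - sketch, not part of this diff */
	mhp_t mhp_flags = MHP_NONE;

	if (mhp_supports_memmap_on_memory(info->length))
		mhp_flags |= MHP_MEMMAP_ON_MEMORY;
	result = __add_memory(node, info->start_addr, info->length, mhp_flags);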
@@ -1767,6 +1874,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+ /*
+ * A nonzero return value stops the walk; a block without vmemmap
+ * pages returns 0, so the walk continues with the next block.
+ */
+ return mem->nr_vmemmap_pages;
+}
+
static int check_cpu_on_node(pg_data_t *pgdat)
{
int cpu;
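The callback above relies on the walk_memory_blocks() contract: blocks are visited in order, and the first nonzero callback return aborts the walk and becomes its return value. A standalone sketch of that contract with stand-in types (the real function takes a start/size range, not an array):

	#include <stdio.h>

	struct memory_block { unsigned long nr_vmemmap_pages; };

	static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
	{
		return mem->nr_vmemmap_pages;	/* 0 keeps the walk going */
	}

	static int walk_memory_blocks(struct memory_block *blocks, int n, void *arg,
				      int (*func)(struct memory_block *, void *))
	{
		for (int i = 0; i < n; i++) {
			int ret = func(&blocks[i], arg);

			if (ret)
				return ret;	/* first nonzero value wins */
		}
		return 0;
	}

	int main(void)
	{
		struct memory_block blocks[] = { { 0 }, { 512 }, { 0 } };

		printf("nr_vmemmap_pages = %d\n",
		       walk_memory_blocks(blocks, 3, NULL, get_nr_vmemmap_pages_cb));
		return 0;
	}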
@@ -1841,6 +1956,9 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ unsigned long nr_vmemmap_pages;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1853,6 +1971,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
if (rc)
return rc;
+ /*
+ * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+ * the same granularity it was added - a single memory block.
+ */
+ if (memmap_on_memory) {
+ nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+ get_nr_vmemmap_pages_cb);
+ if (nr_vmemmap_pages) {
+ if (size != memory_block_size_bytes()) {
+ pr_warn("Refuse to remove %#llx - %#llx,"
+ "wrong granularity\n",
+ start, start + size);
+ return -EINVAL;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table do the
+ * right thing if we used vmem_altmap when hot-adding
+ * the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+ }
+ }
+
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1864,7 +2007,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(nid, start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
diff --git a/mm/sparse.c b/mm/sparse.c
index 33406ea2ecc4..d3fbed26e64e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
ms->section_mem_map &= ~SECTION_IS_ONLINE;
}
}
-#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,