From 2c91f8fc6c999fe10185d8ad99fda1759f662f70 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Nov 2019 17:34:57 -0800 Subject: mm/memory_hotplug: fix try_offline_node() try_offline_node() is pretty much broken right now: - The node span is updated when onlining memory, not when adding it. We ignore memory that was mever onlined. Bad. - We touch possible garbage memmaps. The pfn_to_nid(pfn) can easily trigger a kernel panic. Bad for memory that is offline but also bad for subsection hotadd with ZONE_DEVICE, whereby the memmap of the first PFN of a section might contain garbage. - Sections belonging to mixed nodes are not properly considered. As memory blocks might belong to multiple nodes, we would have to walk all pageblocks (or at least subsections) within present sections. However, we don't have a way to identify whether a memmap that is not online was initialized (relevant for ZONE_DEVICE). This makes things more complicated. Luckily, we can piggy pack on the node span and the nid stored in memory blocks. Currently, the node span is grown when calling move_pfn_range_to_zone() - e.g., when onlining memory, and shrunk when removing memory, before calling try_offline_node(). Sysfs links are created via link_mem_sections(), e.g., during boot or when adding memory. If the node still spans memory or if any memory block belongs to the nid, we don't set the node offline. As memory blocks that span multiple nodes cannot get offlined, the nid stored in memory blocks is reliable enough (for such online memory blocks, the node still spans the memory). Introduce for_each_memory_block() to efficiently walk all memory blocks. Note: We will soon stop shrinking the ZONE_DEVICE zone and the node span when removing ZONE_DEVICE memory to fix similar issues (access of garbage memmaps) - until we have a reliable way to identify whether these memmaps were properly initialized. This implies later, that once a node had ZONE_DEVICE memory, we won't be able to set a node offline - which should be acceptable. Since commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") memory that is added is not assoziated with a zone/node (memmap not initialized). The introducing commit 60a5a19e7419 ("memory-hotplug: remove sysfs file of node") already missed that we could have multiple nodes for a section and that the zone/node span is updated when onlining pages, not when adding them. I tested this by hotplugging two DIMMs to a memory-less and cpu-less NUMA node. The node is properly onlined when adding the DIMMs. When removing the DIMMs, the node is properly offlined. Masayoshi Mizuma reported: : Without this patch, memory hotplug fails as panic: : : BUG: kernel NULL pointer dereference, address: 0000000000000000 : ... : Call Trace: : remove_memory_block_devices+0x81/0xc0 : try_remove_memory+0xb4/0x130 : __remove_memory+0xa/0x20 : acpi_memory_device_remove+0x84/0x100 : acpi_bus_trim+0x57/0x90 : acpi_bus_trim+0x2e/0x90 : acpi_device_hotplug+0x2b2/0x4d0 : acpi_hotplug_work_fn+0x1a/0x30 : process_one_work+0x171/0x380 : worker_thread+0x49/0x3f0 : kthread+0xf8/0x130 : ret_from_fork+0x35/0x40 [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20191102120221.7553-1-david@redhat.com Link: http://lkml.kernel.org/r/20191028105458.28320-1-david@redhat.com Fixes: 60a5a19e7419 ("memory-hotplug: remove sysfs file of node") Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") # visiable after d0dc12e86b319 Signed-off-by: David Hildenbrand Tested-by: Masayoshi Mizuma Cc: Tang Chen Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Keith Busch Cc: Jiri Olsa Cc: "Peter Zijlstra (Intel)" Cc: Jani Nikula Cc: Nayna Jain Cc: Michal Hocko Cc: Oscar Salvador Cc: Stephen Rothwell Cc: Dan Williams Cc: Pavel Tatashin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'drivers/base/memory.c') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 55907c27075b..84c4e1f72cbd 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -872,3 +872,39 @@ int walk_memory_blocks(unsigned long start, unsigned long size, } return ret; } + +struct for_each_memory_block_cb_data { + walk_memory_blocks_func_t func; + void *arg; +}; + +static int for_each_memory_block_cb(struct device *dev, void *data) +{ + struct memory_block *mem = to_memory_block(dev); + struct for_each_memory_block_cb_data *cb_data = data; + + return cb_data->func(mem, cb_data->arg); +} + +/** + * for_each_memory_block - walk through all present memory blocks + * + * @arg: argument passed to func + * @func: callback for each memory block walked + * + * This function walks through all present memory blocks, calling func on + * each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. + */ +int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) +{ + struct for_each_memory_block_cb_data cb_data = { + .func = func, + .arg = arg, + }; + + return bus_for_each_dev(&memory_subsys, NULL, &cb_data, + for_each_memory_block_cb); +} -- cgit v1.2.3 From feec24a6139d4640c6ef344e0271a8cd4d509e60 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 30 Nov 2019 17:53:38 -0800 Subject: mm, soft-offline: convert parameter to pfn Currently soft_offline_page() receives struct page, and its sibling memory_failure() receives pfn. This discrepancy looks weird and makes precheck on pfn validity tricky. So let's align them. Link: http://lkml.kernel.org/r/20191016234706.GA5493@www9186uo.sakura.ne.jp Signed-off-by: Naoya Horiguchi Acked-by: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 7 +------ include/linux/mm.h | 2 +- mm/madvise.c | 2 +- mm/memory-failure.c | 19 +++++++++---------- 4 files changed, 12 insertions(+), 18 deletions(-) (limited to 'drivers/base/memory.c') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 84c4e1f72cbd..d65ecdeb83e8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -538,12 +538,7 @@ static ssize_t soft_offline_page_store(struct device *dev, if (kstrtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - if (!pfn_valid(pfn)) - return -ENXIO; - /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ - if (!pfn_to_online_page(pfn)) - return -EIO; - ret = soft_offline_page(pfn_to_page(pfn), 0); + ret = soft_offline_page(pfn, 0); return ret == 0 ? count : ret; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 06b51d8728ec..19a0e687878a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2773,7 +2773,7 @@ extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; -extern int soft_offline_page(struct page *page, int flags); +extern int soft_offline_page(unsigned long pfn, int flags); /* diff --git a/mm/madvise.c b/mm/madvise.c index 94c343b4c968..63e130800570 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -895,7 +895,7 @@ static int madvise_inject_error(int behavior, pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = soft_offline_page(page, MF_COUNT_INCREASED); + ret = soft_offline_page(pfn, MF_COUNT_INCREASED); if (ret) return ret; continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 05c8c6df25e6..af2712004a4d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1476,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work) if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) - soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + soft_offline_page(entry.pfn, entry.flags); else memory_failure(entry.pfn, entry.flags); } @@ -1857,7 +1857,7 @@ static int soft_offline_free_page(struct page *page) /** * soft_offline_page - Soft offline a page. - * @page: page to offline + * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * * Returns 0 on success, otherwise negated errno. @@ -1877,18 +1877,17 @@ static int soft_offline_free_page(struct page *page) * This is not a 100% solution for all memory, but tries to be * ``good enough'' for the majority of memory. */ -int soft_offline_page(struct page *page, int flags) +int soft_offline_page(unsigned long pfn, int flags) { int ret; - unsigned long pfn = page_to_pfn(page); + struct page *page; - if (is_zone_device_page(page)) { - pr_debug_ratelimited("soft_offline: %#lx page is device page\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_page(page); + if (!pfn_valid(pfn)) + return -ENXIO; + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ + page = pfn_to_online_page(pfn); + if (!page) return -EIO; - } if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); -- cgit v1.2.3 From 848e19ad3c3352b6e0906f05b282a3e22c67c98f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:14 -0800 Subject: drivers/base/memory.c: drop the mem_sysfs_mutex The mem_sysfs_mutex isn't really helpful. Also, it's not really clear what the mutex protects at all. The device lists of the memory subsystem are protected separately. We don't need that mutex when looking up. creating, or removing independent devices. find_memory_block_by_id() will perform locking on its own and grab a reference of the returned device. At the time memory_dev_init() is called, we cannot have concurrent hot(un)plug operations yet - we're still fairly early during boot. We don't need any locking. The creation/removal of memory block devices should be protected on a higher level - especially using the device hotplug lock to avoid documented issues (see Documentation/core-api/memory-hotplug.rst) - or if that is reworked, using similar locking. Protecting in the context of these functions only doesn't really make sense. Especially, if we would have a situation where the same memory blocks are created/deleted at the same time, there is something horribly going wrong (imagining adding/removing a DIMM at the same time from two call paths) - after the functions succeeded something else in the callers would blow up (e.g., create_memory_block_devices() succeeded but there are no memory block devices anymore). All relevant call paths (except when adding memory early during boot via ACPI, which is now documented) hold the device hotplug lock when adding memory, and when removing memory. Let's document that instead. Add a simple safety net to create_memory_block_devices() in case we would actually remove memory blocks while adding them, so we'll never dereference a NULL pointer. Simplify memory_dev_init() now that the lock is gone. Link: http://lkml.kernel.org/r/20190925082621.4927-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'drivers/base/memory.c') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index d65ecdeb83e8..799b43191dea 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,15 +19,12 @@ #include #include #include -#include #include #include #include #include -static DEFINE_MUTEX(mem_sysfs_mutex); - #define MEMORY_CLASS_NAME "memory" #define to_memory_block(dev) container_of(dev, struct memory_block, dev) @@ -700,6 +697,8 @@ static void unregister_memory(struct memory_block *memory) * Create memory block devices for the given memory area. Start and size * have to be aligned to memory block granularity. Memory block devices * will be initialized as offline. + * + * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size) { @@ -713,7 +712,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) !IS_ALIGNED(size, memory_block_size_bytes()))) return -EINVAL; - mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) @@ -725,11 +723,12 @@ int create_memory_block_devices(unsigned long start, unsigned long size) for (block_id = start_block_id; block_id != end_block_id; block_id++) { mem = find_memory_block_by_id(block_id); + if (WARN_ON_ONCE(!mem)) + continue; mem->section_count = 0; unregister_memory(mem); } } - mutex_unlock(&mem_sysfs_mutex); return ret; } @@ -737,6 +736,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size) * Remove memory block devices for the given memory area. Start and size * have to be aligned to memory block granularity. Memory block devices * have to be offline. + * + * Called under device_hotplug_lock. */ void remove_memory_block_devices(unsigned long start, unsigned long size) { @@ -749,7 +750,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) !IS_ALIGNED(size, memory_block_size_bytes()))) return; - mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) @@ -758,7 +758,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) unregister_memory_block_under_nodes(mem); unregister_memory(mem); } - mutex_unlock(&mem_sysfs_mutex); } /* return true if the memory block is offlined, otherwise, return false */ @@ -792,12 +791,13 @@ static const struct attribute_group *memory_root_attr_groups[] = { }; /* - * Initialize the sysfs support for memory devices... + * Initialize the sysfs support for memory devices. At the time this function + * is called, we cannot have concurrent creation/deletion of memory block + * devices, the device_hotplug_lock is not needed. */ void __init memory_dev_init(void) { int ret; - int err; unsigned long block_sz, nr; /* Validate the configured memory block size */ @@ -808,24 +808,19 @@ void __init memory_dev_init(void) ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) - goto out; + panic("%s() failed to register subsystem: %d\n", __func__, ret); /* * Create entries for memory sections that were found * during boot and have been initialized */ - mutex_lock(&mem_sysfs_mutex); for (nr = 0; nr <= __highest_present_section_nr; nr += sections_per_block) { - err = add_memory_block(nr); - if (!ret) - ret = err; + ret = add_memory_block(nr); + if (ret) + panic("%s() failed to add memory block: %d\n", __func__, + ret); } - mutex_unlock(&mem_sysfs_mutex); - -out: - if (ret) - panic("%s() failed: %d\n", __func__, ret); } /** -- cgit v1.2.3