From 5033091de814ab4b5623faed2755f3064e19e2d2 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Mon, 24 Oct 2022 15:20:12 +0900
Subject: mm/hwpoison: introduce per-memory_block hwpoison counter

Currently the PageHWPoison flag does not behave well when experiencing
memory hotremove/hotplug.  Any data field in struct page is unreliable
when the associated memory is offlined, and the current mechanism can't
tell whether a memory block is onlined because a new memory device is
installed or because previous failed offline operations are undone.
Especially if there's hwpoisoned memory, it's unclear what the best
option is.

So introduce a new mechanism to make struct memory_block remember that
a memory block has hwpoisoned memory inside it.  And make any online
event fail if the memory block being onlined contains hwpoison.  struct
memory_block is freed and reallocated over ACPI-based hotremove/hotplug,
but not over sysfs-based hotremove/hotplug, so the new counter can
distinguish these cases.

Link: https://lkml.kernel.org/r/20221024062012.1520887-5-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi
Reported-by: kernel test robot
Reviewed-by: Miaohe Lin
Cc: David Hildenbrand
Cc: Jane Chu
Cc: Mike Kravetz
Cc: Muchun Song
Cc: Oscar Salvador
Cc: Yang Shi
Signed-off-by: Andrew Morton
---
 mm/memory-failure.c | 36 +++++++++++------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757a46e172de..9b82402ec242 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -77,11 +77,14 @@ static bool hw_memory_failure __read_mostly = false;
 inline void num_poisoned_pages_inc(unsigned long pfn)
 {
 	atomic_long_inc(&num_poisoned_pages);
+	memblk_nr_poison_inc(pfn);
 }
 
-static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 {
 	atomic_long_sub(i, &num_poisoned_pages);
+	if (pfn != -1UL)
+		memblk_nr_poison_sub(pfn, i);
 }
 
 /*
@@ -1706,6 +1709,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
 
 		if (move_flag)
 			SetPageHWPoison(p->page);
+		else
+			num_poisoned_pages_sub(page_to_pfn(p->page), 1);
 		kfree(p);
 		count++;
 	}
@@ -2332,6 +2337,7 @@ int unpoison_memory(unsigned long pfn)
 	int ret = -EBUSY;
 	int freeit = 0;
 	unsigned long count = 1;
+	bool huge = false;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
 
@@ -2380,6 +2386,7 @@ int unpoison_memory(unsigned long pfn)
 	ret = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ret) {
 		if (PageHuge(p)) {
+			huge = true;
 			count = free_raw_hwp_pages(page, false);
 			if (count == 0) {
 				ret = -EBUSY;
@@ -2395,6 +2402,7 @@ int unpoison_memory(unsigned long pfn)
 			 pfn, &unpoison_rs);
 	} else {
 		if (PageHuge(p)) {
+			huge = true;
 			count = free_raw_hwp_pages(page, false);
 			if (count == 0) {
 				ret = -EBUSY;
@@ -2414,7 +2422,8 @@ int unpoison_memory(unsigned long pfn)
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	if (!ret || freeit) {
-		num_poisoned_pages_sub(pfn, count);
+		if (!huge)
+			num_poisoned_pages_sub(pfn, 1);
 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 page_to_pfn(p), &unpoison_rs);
 	}
@@ -2609,26 +2618,3 @@ retry:
 
 	return ret;
 }
-
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-	int i, total = 0;
-
-	/*
-	 * A further optimization is to have per section refcounted
-	 * num_poisoned_pages.  But that would need more space per memmap, so
-	 * for now just do a quick global check to speed up this routine in the
-	 * absence of bad pages.
-	 */
-	if (atomic_long_read(&num_poisoned_pages) == 0)
-		return;
-
-	for (i = 0; i < nr_pages; i++) {
-		if (PageHWPoison(&memmap[i])) {
-			total++;
-			ClearPageHWPoison(&memmap[i]);
-		}
-	}
-	if (total)
-		num_poisoned_pages_sub(0, total);
-}
-- 
cgit v1.2.3
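
[Editor's note] The hunks above call memblk_nr_poison_inc() and
memblk_nr_poison_sub(), whose definitions fall outside this excerpt
(it is limited to mm/memory-failure.c).  A minimal sketch of what the
drivers/base/memory.c side could look like, assuming struct memory_block
gains an atomic_long_t nr_hwpoison field and reusing the existing
pfn_to_block_id() and find_memory_block_by_id() helpers from that file;
the field name and bodies below are illustrative, not the verbatim
implementation:

/*
 * Sketch only: per-memory_block poison accounting as it might appear
 * in drivers/base/memory.c.  The nr_hwpoison field in struct
 * memory_block is an assumption not shown in this excerpt.
 */
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	/* The block may already be gone during hotremove. */
	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

With such a counter in place, the online path can refuse a memory block
whose counter is nonzero (e.g. by returning -EHWPOISON from
memory_block_online()), which is what makes any online event fail for a
block that still contains hwpoison, as described in the commit message.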