diff options
author | Michal Marek <mmarek@suse.cz> | 2011-03-09 18:15:44 +0300 |
---|---|---|
committer | Michal Marek <mmarek@suse.cz> | 2011-03-09 18:15:44 +0300 |
commit | 2d8ad8719591fa803b0d589ed057fa46f49b7155 (patch) | |
tree | 4ae051577dad1161c91dafbf4207bb10a9dc91bb /mm/memory-failure.c | |
parent | 9b4ce7bce5f30712fd926ab4599a803314a07719 (diff) | |
parent | c56eb8fb6dccb83d9fe62fd4dc00c834de9bc470 (diff) | |
download | linux-2d8ad8719591fa803b0d589ed057fa46f49b7155.tar.xz |
Merge commit 'v2.6.38-rc1' into kbuild/packaging
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 347 |
1 files changed, 262 insertions, 85 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd4577c..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -44,6 +48,10 @@ #include <linux/migrate.h> #include <linux/page-isolation.h> #include <linux/suspend.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> +#include <linux/memory_hotplug.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -75,7 +83,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; @@ -180,7 +188,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter); * signal. */ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) + unsigned long pfn, struct page *page) { struct siginfo si; int ret; @@ -195,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -232,7 +240,7 @@ void shake_page(struct page *p, int access) int nr; do { nr = shrink_slab(1000, GFP_KERNEL, 1000); - if (page_count(p) == 0) + if (page_count(p) == 1) break; } while (nr > 10); } @@ -265,7 +273,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -306,7 +314,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -324,7 +332,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) + int fail, struct page *page, unsigned long pfn) { struct to_kill *tk, *next; @@ -349,7 +357,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * process anyways. */ else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + pfn, page) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -378,14 +386,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; + if (!PageHuge(page) && unlikely(split_huge_page(page))) + return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ goto out; for_each_process (tsk) { + struct anon_vma_chain *vmac; + if (!task_early_kill(tsk)) continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { + list_for_each_entry(vmac, &av->head, same_anon_vma) { + vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == tsk->mm) @@ -571,7 +584,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -685,17 +698,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) /* * Huge pages. Needs work. * Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. */ static int me_huge_page(struct page *p, unsigned long pfn) { - return FAILED; + int res = 0; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; + } + return DELAYED; } /* @@ -818,8 +843,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -831,8 +854,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; + struct page *hpage = compound_head(p); if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -841,10 +864,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(p)) + if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageCompound(p) || PageKsm(p)) + if (PageKsm(p)) return SWAP_FAIL; if (PageSwapCache(p)) { @@ -859,10 +882,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(p); - if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { - if (page_mkclean(p)) { - SetPageDirty(p); + mapping = page_mapping(hpage); + if (!PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; @@ -881,22 +905,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(p, &tokill); - - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(p, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } + collect_procs(hpage, &tokill); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -907,17 +921,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(p), trapno, - ret != SWAP_SUCCESS, pfn); + kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + ret != SWAP_SUCCESS, p, pfn); return ret; } +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_trans_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_trans_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + int __memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; + struct page *hpage; int res; + unsigned int nr_pages; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -930,18 +962,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); + hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } - atomic_long_add(1, &mce_bad_pages); + nr_pages = 1 << compound_trans_order(hpage); + atomic_long_add(nr_pages, &mce_bad_pages); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -949,10 +986,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(compound_head(p))) { + !get_page_unless_zero(hpage)) { if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -967,9 +1022,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p)) + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); - if (!PageLRU(p)) { + if (!PageLRU(p) && !PageHuge(p)) { /* * shake_page could have turned it free. */ @@ -987,7 +1042,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(p); + lock_page_nosync(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -999,12 +1054,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); - unlock_page(p); - put_page(p); + atomic_long_sub(nr_pages, &mce_bad_pages); + unlock_page(hpage); + put_page(hpage); return 0; } + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, "hugepage already hardware poisoned", + IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + wait_on_page_writeback(p); /* @@ -1034,7 +1109,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) } } out: - unlock_page(p); + unlock_page(hpage); return res; } EXPORT_SYMBOL_GPL(__memory_failure); @@ -1078,6 +1153,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + unsigned int nr_pages; if (!pfn_valid(pfn)) return -ENXIO; @@ -1086,14 +1162,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } + nr_pages = 1 << compound_trans_order(page); + if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) - atomic_long_dec(&mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + atomic_long_sub(nr_pages, &mce_bad_pages); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1104,10 +1192,12 @@ int unpoison_memory(unsigned long pfn) * the PG_hwpoison page will be caught and isolated on the entrance to * the free buddy page pool. */ - if (TestClearPageHWPoison(p)) { - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_dec(&mce_bad_pages); + if (TestClearPageHWPoison(page)) { + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1122,7 +1212,11 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1139,25 +1233,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_system_sleep prevents a race with memory hotplug, - * because the isolation assumes there's only a single user. + * The lock_memory_hotplug prevents a race with memory hotplug. * This is a big hammer, a better would be nicer. */ - lock_system_sleep(); + lock_memory_hotplug(); /* * Isolate the page, so that it doesn't get reallocated if it * was free. */ set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ if (!get_page_unless_zero(compound_head(p))) { - if (is_free_buddy_page(p)) { - pr_debug("get_any_page: %#lx free buddy page\n", pfn); + if (PageHuge(p)) { + pr_info("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { + pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } @@ -1166,7 +1266,48 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ret = 1; } unset_migratetype_isolate(p); - unlock_system_sleep(); + unlock_memory_hotplug(); + return ret; +} + +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); + if (ret) { + putback_lru_pages(&pagelist); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ return ret; } @@ -1197,6 +1338,9 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; @@ -1223,7 +1367,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } @@ -1237,7 +1381,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); - pr_debug("soft offline: %#lx page already poisoned\n", pfn); + pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1258,7 +1402,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; - pr_debug("soft_offline: %#lx: invalidated\n", pfn); + pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } @@ -1272,15 +1416,16 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) @@ -1292,3 +1437,35 @@ done: /* keep elevated page count for bad page */ return ret; } + +/* + * The caller must hold current->mm->mmap_sem in read mode. + */ +int is_hwpoison_address(unsigned long addr) +{ + pgd_t *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t pte, *ptep; + swp_entry_t entry; + + pgdp = pgd_offset(current->mm, addr); + if (!pgd_present(*pgdp)) + return 0; + pudp = pud_offset(pgdp, addr); + pud = *pudp; + if (!pud_present(pud) || pud_large(pud)) + return 0; + pmdp = pmd_offset(pudp, addr); + pmd = *pmdp; + if (!pmd_present(pmd) || pmd_large(pmd)) + return 0; + ptep = pte_offset_map(pmdp, addr); + pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) + return 0; + entry = pte_to_swp_entry(pte); + return is_hwpoison_entry(entry); +} +EXPORT_SYMBOL_GPL(is_hwpoison_address); |