diff options
author | Tony Luck <tony.luck@intel.com> | 2015-06-25 02:58:09 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 03:49:44 +0300 |
commit | fc6daaf93151877748f8096af6b3fddb147f22d6 (patch) | |
tree | 1892f34cca08d40af6598bccae87c42037c5ea80 /arch | |
parent | 6afdb859b71019143b8eecda02b8b29b03185055 (diff) | |
download | linux-fc6daaf93151877748f8096af6b3fddb147f22d6.tar.xz |
mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute
Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check. Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).
But we have no recovery path for errors encountered during kernel code
execution. Except for some very specific cases were are unlikely to ever
be able to recover.
Enter memory mirroring. Actually 3rd generation of memory mirroing.
Gen1: All memory is mirrored
Pro: No s/w enabling - h/w just gets good data from other side of the
mirror
Con: Halves effective memory capacity available to OS/applications
Gen2: Partial memory mirror - just mirror memory begind some memory controllers
Pro: Keep more of the capacity
Con: Nightmare to enable. Have to choose between allocating from
mirrored memory for safety vs. NUMA local memory for performance
Gen3: Address range partial memory mirror - some mirror on each memory
controller
Pro: Can tune the amount of mirror and keep NUMA performance
Con: I have to write memory management code to implement
The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:
1) This patch series - find the mirrored memory, use it for boot time
allocations
2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
unused mirrored memory from mm/memblock.c and only give it out to
select kernel allocations (this is still being scoped because
page_alloc.c is scary).
This patch (of 3):
Add extra "flags" to memblock to allow selection of memory based on
attribute. No functional changes
Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/s390/kernel/crash_dump.c | 5 | ||||
-rw-r--r-- | arch/sparc/mm/init_64.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/check.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/e820.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 2 |
5 files changed, 12 insertions, 7 deletions
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d9f0dcfcae5e..7a75ad4594e3 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = { }; #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \ - for (i = 0, __next_mem_range(&i, nid, &memblock.physmem, \ + for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE, \ + &memblock.physmem, \ &oldmem_type, p_start, \ p_end, p_nid); \ i != (u64)ULLONG_MAX; \ - __next_mem_range(&i, nid, &memblock.physmem, \ + __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\ &oldmem_type, \ p_start, p_end, p_nid)) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index c5d08b89a96c..4ac88b757514 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void) phys_addr_t pa_start, pa_end; u64 i; - for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, + &pa_end, NULL) available = available + (pa_end - pa_start); return available; @@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram) if (limit_ram >= avail_ram) return; - for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start, + &pa_end, NULL) { phys_addr_t region_size = pa_end - pa_start; phys_addr_t clip_start = pa_start; diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 83a7995625a6..58118e207a69 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), PAGE_SIZE, corruption_check_size); end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e2ce85db2283..c8dda42cb6a3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void) nr_pages += end_pfn - start_pfn; } - for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); if (start_pfn < end_pfn) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c8140e12816a..8340e45c891a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid, phys_addr_t start, end; u64 i; - for_each_free_mem_range(i, nid, &start, &end, NULL) { + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), start_pfn, end_pfn); unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), |