From 4c29700ed9908c15feeb84a40a415f4e921c5a66 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Sat, 30 Nov 2019 17:54:20 -0800 Subject: mm/sparse: consistently do not zero memmap sparsemem without VMEMMAP has two allocation paths to allocate the memory needed for its memmap (done in sparse_mem_map_populate()). In one allocation path (sparse_buffer_alloc() succeeds), the memory is not zeroed (since it was previously allocated with memblock_alloc_try_nid_raw()). In the other allocation path (sparse_buffer_alloc() fails and sparse_mem_map_populate() falls back to memblock_alloc_try_nid()), the memory is zeroed. AFAICS this difference does not appear to be on purpose. If the code is supposed to work with non-initialized memory (__init_single_page() takes care of zeroing the struct pages which are actually used), we should consistently not zero the memory, to avoid masking bugs. ( I noticed this because on my ARM64 platform, with 1 GiB of memory the first [and only] section is allocated from the zeroing path while with 2 GiB of memory the first 1 GiB section is allocated from the non-zeroing path. ) Michal: "the main user visible problem is a memory wastage. The overal amount of memory should be small. I wouldn't call it stable material." Link: http://lkml.kernel.org/r/20191030131122.8256-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index f6891c1992b1..01e467adc219 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -458,7 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid(size, + map = memblock_alloc_try_nid_raw(size, PAGE_SIZE, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!map) -- cgit v1.2.3 From 030eab4f9ffb469344c10a46bc02c5149db0a2a9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sat, 30 Nov 2019 17:54:24 -0800 Subject: mm/sparse.c: mark populate_section_memmap as __meminit Building the kernel on s390 with -Og produces the following warning: WARNING: vmlinux.o(.text+0x28dabe): Section mismatch in reference from the function populate_section_memmap() to the function .meminit.text:__populate_section_memmap() The function populate_section_memmap() references the function __meminit __populate_section_memmap(). This is often because populate_section_memmap lacks a __meminit annotation or the annotation of __populate_section_memmap is wrong. While -Og is not supported, in theory this might still happen with another compiler or on another architecture. So fix this by using the correct section annotations. [iii@linux.ibm.com: v2] Link: http://lkml.kernel.org/r/20191030151639.41486-1-iii@linux.ibm.com Link: http://lkml.kernel.org/r/20191028165549.14478-1-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Acked-by: David Hildenbrand Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 01e467adc219..163b4d59cf6c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -647,7 +647,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static struct page *populate_section_memmap(unsigned long pfn, +static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { return __populate_section_memmap(pfn, nr_pages, nid, altmap); @@ -669,7 +669,7 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } #else -struct page *populate_section_memmap(unsigned long pfn, +struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; -- cgit v1.2.3 From 09dbcf422e9b791d2d43cad8c283d9bdaef019a9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 30 Nov 2019 17:54:27 -0800 Subject: mm/sparse.c: do not waste pre allocated memmap space Vincent has noticed [1] that there is something unusual with the memmap allocations going on on his platform : I noticed this because on my ARM64 platform, with 1 GiB of memory the : first [and only] section is allocated from the zeroing path while with : 2 GiB of memory the first 1 GiB section is allocated from the : non-zeroing path. The underlying problem is that although sparse_buffer_init allocates enough memory for all sections on the node sparse_buffer_alloc is not able to consume them due to mismatch in the expected allocation alignement. While sparse_buffer_init preallocation uses the PAGE_SIZE alignment the real memmap has to be aligned to section_map_size() this results in a wasted initial chunk of the preallocated memmap and unnecessary fallback allocation for a section. While we are at it also change __populate_section_memmap to align to the requested size because at least VMEMMAP has constrains to have memmap properly aligned. [1] http://lkml.kernel.org/r/20191030131122.8256-1-vincent.whitchurch@axis.com [akpm@linux-foundation.org: tweak layout, per David] Link: http://lkml.kernel.org/r/20191119092642.31799-1-mhocko@kernel.org Fixes: 35fd1eb1e821 ("mm/sparse: abstract sparse buffer allocations") Signed-off-by: Michal Hocko Reported-by: Vincent Whitchurch Debugged-by: Vincent Whitchurch Acked-by: David Hildenbrand Cc: Pavel Tatashin Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 163b4d59cf6c..8526d3bf1e4e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -458,8 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, - PAGE_SIZE, addr, + map = memblock_alloc_try_nid_raw(size, size, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -482,10 +481,13 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ - sparsemap_buf = - memblock_alloc_try_nid_raw(size, PAGE_SIZE, - addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + /* + * Pre-allocated buffer is mainly used by __populate_section_memmap + * and we want it to be properly aligned to the section size - this is + * especially the case for VMEMMAP which maps memmap to PMDs + */ + sparsemap_buf = memblock_alloc_try_nid_raw(size, section_map_size(), + addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } -- cgit v1.2.3 From 0ac398b171aacd0f0c132d989ec4efb5de94f34a Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:56:27 -0800 Subject: mm: support memblock alloc on the exact node for sparse_buffer_init() sparse_buffer_init() use memblock_alloc_try_nid_raw() to allocate memory for page management structure, if memory allocation fails from specified node, it will fall back to allocate from other nodes. Normally, the page management structure will not exceed 2% of the total memory, but a large continuous block of allocation is needed. In most cases, memory allocation from the specified node will succeed, but a node memory become highly fragmented will fail. we expect to allocate memory base section rather than by allocating a large block of memory from other NUMA nodes Add memblock_alloc_exact_nid_raw() for this situation, which allocate boot memory block on the exact node. If a large contiguous block memory allocate fail in sparse_buffer_init(), it will fall back to allocate small block memory base section. Link: http://lkml.kernel.org/r/66755ea7-ab10-8882-36fd-3e02b03775d5@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Mike Rapoport Cc: Wei Yang Cc: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 +++ mm/memblock.c | 65 ++++++++++++++++++++++++++++++++++++++++-------- mm/sparse.c | 2 +- 3 files changed, 58 insertions(+), 12 deletions(-) (limited to 'mm/sparse.c') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f491690d54c6..b38bbefabfab 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE); } +void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); diff --git a/mm/memblock.c b/mm/memblock.c index 203ed317551b..4bc2c7d8bf42 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1319,12 +1319,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * @start: the lower bound of the memory region to allocate (phys address) * @end: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * The allocation is performed from memory region limited by * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. * - * If the specified node can not hold the requested memory the - * allocation falls back to any node in the system + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. * * For systems with memory mirroring, the allocation is attempted first * from the regions with mirroring enabled and then retried from any @@ -1338,7 +1339,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, + bool exact_nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; @@ -1358,7 +1360,7 @@ again: if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE) { + if (nid != NUMA_NO_NODE && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1406,7 +1408,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t start, phys_addr_t end) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + false); } /** @@ -1425,7 +1428,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_range_nid(size, align, 0, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid, false); } /** @@ -1435,6 +1438,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @min_addr: the lower bound of the memory region to allocate (phys address) * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * Allocates memory block using memblock_alloc_range_nid() and * converts the returned physical address to virtual. @@ -1450,7 +1454,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, - int nid) + int nid, bool exact_nid) { phys_addr_t alloc; @@ -1465,11 +1469,13 @@ static void * __init memblock_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, + exact_nid); /* retry allocation without lower limit */ if (!alloc && min_addr) - alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, + exact_nid); if (!alloc) return NULL; @@ -1477,6 +1483,43 @@ static void * __init memblock_alloc_internal( return phys_to_virt(alloc); } +/** + * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node + * without zeroing memory + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_alloc_exact_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + ptr = memblock_alloc_internal(size, align, + min_addr, max_addr, nid, true); + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + /** * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking @@ -1508,7 +1551,7 @@ void * __init memblock_alloc_try_nid_raw( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr && size > 0) page_init_poison(ptr, size); @@ -1543,7 +1586,7 @@ void * __init memblock_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr) memset(ptr, 0, size); diff --git a/mm/sparse.c b/mm/sparse.c index 8526d3bf1e4e..b20ab7cdac86 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -486,7 +486,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_try_nid_raw(size, section_map_size(), + sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } -- cgit v1.2.3