diff options
author | Dan Williams <dan.j.williams@intel.com> | 2020-02-16 23:01:09 +0300 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2020-02-18 21:28:05 +0300 |
commit | 5d30f92e7631286b8617777c5400c8eadcae50a1 (patch) | |
tree | bd82f903af8f728682556e7e09a9ae8daf9b8f0a | |
parent | 1e5d8e1e47afde23e3249aed25d7d124feff5c1c (diff) | |
download | linux-5d30f92e7631286b8617777c5400c8eadcae50a1.tar.xz |
x86/NUMA: Provide a range-to-target_node lookup facility
The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax
instances, fronting performance-differentiated-memory like pmem, to be
added to the System RAM pool. The NUMA node for that hot-added memory is
derived from the device-dax instance's 'target_node' attribute.
Recall that the 'target_node' is the ACPI-PXM-to-node translation for
memory when it comes online whereas the 'numa_node' attribute of the
device represents the closest online cpu node.
Presently useful target_node information from the ACPI SRAT is discarded
with the expectation that "Reserved" memory will never be onlined. Now,
DEV_DAX_KMEM violates that assumption, there is a need to retain the
translation. Move, rather than discard, numa_memblk data to a secondary
array that memory_add_physaddr_to_target_node() may consider at a later
point in time.
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com
-rw-r--r-- | arch/x86/mm/numa.c | 61 | ||||
-rw-r--r-- | include/linux/numa.h | 14 |
2 files changed, 64 insertions, 11 deletions
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2450b21cc28a..59ba008504dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; @@ -165,6 +166,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) } /** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + +/** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk * @start: Start address of the new memblk @@ -233,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - /* make sure all blocks are inside the limits */ + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty or non-exist block */ - if (bi->start >= bi->end || - !memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) + /* and there's no empty block */ + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); } @@ -877,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) { - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; int i; for (i = 0; i < mi->nr_blks; i++) if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(phys_addr_t start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. + */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); diff --git a/include/linux/numa.h b/include/linux/numa.h index 5773cd2613fc..a42df804679e 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H - +#include <linux/types.h> #ifdef CONFIG_NODES_SHIFT #define NODES_SHIFT CONFIG_NODES_SHIFT @@ -21,12 +21,24 @@ #endif #ifdef CONFIG_NUMA +/* Generic implementation available */ int numa_map_to_online_node(int node); + +/* + * Optional architecture specific implementation, users need a "depends + * on $ARCH" + */ +int phys_to_target_node(phys_addr_t addr); #else static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } + +static inline int phys_to_target_node(phys_addr_t addr) +{ + return NUMA_NO_NODE; +} #endif #endif /* _LINUX_NUMA_H */ |