From 956f8b445061667c3545baa24778f890d1d522f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:16 -0700 Subject: drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE Patch series "mm/memory_hotplug: allow to specify a default online_type", v3. Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules. Every time we hotplug memory, the udev rule will come to the same conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of memory in separate memory blocks and wait for memory to get onlined by user space before continuing to add more memory blocks (to not add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process. To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by - /sys/devices/system/memory/auto_online_blocks - "memhp_default_state=" on the kernel cmdline to be able to specify also "online_movable" as well as "online_kernel" === Example /usr/libexec/config-memhotplug === #!/bin/bash VIRT=`systemd-detect-virt --vm` ARCH=`uname -p` sense_virtio_mem() { if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l` if [ $DEVICES != "0" ]; then return 0 fi fi return 1 } if [ ! -e "/sys/devices/system/memory/auto_online_blocks" ]; then echo "Memory hotplug configuration support missing in the kernel" exit 1 fi if grep "memhp_default_state=" /proc/cmdline > /dev/null; then echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)" exit 1 fi if [ $VIRT == "microsoft" ]; then echo "Detected Hyper-V on $ARCH" # Hyper-V wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif sense_virtio_mem; then echo "Detected virtio-mem on $ARCH" # virtio-mem wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then echo "Detected $ARCH" # standby memory should not be onlined automatically ONLINE_TYPE="offline" elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then echo "Detected" $ARCH # PPC64 onlines all hotplugged memory right from the kernel ONLINE_TYPE="offline" elif [ $VIRT == "none" ]; then echo "Detected bare-metal on $ARCH" # Bare metal users expect hotplugged memory to be unpluggable. We assume # that ZONE imbalances on such enterpise servers cannot happen and is # properly documented ONLINE_TYPE="online_movable" else # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE # imbalances won't happen echo "Detected $VIRT on $ARCH" # Usually, ballooning is used in virtual environments, so memory should go to # ZONE_NORMAL. However, sometimes "movable_node" is relevant. ONLINE_TYPE="online" fi echo "Selected online_type:" $ONLINE_TYPE # Configure what to do with memory that will be hotplugged in the future echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks if [ $? != "0" ]; then echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)" # A backup udev rule should handle old kernels if necessary exit 1 fi # Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or virtio-mem) if [ $ONLINE_TYPE != "offline" ]; then for MEMORY in /sys/devices/system/memory/memory*; do STATE=`cat $MEMORY/state` if [ $STATE == "offline" ]; then echo $ONLINE_TYPE > $MEMORY/state fi done fi === Example /usr/lib/systemd/system/config-memhotplug.service === [Unit] Description=Configure memory hotplug behavior DefaultDependencies=no Conflicts=shutdown.target Before=sysinit.target shutdown.target After=systemd-modules-load.service ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks [Service] ExecStart=/usr/libexec/config-memhotplug Type=oneshot TimeoutSec=0 RemainAfterExit=yes [Install] WantedBy=sysinit.target === Example modification to the 40-redhat.rules [2] === : diff --git a/40-redhat.rules b/40-redhat.rules-new : index 2c690e5..168fd03 100644 : --- a/40-redhat.rules : +++ b/40-redhat.rules-new : @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online} : # Memory hotadd request : SUBSYSTEM!="memory", GOTO="memory_hotplug_end" : ACTION!="add", GOTO="memory_hotplug_end" : +# memory hotplug behavior configured : +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end" : + : PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end" : : ENV{.state}="online" === [1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules This patch (of 8): The name is misleading and it's not really clear what is "kept". Let's just name it like the online_type name we expose to user space ("online"). Add some documentation to the types. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Vitaly Kuznetsov Cc: Yumei Huang Cc: Igor Mammedov Cc: Eduardo Habkost Cc: Benjamin Herrenschmidt Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Michael Ellerman (powerpc) Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Wei Liu Link: http://lkml.kernel.org/r/20200319131221.14044-1-david@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-2-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d4..261dbf010d5d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { + /* Offline the memory. */ MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, }; -- cgit v1.2.3 From efc978ad0e05ed6401c7854811750bf55b67f4b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:20 -0700 Subject: drivers/base/memory: map MMOP_OFFLINE to 0 Historically, we used the value -1. Just treat 0 as the special case now. Clarify a comment (which was wrong, when we come via device_online() the first time, the online_type would have been 0 / MEM_ONLINE). The default is now always MMOP_OFFLINE. This removes the last user of the manual "-1", which didn't use the enum value. This is a preparation to use the online_type as an array index. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-3-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 ++++------- include/linux/memory_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 156b89b14fcc..f65f3d53dc64 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -203,17 +203,14 @@ static int memory_subsys_online(struct device *dev) return 0; /* - * If we are called from state_store(), online_type will be - * set >= 0 Otherwise we were called from the device online - * attribute and need to set the online_type. + * When called via device_online() without configuring the online_type, + * we want to default to MMOP_ONLINE. */ - if (mem->online_type < 0) + if (mem->online_type == MMOP_OFFLINE) mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - - /* clear online_type */ - mem->online_type = -1; + mem->online_type = MMOP_OFFLINE; return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 261dbf010d5d..c2e06ed5e0e9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -48,7 +48,7 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ - MMOP_OFFLINE = -1, + MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ MMOP_ONLINE, /* Online the memory to ZONE_NORMAL. */ -- cgit v1.2.3 From 862919e568356cc36288a11b42cd88ec3a7100e9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:40 -0700 Subject: mm/memory_hotplug: convert memhp_auto_online to store an online_type ... and rename it to memhp_default_online_type. This is a preparation for more detailed default online behavior. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-8-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 ++++------ include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 11 ++++++----- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1c90bdf60d85..7d2f829d00d7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -378,10 +378,8 @@ static DEVICE_ATTR_RO(block_size_bytes); static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (memhp_auto_online) - return sprintf(buf, "online\n"); - else - return sprintf(buf, "offline\n"); + return sprintf(buf, "%s\n", + online_type_to_str[memhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, @@ -389,9 +387,9 @@ static ssize_t auto_online_blocks_store(struct device *dev, const char *buf, size_t count) { if (sysfs_streq(buf, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; else return -EINVAL; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c2e06ed5e0e9..c6e090b34c4b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; -extern bool memhp_auto_online; +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9691cbb4383e..9436f7e6257a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,17 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { if (!strcmp(str, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (!strcmp(str, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; return 1; } @@ -990,6 +990,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); } @@ -1062,7 +1063,7 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_done(); /* online pages if requested */ - if (memhp_auto_online) + if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; -- cgit v1.2.3 From 5f47adf762b78cae97de58d9ff01d2d44db09467 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:44 -0700 Subject: mm/memory_hotplug: allow to specify a default online_type For now, distributions implement advanced udev rules to essentially - Don't online any hotplugged memory (s390x) - Online all memory to ZONE_NORMAL (e.g., most virt environments like hyperv) - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken care of (e.g., bare metal, special virt environments) In summary: All memory is usually onlined the same way, however, the kernel always has to ask user space to come up with the same answer. E.g., Hyper-V always waits for a memory block to get onlined before continuing, otherwise it might end up adding memory faster than onlining it, which can result in strange OOM situations. This waiting slows down adding of a bigger amount of memory. Let's allow to specify a default online_type, not just "online" and "offline". This allows distributions to configure the default online_type when booting up and be done with it. We can now specify "offline", "online", "online_movable" and "online_kernel" via - "memhp_default_state=" on the kernel cmdline - /sys/devices/system/memory/auto_online_blocks just like we are able to specify for a single memory block via /sys/devices/system/memory/memoryX/state Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-9-david@redhat.com Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 +++++------ include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7d2f829d00d7..dbec3a05590a 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,7 +34,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", }; -static int memhp_online_type_from_str(const char *str) +int memhp_online_type_from_str(const char *str) { int i; @@ -386,13 +386,12 @@ static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - memhp_default_online_type = MMOP_OFFLINE; - else + const int online_type = memhp_online_type_from_str(buf); + + if (online_type < 0) return -EINVAL; + memhp_default_online_type = online_type; return count; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c6e090b34c4b..ef55115320fb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; +extern int memhp_online_type_from_str(const char *str); + /* Default online_type (MMOP_*) when new memory blocks are added. */ extern int memhp_default_online_type; /* If movable_node boot option specified */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9436f7e6257a..2fb78c5ebaf3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -74,10 +74,10 @@ int memhp_default_online_type = MMOP_ONLINE; static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (!strcmp(str, "offline")) - memhp_default_online_type = MMOP_OFFLINE; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } -- cgit v1.2.3 From 96c6b598135e7cec66161e8943823470c7c8954e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:17 -0700 Subject: mm/memory_hotplug: drop the flags field from struct mhp_restrictions Patch series "Allow setting caching mode in arch_add_memory() for P2PDMA", v4. Currently, the page tables created using memremap_pages() are always created with the PAGE_KERNEL cacheing mode. However, the P2PDMA code is creating pages for PCI BAR memory which should never be accessed through the cache and instead use either WC or UC. This still works in most cases, on x86, because the MTRR registers typically override the caching settings in the page tables for all of the IO memory to be UC-. However, this tends not to work so well on other arches or some rare x86 machines that have firmware which does not setup the MTRR registers in this way. Instead of this, this series proposes a change to arch_add_memory() to take the pgprot required by the mapping which allows us to explicitly set pagetable entries for P2PDMA memory to UC. This changes is pretty routine for most of the arches: x86_64, arm64 and powerpc simply need to thread the pgprot through to where the page tables are setup. x86_32 unfortunately sets up the page tables at boot so must use _set_memory_prot() to change their caching mode. ia64, s390 and sh don't appear to have an easy way to change the page tables so, for now at least, we just return -EINVAL on such mappings and thus they will not support P2PDMA memory until the work for this is done. This should be fine as they don't yet support ZONE_DEVICE. This patch (of 7): This variable is not used anywhere and should therefore be removed from the structure. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Link: http://lkml.kernel.org/r/20200306170846.9333-2-logang@deltatee.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ef55115320fb..7c1bcff11672 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -59,11 +59,9 @@ enum { /* * Restrictions for the memory hotplug: - * flags: MHP_ flags * altmap: alternative allocator for memmap array */ struct mhp_restrictions { - unsigned long flags; struct vmem_altmap *altmap; }; -- cgit v1.2.3 From f5637d3b42ab0465ef71d5fb8461bce97fba95e8 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:21 -0700 Subject: mm/memory_hotplug: rename mhp_restrictions to mhp_params The mhp_restrictions struct really doesn't specify anything resembling a restriction anymore so rename it to be mhp_params as it is a list of extended parameters. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20200306170846.9333-3-logang@deltatee.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 4 ++-- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 6 +++--- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 8 ++++---- include/linux/memory_hotplug.h | 16 ++++++++-------- mm/memory_hotplug.c | 8 ++++---- mm/memremap.c | 8 ++++---- 10 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9b08f7c7e6f0..6d4e9c2b4ed0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1374,7 +1374,7 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) } int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { int ret, flags = 0; @@ -1387,7 +1387,7 @@ int arch_add_memory(int nid, u64 start, u64 size, memblock_clear_nomap(start, size); ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - restrictions); + params); if (ret) __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b01d68a2d5d9..97bbc23ea1e3 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -670,13 +670,13 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, restrictions); + ret = __add_pages(nid, start_pfn, nr_pages, params); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9b4f5fb719e0..e1cc58115816 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -122,7 +122,7 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, } int __ref arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -138,7 +138,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, restrictions); + return __add_pages(nid, start_pfn, nr_pages, params); } void __ref arch_remove_memory(int nid, u64 start, u64 size, diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ac44bd76db4b..e9e4a7abd0cc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -268,20 +268,20 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(restrictions->altmap)) + if (WARN_ON_ONCE(params->altmap)) return -EINVAL; rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, restrictions); + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d1b1ff2be17a..e5114c053364 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -406,14 +406,14 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, restrictions); + ret = __add_pages(nid, start_pfn, nr_pages, params); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index de73992b8432..d736c8625503 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -819,12 +819,12 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, restrictions); + return __add_pages(nid, start_pfn, nr_pages, params); } void arch_remove_memory(int nid, u64 start, u64 size, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0a14711d3a93..faa86a9a3b0d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -843,11 +843,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, restrictions); + ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -858,14 +858,14 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, } int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, restrictions); + return add_pages(nid, start_pfn, nr_pages, params); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7c1bcff11672..75f0f6304735 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,10 +58,10 @@ enum { }; /* - * Restrictions for the memory hotplug: - * altmap: alternative allocator for memmap array + * Extended parameters for memory hotplug: + * altmap: alternative allocator for memmap array (optional) */ -struct mhp_restrictions { +struct mhp_params { struct vmem_altmap *altmap; }; @@ -112,7 +112,7 @@ extern int restore_online_page_callback(online_page_callback_t callback); extern int try_online_node(int nid); extern int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions); + struct mhp_params *params); extern u64 max_mem_size; extern int memhp_online_type_from_str(const char *str); @@ -133,17 +133,17 @@ extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions); + struct mhp_params *params); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct mhp_restrictions *restrictions) + unsigned long nr_pages, struct mhp_params *params) { - return __add_pages(nid, start_pfn, nr_pages, restrictions); + return __add_pages(nid, start_pfn, nr_pages, params); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions); + struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 635e8e286598..fbfe7b40f552 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -304,12 +304,12 @@ static int check_hotplug_memory_addressable(unsigned long pfn, * add the new pages. */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; int err; - struct vmem_altmap *altmap = restrictions->altmap; + struct vmem_altmap *altmap = params->altmap; err = check_hotplug_memory_addressable(pfn, nr_pages); if (err) @@ -1002,7 +1002,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_restrictions restrictions = {}; + struct mhp_params params = {}; u64 start, size; bool new_node = false; int ret; @@ -1030,7 +1030,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, &restrictions); + ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; diff --git a/mm/memremap.c b/mm/memremap.c index bbf457c4f166..b0b5170843ff 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -184,7 +184,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; - struct mhp_restrictions restrictions = { + struct mhp_params params = { /* * We do not want any optional features only our own memmap */ @@ -302,7 +302,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), &restrictions); + PHYS_PFN(resource_size(res)), ¶ms); } else { error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); if (error) { @@ -311,7 +311,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } error = arch_add_memory(nid, res->start, resource_size(res), - &restrictions); + ¶ms); } if (!error) { @@ -319,7 +319,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), restrictions.altmap); + PHYS_PFN(resource_size(res)), params.altmap); } mem_hotplug_done(); -- cgit v1.2.3 From bfeb022f8fe4c5afdcfd7a3d868fac9765f9bcad Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:36 -0700 Subject: mm/memory_hotplug: add pgprot_t to mhp_params devm_memremap_pages() is currently used by the PCI P2PDMA code to create struct page mappings for IO memory. At present, these mappings are created with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on x86, an mtrr register will typically override this and force the cache type to be UC-. In the case firmware doesn't set this register it is effectively WB and will typically result in a machine check exception when it's accessed. Other arches are not currently likely to function correctly seeing they don't have any MTRR registers to fall back on. To solve this, provide a way to specify the pgprot value explicitly to arch_add_memory(). Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a simple change to pass the pgprot_t down to their respective functions which set up the page tables. For x86_32, set the page tables explicitly using _set_memory_prot() (seeing they are already mapped). For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this should be fine, for now, seeing these architectures don't support ZONE_DEVICE. A check in __add_pages() is also added to ensure the pgprot parameter was set for all arches. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Dan Williams Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 3 ++- arch/ia64/mm/init.c | 3 +++ arch/powerpc/mm/mem.c | 3 ++- arch/s390/mm/init.c | 3 +++ arch/sh/mm/init.c | 3 +++ arch/x86/mm/init_32.c | 12 ++++++++++++ arch/x86/mm/init_64.c | 2 +- include/linux/memory_hotplug.h | 3 +++ mm/memory_hotplug.c | 5 ++++- mm/memremap.c | 6 +++--- 10 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6d4e9c2b4ed0..a374e4f51a62 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1382,7 +1382,8 @@ int arch_add_memory(int nid, u64 start, u64 size, flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); + size, params->pgprot, __pgd_pgtable_alloc, + flags); memblock_clear_nomap(start, size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 97bbc23ea1e3..d637b4ea3147 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -676,6 +676,9 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long nr_pages = size >> PAGE_SHIFT; int ret; + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + return -EINVAL; + ret = __add_pages(nid, start_pfn, nr_pages, params); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index bf63ab04db63..041ed7cfd341 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -132,7 +132,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, resize_hpt_for_hotplug(memblock_phys_mem_size()); start = (unsigned long)__va(start); - rc = create_section_mapping(start, start + size, nid, PAGE_KERNEL); + rc = create_section_mapping(start, start + size, nid, + params->pgprot); if (rc) { pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n", start, start + size, rc); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index e9e4a7abd0cc..87b2d024e75a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -277,6 +277,9 @@ int arch_add_memory(int nid, u64 start, u64 size, if (WARN_ON_ONCE(params->altmap)) return -EINVAL; + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) + return -EINVAL; + rc = vmem_add_mapping(start, size); if (rc) return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e5114c053364..b9de2d4fa57e 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -412,6 +412,9 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long nr_pages = size >> PAGE_SHIFT; int ret; + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot) + return -EINVAL; + /* We only have ZONE_NORMAL, so this is easy.. */ ret = __add_pages(nid, start_pfn, nr_pages, params); if (unlikely(ret)) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac75a8397804..4222a010057a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -824,6 +824,18 @@ int arch_add_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + /* + * The page tables were already mapped at boot so if the caller + * requests a different mapping type then we must change all the + * pages with __set_memory_prot(). + */ + if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) { + ret = __set_memory_prot(start, nr_pages, params->pgprot); + if (ret) + return ret; + } return __add_pages(nid, start_pfn, nr_pages, params); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7480de743105..3b289c2f75cd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -867,7 +867,7 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - init_memory_mapping(start, start + size, PAGE_KERNEL); + init_memory_mapping(start, start + size, params->pgprot); return add_pages(nid, start_pfn, nr_pages, params); } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 75f0f6304735..93d9ada74ddd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -60,9 +60,12 @@ enum { /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) + * pgprot: page protection flags to apply to newly created page tables + * (required) */ struct mhp_params { struct vmem_altmap *altmap; + pgprot_t pgprot; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fbfe7b40f552..fc0aad0bc1f5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -311,6 +311,9 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, int err; struct vmem_altmap *altmap = params->altmap; + if (WARN_ON_ONCE(!params->pgprot.pgprot)) + return -EINVAL; + err = check_hotplug_memory_addressable(pfn, nr_pages); if (err) return err; @@ -1002,7 +1005,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_params params = {}; + struct mhp_params params = { .pgprot = PAGE_KERNEL }; u64 start, size; bool new_node = false; int ret; diff --git a/mm/memremap.c b/mm/memremap.c index b0b5170843ff..bc167cde3237 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -189,8 +189,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) * We do not want any optional features only our own memmap */ .altmap = pgmap_altmap(pgmap), + .pgprot = PAGE_KERNEL, }; - pgprot_t pgprot = PAGE_KERNEL; int error, is_ram; bool need_devmap_managed = true; @@ -282,8 +282,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, - resource_size(res)); + error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(res->start), + 0, resource_size(res)); if (error) goto err_pfn_remap; -- cgit v1.2.3