From 4f7c6b49c45a398d72763d1f0e64ddff8b3653c7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 6 Aug 2014 16:05:13 -0700 Subject: mem-hotplug: introduce MMOP_OFFLINE to replace the hard coding -1 In store_mem_state(), we have: ... 334 else if (!strncmp(buf, "offline", min_t(int, count, 7))) 335 online_type = -1; ... 355 case -1: 356 ret = device_offline(&mem->dev); 357 break; ... Here, "offline" is hard coded as -1. This patch does the following renaming: ONLINE_KEEP -> MMOP_ONLINE_KEEP ONLINE_KERNEL -> MMOP_ONLINE_KERNEL ONLINE_MOVABLE -> MMOP_ONLINE_MOVABLE and introduces MMOP_OFFLINE = -1 to avoid hard coding. Signed-off-by: Tang Chen Cc: Hu Tao Cc: Greg Kroah-Hartman Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010d125bffbf..79dd9eca054f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -26,11 +26,12 @@ enum { MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; -/* Types for control the zone type of onlined memory */ +/* Types for control the zone type of onlined and offlined memory */ enum { - ONLINE_KEEP, - ONLINE_KERNEL, - ONLINE_MOVABLE, + MMOP_OFFLINE = -1, + MMOP_ONLINE_KEEP, + MMOP_ONLINE_KERNEL, + MMOP_ONLINE_MOVABLE, }; /* -- cgit v1.2.3 From 6326440077a48d2c3b2993f3b3f2d969f09b6917 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:36 -0700 Subject: memory-hotplug: add zone_for_memory() for selecting zone for new memory This series of patches fixes a problem when adding memory in bad manner. For example: for a x86_64 machine booted with "mem=400M" and with 2GiB memory installed, following commands cause problem: # echo 0x40000000 > /sys/devices/system/memory/probe [ 28.613895] init_memory_mapping: [mem 0x40000000-0x47ffffff] # echo 0x48000000 > /sys/devices/system/memory/probe [ 28.693675] init_memory_mapping: [mem 0x48000000-0x4fffffff] # echo online_movable > /sys/devices/system/memory/memory9/state # echo 0x50000000 > /sys/devices/system/memory/probe [ 29.084090] init_memory_mapping: [mem 0x50000000-0x57ffffff] # echo 0x58000000 > /sys/devices/system/memory/probe [ 29.151880] init_memory_mapping: [mem 0x58000000-0x5fffffff] # echo online_movable > /sys/devices/system/memory/memory11/state # echo online> /sys/devices/system/memory/memory8/state # echo online> /sys/devices/system/memory/memory10/state # echo offline> /sys/devices/system/memory/memory9/state [ 30.558819] Offlined Pages 32768 # free total used free shared buffers cached Mem: 780588 18014398509432020 830552 0 0 51180 -/+ buffers/cache: 18014398509380840 881732 Swap: 0 0 0 This is because the above commands probe higher memory after online a section with online_movable, which causes ZONE_HIGHMEM (or ZONE_NORMAL for systems without ZONE_HIGHMEM) overlaps ZONE_MOVABLE. After the second online_movable, the problem can be observed from zoneinfo: # cat /proc/zoneinfo ... Node 0, zone Movable pages free 65491 min 250 low 312 high 375 scanned 0 spanned 18446744073709518848 present 65536 managed 65536 ... This series of patches solve the problem by checking ZONE_MOVABLE when choosing zone for new memory. If new memory is inside or higher than ZONE_MOVABLE, makes it go there instead. After applying this series of patches, following are free and zoneinfo result (after offlining memory9): bash-4.2# free total used free shared buffers cached Mem: 780956 80112 700844 0 0 51180 -/+ buffers/cache: 28932 752024 Swap: 0 0 0 bash-4.2# cat /proc/zoneinfo Node 0, zone DMA pages free 3389 min 14 low 17 high 21 scanned 0 spanned 4095 present 3998 managed 3977 nr_free_pages 3389 ... start_pfn: 1 inactive_ratio: 1 Node 0, zone DMA32 pages free 73724 min 341 low 426 high 511 scanned 0 spanned 98304 present 98304 managed 92958 nr_free_pages 73724 ... start_pfn: 4096 inactive_ratio: 1 Node 0, zone Normal pages free 32630 min 120 low 150 high 180 scanned 0 spanned 32768 present 32768 managed 32768 nr_free_pages 32630 ... start_pfn: 262144 inactive_ratio: 1 Node 0, zone Movable pages free 65476 min 241 low 301 high 361 scanned 0 spanned 98304 present 65536 managed 65536 nr_free_pages 65476 ... start_pfn: 294912 inactive_ratio: 1 This patch (of 7): Introduce zone_for_memory() in arch independent code for arch_add_memory() use. Many arch_add_memory() function simply selects ZONE_HIGHMEM or ZONE_NORMAL and add new memory into it. However, with the existance of ZONE_MOVABLE, the selection method should be carefully considered: if new, higher memory is added after ZONE_MOVABLE is setup, the default zone and ZONE_MOVABLE may overlap each other. should_add_memory_movable() checks the status of ZONE_MOVABLE. If it has already contain memory, compare the address of new memory and movable memory. If new memory is higher than movable, it should be added into ZONE_MOVABLE instead of default zone. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 79dd9eca054f..d9524c49d767 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -259,6 +259,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a3797d3fd8a4..2ff8c2325e96 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1159,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone. + */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.3 From ed2f240094f900833ac06f533ab8bbcf0a1e8199 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Thu, 9 Oct 2014 15:26:31 -0700 Subject: memory-hotplug: add sysfs valid_zones attribute Currently memory-hotplug has two limits: 1. If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE, but this memory block must be adjacent to ZONE_MOVABLE. 2. If the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL, but this memory block must be adjacent to ZONE_NORMAL. With this patch, we can easy to know a memory block can be onlined to which zone, and don't need to know the above two limits. Updated the related Documentation. [akpm@linux-foundation.org: use conventional comment layout] [akpm@linux-foundation.org: fix build with CONFIG_MEMORY_HOTREMOVE=n] [akpm@linux-foundation.org: remove unused local zone_prev] Signed-off-by: Zhang Zhen Cc: Dave Hansen Cc: David Rientjes Cc: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Naoya Horiguchi Cc: Wang Nan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-devices-memory | 8 +++++ Documentation/memory-hotplug.txt | 11 ++++++- drivers/base/memory.c | 42 ++++++++++++++++++++++++++ include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 2 +- 5 files changed, 62 insertions(+), 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 7405de26ee60..deef3b5723cf 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -61,6 +61,14 @@ Users: hotplug memory remove tools http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils +What: /sys/devices/system/memory/memoryX/valid_zones +Date: July 2014 +Contact: Zhang Zhen +Description: + The file /sys/devices/system/memory/memoryX/valid_zones is + read-only and is designed to show which zone this memory + block can be onlined to. + What: /sys/devices/system/memoryX/nodeY Date: October 2009 Contact: Linux Memory Management list diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 45134dc23854..ea03abfc97e9 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -155,6 +155,7 @@ Under each memory block, you can see 4 files: /sys/devices/system/memory/memoryXXX/phys_device /sys/devices/system/memory/memoryXXX/state /sys/devices/system/memory/memoryXXX/removable +/sys/devices/system/memory/memoryXXX/valid_zones 'phys_index' : read-only and contains memory block id, same as XXX. 'state' : read-write @@ -170,6 +171,15 @@ Under each memory block, you can see 4 files: block is removable and a value of 0 indicates that it is not removable. A memory block is removable only if every section in the block is removable. +'valid_zones' : read-only: designed to show which zones this memory block + can be onlined to. + The first column shows it's default zone. + "memory6/valid_zones: Normal Movable" shows this memoryblock + can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE + by online_movable. + "memory7/valid_zones: Movable Normal" shows this memoryblock + can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL + by online_kernel. NOTE: These directories/files appear after physical memory hotplug phase. @@ -408,7 +418,6 @@ node if necessary. - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. - showing memory block and physical device relationship. - - showing memory block is under ZONE_MOVABLE or not - test and make it better memory offlining. - support HugeTLB page migration and offlining. - memmap removing at memory offline. diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a2e13e250bba..7c5d87191b28 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -373,6 +373,45 @@ static ssize_t show_phys_device(struct device *dev, return sprintf(buf, "%d\n", mem->phys_device); } +#ifdef CONFIG_MEMORY_HOTREMOVE +static ssize_t show_valid_zones(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct memory_block *mem = to_memory_block(dev); + unsigned long start_pfn, end_pfn; + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + struct page *first_page; + struct zone *zone; + + start_pfn = section_nr_to_pfn(mem->start_section_nr); + end_pfn = start_pfn + nr_pages; + first_page = pfn_to_page(start_pfn); + + /* The block contains more than one zone can not be offlined. */ + if (!test_pages_in_a_zone(start_pfn, end_pfn)) + return sprintf(buf, "none\n"); + + zone = page_zone(first_page); + + if (zone_idx(zone) == ZONE_MOVABLE - 1) { + /*The mem block is the last memoryblock of this zone.*/ + if (end_pfn == zone_end_pfn(zone)) + return sprintf(buf, "%s %s\n", + zone->name, (zone + 1)->name); + } + + if (zone_idx(zone) == ZONE_MOVABLE) { + /*The mem block is the first memoryblock of ZONE_MOVABLE.*/ + if (start_pfn == zone->zone_start_pfn) + return sprintf(buf, "%s %s\n", + zone->name, (zone - 1)->name); + } + + return sprintf(buf, "%s\n", zone->name); +} +static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); +#endif + static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); @@ -523,6 +562,9 @@ static struct attribute *memory_memblk_attrs[] = { &dev_attr_state.attr, &dev_attr_phys_device.attr, &dev_attr_removable.attr, +#ifdef CONFIG_MEMORY_HOTREMOVE + &dev_attr_valid_zones.attr, +#endif NULL }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d9524c49d767..8f1a41951df9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -84,6 +84,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long, int); +extern int test_pages_in_a_zone(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); typedef void (*online_page_callback_t)(struct page *page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2ff8c2325e96..29d8693d0c61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* * Confirm all pages in a range [start, end) is belongs to the same zone. */ -static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct zone *zone = NULL; -- cgit v1.2.3