From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 16 Nov 2012 14:14:54 -0800 Subject: memcg: fix hotplugged memory zone oops When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exception. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. 
I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: Wen Congyang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 50aaca81f63d..a23923ba8263 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -752,7 +752,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size, enum memmap_context context); -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); +extern void lruvec_init(struct lruvec *lruvec); static inline struct zone *lruvec_zone(struct lruvec *lruvec) { -- cgit v1.2.3 From 2ca3cb50edc351875df13d083524f524cdeb3054 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 16 Nov 2012 14:14:56 -0800 Subject: rapidio: fix kernel-doc warnings Fix rapidio kernel-doc warnings: Warning(drivers/rapidio/rio.c:415): No description found for parameter 'local' Warning(drivers/rapidio/rio.c:415): Excess function parameter 'lstart' description in 'rio_map_inb_region' Warning(include/linux/rio.h:290): No description found for parameter 'switches' Warning(include/linux/rio.h:290): No description found for parameter 'destid_table' Signed-off-by: Randy Dunlap Cc: Matt Porter Acked-by: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 2 +- include/linux/rio.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index c17ae22567e0..0c6fcb461faf 100644 --- 
a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -401,7 +401,7 @@ EXPORT_SYMBOL_GPL(rio_release_inb_pwrite); /** * rio_map_inb_region -- Map inbound memory region. * @mport: Master port. - * @lstart: physical address of memory region to be mapped + * @local: physical address of memory region to be mapped * @rbase: RIO base address assigned to this window * @size: Size of the memory region * @rflags: Flags for mapping. diff --git a/include/linux/rio.h b/include/linux/rio.h index 4187da511006..a3e784278667 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -275,9 +275,11 @@ struct rio_id_table { * struct rio_net - RIO network info * @node: Node in global list of RIO networks * @devices: List of devices in this network + * @switches: List of switches in this netowrk * @mports: List of master ports accessing this network * @hport: Default port for accessing this network * @id: RIO network ID + * @destid_table: destID allocation table */ struct rio_net { struct list_head node; /* node in list of networks */ -- cgit v1.2.3 From 5576646f3c1abd60d72d19829de6f5d8c2ca8ecf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 16 Nov 2012 14:15:06 -0800 Subject: revert "mm: fix-up zone present pages" Revert commit 7f1290f2f2a4 ("mm: fix-up zone present pages") That patch tried to fix an issue when calculating zone->present_pages, but it caused a regression on 32bit systems with HIGHMEM. With that change, reset_zone_present_pages() resets all zone->present_pages to zero, and fixup_zone_present_pages() is called to recalculate zone->present_pages when the boot allocator frees core memory pages into the buddy allocator. Because highmem pages are not freed by the bootmem allocator, all highmem zones' present_pages become zero. Various options for improving the situation are being discussed but for now, let's return to the 3.6 code. 
Cc: Jianguo Wu Cc: Jiang Liu Cc: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Acked-by: David Rientjes Tested-by: Chris Clayton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 1 - include/linux/mm.h | 4 ---- mm/bootmem.c | 10 +--------- mm/memory_hotplug.c | 7 ------- mm/nobootmem.c | 3 --- mm/page_alloc.c | 34 ---------------------------------- 6 files changed, 1 insertion(+), 58 deletions(-) (limited to 'include') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index acd5b68e8871..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -637,7 +637,6 @@ mem_init (void) high_memory = __va(max_low_pfn * PAGE_SIZE); - reset_zone_present_pages(); for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) totalram_pages += free_all_bootmem_node(pgdat); diff --git a/include/linux/mm.h b/include/linux/mm.h index fa0680402738..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1684,9 +1684,5 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ -extern void reset_zone_present_pages(void); -extern void fixup_zone_present_pages(int nid, unsigned long start_pfn, - unsigned long end_pfn); - #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index 434be4ae7a04..f468185b3b28 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); - fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), - start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + 
off); __free_pages_bootmem(page, 0); - fixup_zone_present_pages( - page_to_nid(page), - start + off, start + off + 1); count++; } vec >>= 1; @@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) { - fixup_zone_present_pages(page_to_nid(page), - page_to_pfn(page), page_to_pfn(page) + 1); + while (pages--) __free_pages_bootmem(page++, 0); - } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 56b758ae57d2..e4eeacae2b91 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; - struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); - - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages++; - zone_span_writeunlock(zone); - totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 714d5d650470..bd82f6b31411 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); - fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), - start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; - reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c91598b1b4c0..7bb35ac0964a 100644 --- 
a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,37 +6098,3 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } - -/* reset zone->present_pages */ -void reset_zone_present_pages(void) -{ - struct zone *z; - int i, nid; - - for_each_node_state(nid, N_HIGH_MEMORY) { - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - z->present_pages = 0; - } - } -} - -/* calculate zone's present pages in buddy system */ -void fixup_zone_present_pages(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - struct zone *z; - unsigned long zone_start_pfn, zone_end_pfn; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) { - z = NODE_DATA(nid)->node_zones + i; - zone_start_pfn = z->zone_start_pfn; - zone_end_pfn = zone_start_pfn + z->spanned_pages; - - /* if the two regions intersect */ - if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) - z->present_pages += min(end_pfn, zone_end_pfn) - - max(start_pfn, zone_start_pfn); - } -} -- cgit v1.2.3