Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  533
1 file changed, 417 insertions, 116 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
+char * const migratetype_names[MIGRATE_TYPES] = {
+	"Unmovable",
+	"Movable",
+	"Reclaimable",
+	"HighAtomic",
+#ifdef CONFIG_CMA
+	"CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+	"Isolate",
+#endif
+};
+
 compound_page_dtor * const compound_page_dtors[] = {
 	NULL,
 	free_compound_page,
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 				unsigned long pfn, unsigned long zone_end,
 				unsigned long *nr_initialised)
 {
+	unsigned long max_initialise;
+
 	/* Always populate low zones for address-contrained allocations */
 	if (zone_end < pgdat_end_pfn(pgdat))
 		return true;
+	/*
+	 * Initialise at least 2G of a node but also take into account that
+	 * two large system hashes that can take up 1GB for 0.25TB/node.
+	 */
+	max_initialise = max(2UL << (30 - PAGE_SHIFT),
+		(pgdat->node_spanned_pages >> 8));
 
-	/* Initialise at least 2G of the highest zone */
 	(*nr_initialised)++;
-	if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+	if ((*nr_initialised > max_initialise) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		pgdat->first_deferred_pfn = pfn;
 		return false;
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason,
 			goto out;
 		}
 		if (nr_unshown) {
-			printk(KERN_ALERT
+			pr_alert(
 			      "BUG: Bad page state: %lu messages suppressed\n",
 				nr_unshown);
 			nr_unshown = 0;
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason,
 	if (nr_shown++ == 0)
 		resume = jiffies + 60 * HZ;
 
-	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
+	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
 		current->comm, page_to_pfn(page));
-	dump_page_badflags(page, reason, bad_flags);
+	__dump_page(page, reason);
+	bad_flags &= page->flags;
+	if (bad_flags)
+		pr_alert("bad because of flags: %#lx(%pGp)\n",
+						bad_flags, &bad_flags);
+	dump_page_owner(page);
 
 	print_modules();
 	dump_stack();
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order)
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
 bool _debug_guardpage_enabled __read_mostly;
 
 static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf)
 	if (strcmp(buf, "on") == 0)
 		_debug_pagealloc_enabled = true;
 
+	if (strcmp(buf, "off") == 0)
+		_debug_pagealloc_enabled = false;
+
 	return 0;
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
 	unsigned long res;
 
 	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
-		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+		pr_err("Bad debug_guardpage_minorder value\n");
 		return 0;
 	}
 	_debug_guardpage_minorder = res;
-	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
 	return 0;
 }
 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page,
 	unsigned long combined_idx;
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
-	unsigned int max_order = MAX_ORDER;
+	unsigned int max_order;
+
+	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
 
 	VM_BUG_ON(!zone_is_initialized(zone));
 	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 
 	VM_BUG_ON(migratetype == -1);
-	if (is_migrate_isolate(migratetype)) {
-		/*
-		 * We restrict max order of merging to prevent merge
-		 * between freepages on isolate pageblock and normal
-		 * pageblock. Without this, pageblock isolation
-		 * could cause incorrect freepage accounting.
-		 */
-		max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
-	} else {
+	if (likely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
-	}
 
-	page_idx = pfn & ((1 << max_order) - 1);
+	page_idx = pfn & ((1 << MAX_ORDER) - 1);
 
 	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
+continue_merging:
 	while (order < max_order - 1) {
 		buddy_idx = __find_buddy_index(page_idx, order);
 		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
-			break;
+			goto done_merging;
 		/*
 		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 		 * merge with it and move up one order.
@@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page,
 		page_idx = combined_idx;
 		order++;
 	}
+	if (max_order < MAX_ORDER) {
+		/* If we are here, it means order is >= pageblock_order.
+		 * We want to prevent merge between freepages on isolate
+		 * pageblock and normal pageblock. Without this, pageblock
+		 * isolation could cause incorrect freepage or CMA accounting.
+		 *
+		 * We don't want to hit this code for the more frequent
+		 * low-order merging.
+		 */
+		if (unlikely(has_isolate_pageblock(zone))) {
+			int buddy_mt;
+
+			buddy_idx = __find_buddy_index(page_idx, order);
+			buddy = page + (buddy_idx - page_idx);
+			buddy_mt = get_pageblock_migratetype(buddy);
+
+			if (migratetype != buddy_mt
+					&& (is_migrate_isolate(migratetype) ||
+						is_migrate_isolate(buddy_mt)))
+				goto done_merging;
+		}
+		max_order++;
+		goto continue_merging;
+	}
+
+done_merging:
 	set_page_order(page, order);
 
 	/*
@@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page)
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
-	if (unlikely(atomic_read(&page->_count) != 0))
+	if (unlikely(page_ref_count(page) != 0))
 		bad_reason = "nonzero _count";
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 					   PAGE_SIZE << order);
 	}
 	arch_free_page(page, order);
+	kernel_poison_pages(page, 1 << order, 0);
 	kernel_map_pages(page, 1 << order, 0);
 
 	return true;
@@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 	return __free_pages_boot_core(page, pfn, order);
 }
 
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+				     unsigned long end_pfn, struct zone *zone)
+{
+	struct page *start_page;
+	struct page *end_page;
+
+	/* end_pfn is one past the range we are checking */
+	end_pfn--;
+
+	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+		return NULL;
+
+	start_page = pfn_to_page(start_pfn);
+
+	if (page_zone(start_page) != zone)
+		return NULL;
+
+	end_page = pfn_to_page(end_pfn);
+
+	/* This gives a shorter code than deriving page_zone(end_page) */
+	if (page_zone_id(start_page) != page_zone_id(end_page))
+		return NULL;
+
+	return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+	unsigned long block_start_pfn = zone->zone_start_pfn;
+	unsigned long block_end_pfn;
+
+	block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+	for (; block_start_pfn < zone_end_pfn(zone);
+			block_start_pfn = block_end_pfn,
+			 block_end_pfn += pageblock_nr_pages) {
+
+		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+		if (!__pageblock_pfn_to_page(block_start_pfn,
+					     block_end_pfn, zone))
+			return;
+	}
+
+	/* We confirm that there is no hole */
+	zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+	zone->contiguous = false;
+}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __init deferred_free_range(struct page *page,
 					unsigned long pfn, int nr_pages)
@@ -1254,9 +1376,13 @@ free_range:
 	pgdat_init_report_one_done();
 	return 0;
 }
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)
 {
+	struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 	int nid;
 
 	/* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void)
 
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
+#endif
+
+	for_each_populated_zone(zone)
+		set_zone_contiguous(zone);
 }
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 #ifdef CONFIG_CMA
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
-	if (unlikely(atomic_read(&page->_count) != 0))
+	if (unlikely(page_ref_count(page) != 0))
 		bad_reason = "nonzero _count";
 	if (unlikely(page->flags & __PG_HWPOISON)) {
 		bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page)
 	return 0;
 }
 
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+		page_poisoning_enabled() && poisoned;
+}
+
 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 								int alloc_flags)
 {
 	int i;
+	bool poisoned = true;
 
 	for (i = 0; i < (1 << order); i++) {
 		struct page *p = page + i;
 		if (unlikely(check_new_page(p)))
 			return 1;
+		if (poisoned)
+			poisoned &= page_is_poisoned(p);
 	}
 
 	set_page_private(page, 0);
@@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
+	kernel_poison_pages(page, 1 << order, 1);
 	kasan_alloc_pages(page, order);
 
-	if (gfp_flags & __GFP_ZERO)
+	if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
@@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
-		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
-			/*
-			 * __GFP_NOFAIL is not to be used in new code.
-			 *
-			 * All __GFP_NOFAIL callers should be fixed so that they
-			 * properly detect and handle allocation failures.
-			 *
-			 * We most definitely don't want callers attempting to
-			 * allocate greater than order-1 page units with
-			 * __GFP_NOFAIL.
-			 */
-			WARN_ON_ONCE(order > 1);
-		}
+		/*
+		 * We most definitely don't want callers attempting to
+		 * allocate greater than order-1 page units with __GFP_NOFAIL.
+		 */
+		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 		spin_lock_irqsave(&zone->lock, flags);
 		page = NULL;
@@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
 		va_end(args);
 	}
 
-	pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
-		current->comm, order, gfp_mask);
-
+	pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+		current->comm, order, gfp_mask, &gfp_mask);
 	dump_stack();
 	if (!should_suppress_show_mem())
 		show_mem(filter);
@@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			 * XXX: Page reclaim didn't yield anything,
 			 * and the OOM killer can't be invoked, but
 			 * keep looping as per tradition.
+			 *
+			 * But do not keep looping if oom_killer_disable()
+			 * was already called, for the system is trying to
+			 * enter a quiescent state during suspend.
 			 */
-			*did_some_progress = 1;
+			*did_some_progress = !oom_killer_disabled;
 			goto out;
 		}
 		if (pm_suspended_storage())
@@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
 		gfp_mask &= ~__GFP_ATOMIC;
 
-	/*
-	 * If this allocation cannot block and it is for a specific node, then
-	 * fail early.  There's no need to wakeup kswapd or retry for a
-	 * speculative node-specific allocation.
-	 */
-	if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
-		goto nopage;
-
 retry:
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
@@ -3372,7 +3498,7 @@ refill:
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		atomic_add(size - 1, &page->_count);
+		page_ref_add(page, size - 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3384,7 +3510,7 @@ refill:
 	if (unlikely(offset < 0)) {
 		page = virt_to_page(nc->va);
 
-		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
 			goto refill;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3392,7 +3518,7 @@ refill:
 		size = nc->size;
 #endif
 		/* OK, page count is 0, we can safely set it */
-		atomic_set(&page->_count, size);
+		set_page_count(page, size);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = size;
@@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone)
 		printk("Node %d ", zone_to_nid(zone));
 }
 
+long si_mem_available(void)
+{
+	long available;
+	unsigned long pagecache;
+	unsigned long wmark_low = 0;
+	unsigned long pages[NR_LRU_LISTS];
+	struct zone *zone;
+	int lru;
+
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+	for_each_zone(zone)
+		wmark_low += zone->watermark[WMARK_LOW];
+
+	/*
+	 * Estimate the amount of memory available for userspace allocations,
+	 * without causing swapping.
+	 */
+	available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+	/*
+	 * Not all the page cache can be freed, otherwise the system will
+	 * start swapping. Assume at least half of the page cache, or the
+	 * low watermark worth of cache, needs to stay.
+	 */
+	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+	pagecache -= min(pagecache / 2, wmark_low);
+	available += pagecache;
+
+	/*
+	 * Part of the reclaimable slab consists of items that are in use,
+	 * and cannot be freed. Cap this estimate at the low watermark.
+	 */
+	available += global_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+	if (available < 0)
+		available = 0;
+	return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
 void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
@@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s)
 	} else if (*s == 'z' || *s == 'Z') {
 		user_zonelist_order = ZONELIST_ORDER_ZONE;
 	} else {
-		printk(KERN_WARNING
-			"Ignoring invalid numa_zonelist_order value:  "
-			"%s\n", s);
+		pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
 		return -EINVAL;
 	}
 	return 0;
@@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 	else
 		page_group_by_mobility_disabled = 0;
 
-	pr_info("Built %i zonelists in %s order, mobility grouping %s.  "
-		"Total pages: %ld\n",
-			nr_online_nodes,
-			zonelist_order_name[current_zonelist_order],
-			page_group_by_mobility_disabled ? "off" : "on",
-			vm_total_pages);
+	pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+		nr_online_nodes,
+		zonelist_order_name[current_zonelist_order],
+		page_group_by_mobility_disabled ? "off" : "on",
+		vm_total_pages);
 #ifdef CONFIG_NUMA
 	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
 #endif
@@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long pfn;
 	unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+	struct memblock_region *r = NULL, *tmp;
+#endif
 
 	if (highest_memmap_pfn < end_pfn - 1)
 		highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*
-		 * There can be holes in boot-time mem_map[]s
-		 * handed to this function.  They do not
-		 * exist on hotplugged memory.
+		 * There can be holes in boot-time mem_map[]s handed to this
+		 * function.  They do not exist on hotplugged memory.
+		 */
+		if (context != MEMMAP_EARLY)
+			goto not_early;
+
+		if (!early_pfn_valid(pfn))
+			continue;
+		if (!early_pfn_in_nid(pfn, nid))
+			continue;
+		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+			break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+		/*
+		 * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
+		 * from zone_movable_pfn[nid] to end of each node should be
+		 * ZONE_MOVABLE not ZONE_NORMAL. skip it.
 		 */
-		if (context == MEMMAP_EARLY) {
-			if (!early_pfn_valid(pfn))
+		if (!mirrored_kernelcore && zone_movable_pfn[nid])
+			if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
 				continue;
-			if (!early_pfn_in_nid(pfn, nid))
+
+		/*
+		 * Check given memblock attribute by firmware which can affect
+		 * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
+		 * mirrored, it's an overlapped memmap init. skip it.
+		 */
+		if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+			if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+				for_each_memblock(memory, tmp)
+					if (pfn < memblock_region_memory_end_pfn(tmp))
+						break;
+				r = tmp;
+			}
+			if (pfn >= memblock_region_memory_base_pfn(r) &&
+			    memblock_is_mirror(r)) {
+				/* already initialized as NORMAL */
+				pfn = memblock_region_memory_end_pfn(r);
 				continue;
-			if (!update_defer_init(pgdat, pfn, end_pfn,
-						&nr_initialised))
-				break;
+			}
 		}
+#endif
 
+not_early:
 		/*
 		 * Mark the block movable so that blocks are reserved for
 		 * movable at startup. This will force kernel allocations
@@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
 			*zone_end_pfn = min(node_end_pfn,
 				arch_zone_highest_possible_pfn[movable_zone]);
 
-		/* Adjust for ZONE_MOVABLE starting within this range */
-		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
-				*zone_end_pfn > zone_movable_pfn[nid]) {
-			*zone_end_pfn = zone_movable_pfn[nid];
-
 		/* Check if this whole range is within ZONE_MOVABLE */
 		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
 			*zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn,
 					unsigned long *ignored)
 {
-	unsigned long zone_start_pfn, zone_end_pfn;
-
 	/* When hotadd a new node from cpu_up(), the node should be empty */
 	if (!node_start_pfn && !node_end_pfn)
 		return 0;
 
 	/* Get the start and end of the zone */
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	*zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	*zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
 	adjust_zone_range_for_zone_movable(nid, zone_type,
 				node_start_pfn, node_end_pfn,
-				&zone_start_pfn, &zone_end_pfn);
+				zone_start_pfn, zone_end_pfn);
 
 	/* Check that this node has pages within the zone's required range */
-	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
 		return 0;
 
 	/* Move the zone boundaries inside the node if necessary */
-	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
 	/* Return the spanned pages */
-	return zone_end_pfn - zone_start_pfn;
+	return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
 	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
 	unsigned long zone_start_pfn, zone_end_pfn;
+	unsigned long nr_absent;
 
 	/* When hotadd a new node from cpu_up(), the node should be empty */
 	if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	adjust_zone_range_for_zone_movable(nid, zone_type,
 			node_start_pfn, node_end_pfn,
 			&zone_start_pfn, &zone_end_pfn);
-	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+	/*
+	 * ZONE_MOVABLE handling.
+	 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+	 * and vice versa.
+	 */
+	if (zone_movable_pfn[nid]) {
+		if (mirrored_kernelcore) {
+			unsigned long start_pfn, end_pfn;
+			struct memblock_region *r;
+
+			for_each_memblock(memory, r) {
+				start_pfn = clamp(memblock_region_memory_base_pfn(r),
+						  zone_start_pfn, zone_end_pfn);
+				end_pfn = clamp(memblock_region_memory_end_pfn(r),
+						zone_start_pfn, zone_end_pfn);
+
+				if (zone_type == ZONE_MOVABLE &&
+				    memblock_is_mirror(r))
+					nr_absent += end_pfn - start_pfn;
+
+				if (zone_type == ZONE_NORMAL &&
+				    !memblock_is_mirror(r))
+					nr_absent += end_pfn - start_pfn;
+			}
+		} else {
+			if (zone_type == ZONE_NORMAL)
+				nr_absent += node_end_pfn - zone_movable_pfn[nid];
+		}
+	}
+
+	return nr_absent;
 }
 
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn,
 					unsigned long *zones_size)
 {
+	unsigned int zone;
+
+	*zone_start_pfn = node_start_pfn;
+	for (zone = 0; zone < zone_type; zone++)
+		*zone_start_pfn += zones_size[zone];
+
+	*zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
 	return zones_size[zone_type];
 }
 
@@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
 
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *zone = pgdat->node_zones + i;
+		unsigned long zone_start_pfn, zone_end_pfn;
 		unsigned long size, real_size;
 
 		size = zone_spanned_pages_in_node(pgdat->node_id, i,
 						  node_start_pfn,
 						  node_end_pfn,
+						  &zone_start_pfn,
+						  &zone_end_pfn,
 						  zones_size);
 		real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
 						  node_start_pfn, node_end_pfn,
 						  zholes_size);
+		if (size)
+			zone->zone_start_pfn = zone_start_pfn;
+		else
+			zone->zone_start_pfn = 0;
 		zone->spanned_pages = size;
 		zone->present_pages = real_size;
@@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
 	enum zone_type j;
 	int nid = pgdat->node_id;
-	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 	int ret;
 
 	pgdat_resize_init(pgdat);
@@ -5217,11 +5461,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 #endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+	init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
 	pgdat_page_ext_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, freesize, memmap_pages;
+		unsigned long zone_start_pfn = zone->zone_start_pfn;
 
 		size = zone->spanned_pages;
 		realsize = freesize = zone->present_pages;
@@ -5240,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 					       "  %s zone: %lu pages used for memmap\n",
 					       zone_names[j], memmap_pages);
 			} else
-				printk(KERN_WARNING
-					"  %s zone: %lu pages exceeds freesize %lu\n",
+				pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
 					zone_names[j], memmap_pages, freesize);
 		}
 
@@ -5290,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
 		BUG_ON(ret);
 		memmap_init(size, nid, j, zone_start_pfn);
-		zone_start_pfn += size;
 	}
 }
 
@@ -5358,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
 		(u64)start_pfn << PAGE_SHIFT,
 		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+	start_pfn = node_start_pfn;
 #endif
 	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
 				  zones_size, zholes_size);
@@ -5448,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
 		min_pfn = min(min_pfn, start_pfn);
 
 	if (min_pfn == ULONG_MAX) {
-		printk(KERN_WARNING
-			"Could not find start_pfn for node %d\n", nid);
+		pr_warn("Could not find start_pfn for node %d\n", nid);
 		return 0;
 	}
 
@@ -5529,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	}
 
 	/*
+	 * If kernelcore=mirror is specified, ignore movablecore option
+	 */
+	if (mirrored_kernelcore) {
+		bool mem_below_4gb_not_mirrored = false;
+
+		for_each_memblock(memory, r) {
+			if (memblock_is_mirror(r))
+				continue;
+
+			nid = r->nid;
+
+			usable_startpfn = memblock_region_memory_base_pfn(r);
+
+			if (usable_startpfn < 0x100000) {
+				mem_below_4gb_not_mirrored = true;
+				continue;
+			}
+
+			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+				min(usable_startpfn, zone_movable_pfn[nid]) :
+				usable_startpfn;
+		}
+
+		if (mem_below_4gb_not_mirrored)
+			pr_warn("This configuration results in unmirrored kernel memory.");
+
+		goto out2;
+	}
+
+	/*
 	 * If movablecore=nn[KMG] was specified, calculate what size of
 	 * kernelcore that corresponds so that memory usable for
 	 * any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
  */
 static int __init cmdline_parse_kernelcore(char *p)
 {
+	/* parse kernelcore=mirror */
+	if (parse_option_str(p, "mirror")) {
+		mirrored_kernelcore = true;
+		return 0;
+	}
+
 	return cmdline_parse_core(p, &required_kernelcore);
 }
 
@@ -5885,22 +6168,21 @@ void __init mem_init_print_info(const char *str)
 
 #undef	adj_init_size
 
-	pr_info("Memory: %luK/%luK available "
-	       "(%luK kernel code, %luK rwdata, %luK rodata, "
-	       "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef	CONFIG_HIGHMEM
-	       ", %luK highmem"
+		", %luK highmem"
 #endif
-	       "%s%s)\n",
-	       nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
-	       codesize >> 10, datasize >> 10, rosize >> 10,
-	       (init_data_size + init_code_size) >> 10, bss_size >> 10,
-	       (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
-	       totalcma_pages << (PAGE_SHIFT-10),
+		"%s%s)\n",
+		nr_free_pages() << (PAGE_SHIFT - 10),
+		physpages << (PAGE_SHIFT - 10),
+		codesize >> 10, datasize >> 10, rosize >> 10,
+		(init_data_size + init_code_size) >> 10, bss_size >> 10,
+		(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+		totalcma_pages << (PAGE_SHIFT - 10),
 #ifdef	CONFIG_HIGHMEM
-	       totalhigh_pages << (PAGE_SHIFT-10),
+		totalhigh_pages << (PAGE_SHIFT - 10),
 #endif
-	       str ? ", " : "", str ? str : "");
+		str ? ", " : "", str ? str : "");
 }
 
 /**
@@ -6075,8 +6357,17 @@ static void __setup_per_zone_wmarks(void)
 			zone->watermark[WMARK_MIN] = tmp;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		/*
+		 * Set the kswapd watermarks distance according to the
+		 * scale factor in proportion to available memory, but
+		 * ensure a minimum size on small systems.
+		 */
+		tmp = max_t(u64, tmp >> 2,
+			    mult_frac(zone->managed_pages,
+				      watermark_scale_factor, 10000));
+
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
 			high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6217,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_wmarks();
+
+	return 0;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
@@ -6408,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
 
-	printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
-	       tablename,
-	       (1UL << log2qty),
-	       ilog2(size) - PAGE_SHIFT,
-	       size);
+	pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
 
 	if (_hash_shift)
 		*_hash_shift = log2qty;
@@ -6563,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * This check already skips compound tails of THP
 		 * because their page->_count is zero at all time.
 		 */
-		if (!atomic_read(&page->_count)) {
+		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
 				iter += (1 << page_order(page)) - 1;
 			continue;
@@ -6913,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
 #ifdef CONFIG_DEBUG_VM
-		printk(KERN_INFO "remove from free list %lx %d %lx\n",
-		       pfn, 1 << order, end_pfn);
+		pr_info("remove from free list %lx %d %lx\n",
+			pfn, 1 << order, end_pfn);
 #endif
 		list_del(&page->lru);
 		rmv_page_order(page);
@@ -6927,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_FAILURE
 bool is_free_buddy_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -6946,4 +7248,3 @@ bool is_free_buddy_page(struct page *page)
 
 	return order < MAX_ORDER;
 }
-#endif
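
For readers following the watermark_scale_factor change above (the new tunable plus the reworked __setup_per_zone_wmarks() hunk), the arithmetic can be reproduced on its own. The sketch below is not kernel code; the zone size and minimum-watermark figures are made-up assumptions, and it only mirrors the max(min/4, managed_pages * watermark_scale_factor / 10000) spacing that the patch introduces.

/* Illustrative sketch only: recomputes the kswapd watermark spacing from the
 * patch above with assumed example numbers (not taken from any real system). */
#include <stdio.h>

int main(void)
{
	unsigned long managed_pages = 4UL << 20;	/* assumed: ~16GB zone, 4KB pages */
	unsigned long min_pages = 16384;		/* assumed: zone share of min_free_kbytes */
	unsigned long watermark_scale_factor = 10;	/* default value added by the patch */
	unsigned long tmp, scaled;

	/* tmp = max(tmp >> 2, managed_pages * watermark_scale_factor / 10000) */
	tmp = min_pages >> 2;
	scaled = managed_pages * watermark_scale_factor / 10000;
	if (scaled > tmp)
		tmp = scaled;

	printf("WMARK_MIN  = %lu pages\n", min_pages);
	printf("WMARK_LOW  = %lu pages\n", min_pages + tmp);
	printf("WMARK_HIGH = %lu pages\n", min_pages + tmp * 2);
	return 0;
}

With the default factor of 10 this keeps roughly the old min/4 spacing on small zones and moves to about 0.1% of managed memory on large ones.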
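The new si_mem_available() helper above builds an "available without swapping" estimate from free pages, reclaimable page cache and reclaimable slab. User space typically consumes this kind of estimate through the MemAvailable field of /proc/meminfo; the reader below is a minimal sketch and simply prints that field where the running kernel provides it.

/* Minimal sketch: print the MemAvailable line from /proc/meminfo, the
 * user-visible counterpart of an estimate like si_mem_available() above
 * (assuming the running kernel exposes that field). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "MemAvailable:", 13) == 0) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}

On a kernel that does not expose MemAvailable the loop simply prints nothing.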
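The continue_merging/done_merging rework in __free_one_page() above keeps the usual buddy arithmetic and merely caps the first merging pass at pageblock_order + 1. As a reference for that arithmetic, here is a small standalone sketch; MAX_ORDER and the pageblock order are assumed typical values, and find_buddy_index() repeats the page_idx ^ (1 << order) formula used by __find_buddy_index() in this file.

/* Illustrative sketch of the buddy-index arithmetic referenced above.
 * The order limits are assumptions (common x86-64 defaults), not derived
 * from any particular configuration. */
#include <stdio.h>

#define MAX_ORDER	11
#define PAGEBLOCK_ORDER	9

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* same formula as __find_buddy_index() */
}

int main(void)
{
	unsigned long pfn = 0x12345;
	unsigned long page_idx = pfn & ((1UL << MAX_ORDER) - 1);
	/* First merging pass stops at pageblock_order + 1, as in the patch. */
	unsigned int max_order = MAX_ORDER < PAGEBLOCK_ORDER + 1 ?
					MAX_ORDER : PAGEBLOCK_ORDER + 1;
	unsigned int order;

	for (order = 0; order < max_order - 1; order++)
		printf("order %2u: page_idx %#lx buddy_idx %#lx\n",
		       order, page_idx, find_buddy_index(page_idx, order));
	return 0;
}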
