Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 325
1 file changed, 177 insertions, 148 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..009ac285fea7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pcppages_bulk(zone, to_drain, pcp);
-	pcp->count -= to_drain;
+	if (to_drain > 0) {
+		free_pcppages_bulk(zone, to_drain, pcp);
+		pcp->count -= to_drain;
+	}
 	local_irq_restore(flags);
 }
 #endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);
 
-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	if (order < fail_page_alloc.min_order)
-		return 0;
+		return false;
 	if (gfp_mask & __GFP_NOFAIL)
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
-		return 0;
+		return false;
 
 	return should_fail(&fail_page_alloc.attr, 1 << order);
 }
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
 
 #else /* CONFIG_FAIL_PAGE_ALLOC */
 
-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
-	return 0;
+	return false;
 }
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
+	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 
 	free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	if (unlikely(zone->nr_pageblock_isolate))
+		return zone->nr_pageblock_isolate * pageblock_nr_pages;
+	return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
+	/*
+	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+	 * it.  nr_zone_isolate_freepages is never accurate so kswapd might not
+	 * sleep although it could do so.  But this is more desirable for memory
+	 * hotplug than sleeping which can cause a livelock in the direct
+	 * reclaim path.
+	 */
+	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 		page = get_page_from_freelist(gfp_mask, nodemask,
 				order, zonelist, high_zoneidx,
-				alloc_flags, preferred_zone,
-				migratetype);
+				alloc_flags & ~ALLOC_NO_WATERMARKS,
+				preferred_zone, migratetype);
 		if (page) {
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
-					alloc_flags, preferred_zone,
-					migratetype);
+					alloc_flags & ~ALLOC_NO_WATERMARKS,
+					preferred_zone, migratetype);
 
 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-		if (!in_interrupt() &&
-		    ((current->flags & PF_MEMALLOC) ||
-		     unlikely(test_thread_flag(TIF_MEMDIE))))
+		if (gfp_mask & __GFP_MEMALLOC)
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() &&
+				((current->flags & PF_MEMALLOC) ||
+				 unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 
 	return alloc_flags;
 }
 
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
 
 	/* Allocate without watermarks if the context allows */
 	if (alloc_flags & ALLOC_NO_WATERMARKS) {
+		/*
+		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+		 * the allocation is high priority and these type of
+		 * allocations are system rather than user orientated
+		 */
+		zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
 		page = __alloc_pages_high_priority(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-		if (page)
+		if (page) {
+			/*
+			 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+			 * necessary to allocate the page. The expectation is
+			 * that the caller is taking steps that will free more
+			 * memory. The caller should avoid the page being used
+			 * for !PFMEMALLOC purposes.
+			 */
+			page->pfmemalloc = true;
 			goto got_pg;
+		}
 	}
 
 	/* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
 got_pg:
 	if (kmemcheck_enabled)
 		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-	return page;
 
+	return page;
 }
 
 /*
@@ -2515,6 +2569,8 @@ retry_cpuset:
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	else
+		page->pfmemalloc = false;
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
 			mutex_lock(&zonelists_mutex);
-			build_all_zonelists(NULL);
+			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
 		}
 	}
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
 DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
+	pg_data_t *self = data;
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
 #endif
+
+	if (self && !node_online(self->node_id)) {
+		build_zonelists(self);
+		build_zonelist_cache(self);
+	}
+
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  */
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
 	set_zonelist_order();
 
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
-		if (data)
-			setup_zone_pageset((struct zone *)data);
+		if (zone)
+			setup_zone_pageset(zone);
 #endif
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 		pcp->batch = PAGE_SHIFT * 8;
 }
 
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
 
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	return 0;
 }
 
-static int __zone_pcp_update(void *data)
-{
-	struct zone *zone = data;
-	int cpu;
-	unsigned long batch = zone_batchsize(zone), flags;
-
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		pset = per_cpu_ptr(zone->pageset, cpu);
-		pcp = &pset->pcp;
-
-		local_irq_save(flags);
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		setup_pageset(pset, batch);
-		local_irq_restore(flags);
-	}
-	return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
-	stop_machine(__zone_pcp_update, zone, NULL);
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 					 zone_batchsize(zone));
 }
 
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
 					unsigned long size,
 					enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
 * the kernel config
 */
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }
 
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
 */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;
 
 	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+		zone->compact_cached_free_pfn = zone->zone_start_pfn +
+						zone->spanned_pages;
+		zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
 #ifdef CONFIG_NUMA
 		zone->node = nid;
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone_pcp_init(zone);
 		lruvec_init(&zone->lruvec, zone);
-		zap_zone_vm_stats(zone);
-		zone->flags = 0;
 		if (!size)
 			continue;
 
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 
+	/* pg_data_t should be reset to zero when it's allocated */
+	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
 }
 
 /* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
 {
 #ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 }
 
 /*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include less @count unmovable pages
+ *
+ * PageLRU check wihtout isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
 */
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
 	int mt;
 
 	/*
 	 * For avoiding noise data, lru_add_drain_all() should be called
-	 * If ZONE_MOVABLE, the zone never contains immobile pages
+	 * If ZONE_MOVABLE, the zone never contains unmovable pages
 	 */
 	if (zone_idx(zone) == ZONE_MOVABLE)
-		return true;
+		return false;
 	mt = get_pageblock_migratetype(page);
 	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
-		return true;
+		return false;
 
 	pfn = page_to_pfn(page);
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 			continue;
 
 		page = pfn_to_page(check);
-		if (!page_count(page)) {
+		/*
+		 * We can't use page_count without pin a page
+		 * because another CPU can free compound page.
+		 * This check already skips compound tails of THP
+		 * because their page->_count is zero at all time.
+		 */
+		if (!atomic_read(&page->_count)) {
 			if (PageBuddy(page))
 				iter += (1 << page_order(page)) - 1;
 			continue;
 		}
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 		 * page at boot.
 		 */
 		if (found > count)
-			return false;
+			return true;
 	}
-	return true;
+	return false;
 }
 
 bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 			zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-	struct zone *zone;
-	unsigned long flags, pfn;
-	struct memory_isolate_notify arg;
-	int notifier_ret;
-	int ret = -EBUSY;
-
-	zone = page_zone(page);
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	pfn = page_to_pfn(page);
-	arg.start_pfn = pfn;
-	arg.nr_pages = pageblock_nr_pages;
-	arg.pages_found = 0;
-
-	/*
-	 * It may be possible to isolate a pageblock even if the
-	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
-	 * notifier chain is used by balloon drivers to return the
-	 * number of pages in a range that are held by the balloon
-	 * driver to shrink memory. If all the pages are accounted for
-	 * by balloons, are free, or on the LRU, isolation can continue.
-	 * Later, for example, when memory hotplug notifier runs, these
-	 * pages reported as "can be isolated" should be isolated(freed)
-	 * by the balloon driver through the memory notifier chain.
-	 */
-	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-	notifier_ret = notifier_to_errno(notifier_ret);
-	if (notifier_ret)
-		goto out;
-	/*
-	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-	 * We just check MOVABLE pages.
-	 */
-	if (__count_immobile_pages(zone, page, arg.pages_found))
-		ret = 0;
-
-	/*
-	 * immobile means "not-on-lru" paes. If immobile is larger than
-	 * removable-by-driver pages reported by notifier, we'll fail.
-	 */
-
-out:
-	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	}
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret)
-		drain_all_pages();
-	return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-	struct zone *zone;
-	unsigned long flags;
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-		goto out;
-	set_pageblock_migratetype(page, migratetype);
-	move_freepages_block(zone, page, migratetype);
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	return !has_unmovable_pages(zone, page, 0);
 }
 
 #ifdef CONFIG_CMA
@@ -5635,7 +5617,12 @@ static struct page *
 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
 			     int **resultp)
 {
-	return alloc_page(GFP_HIGHUSER_MOVABLE);
+	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
 }
 
 /* [start, end) must belong to a single zone. */
@@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+	struct zone *zone = data;
+	int cpu;
+	unsigned long batch = zone_batchsize(zone), flags;
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pset;
+		struct per_cpu_pages *pcp;
+
+		pset = per_cpu_ptr(zone->pageset, cpu);
+		pcp = &pset->pcp;
+
+		local_irq_save(flags);
+		if (pcp->count > 0)
+			free_pcppages_bulk(zone, pcp->count, pcp);
+		setup_pageset(pset, batch);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+	stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+	unsigned long flags;
+
+	/* avoid races with drain_pages()  */
+	local_irq_save(flags);
+	if (zone->pageset != &boot_pageset) {
+		free_percpu(zone->pageset);
+		zone->pageset = &boot_pageset;
+	}
+	local_irq_restore(flags);
+}
+
 /*
  * All pages in the range must be isolated before calling this.
  */
