diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 818 |
1 files changed, 312 insertions, 506 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b8ba3aebf6e..0027d8f4a1bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <linux/ftrace_event.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/page-debug-flags.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. + */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void) saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -181,39 +197,17 @@ static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP - /* - * MAX_ACTIVE_REGIONS determines the maximum number of distinct - * ranges of memory (RAM) that may be registered with add_active_range(). - * Ranges passed to add_active_range() will be merged if possible - * so the number of times add_active_range() can be called is - * related to the number of nodes and the number of holes - */ - #ifdef CONFIG_MAX_ACTIVE_REGIONS - /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ - #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS - #else - #if MAX_NUMNODES >= 32 - /* If there can be many nodes, allow up to 50 holes per node */ - #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) - #else - /* By default, allow up to 256 distinct regions */ - #define MAX_ACTIVE_REGIONS 256 - #endif - #endif - - static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; - static int __meminitdata nr_nodemap_entries; - static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __initdata required_kernelcore; - static unsigned long __initdata required_movablecore; - static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; - - /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ - int movable_zone; - EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __initdata required_kernelcore; +static unsigned long __initdata required_movablecore; +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +int movable_zone; +EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -333,8 +327,8 @@ out: * * The remaining PAGE_SIZE pages are called "tail pages". * - * All pages have PG_compound set. All pages have their ->private pointing at - * the head page (even the head page has this). + * All pages have PG_compound set. All tail pages have their ->first_page + * pointing at the head page. * * The first tail page's ->lru.next holds the address of the compound page's * put_page() function. Its ->lru.prev holds the order of allocation. @@ -403,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -460,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@ -516,11 +546,19 @@ static inline void __free_one_page(struct page *page, buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@ -654,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; - trace_mm_page_free_direct(page, order); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@ -692,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * permit the bootmem allocator to evade page validation on high-order frees - */ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { - if (order == 0) { - __ClearPageReserved(page); - set_page_count(page, 0); - set_page_refcounted(page); - __free_page(page); - } else { - int loop; + unsigned int nr_pages = 1 << order; + unsigned int loop; - prefetchw(page); - for (loop = 0; loop < BITS_PER_LONG; loop++) { - struct page *p = &page[loop]; + prefetchw(page); + for (loop = 0; loop < nr_pages; loop++) { + struct page *p = &page[loop]; - if (loop + 1 < BITS_PER_LONG) - prefetchw(p + 1); - __ClearPageReserved(p); - set_page_count(p, 0); - } - - set_page_refcounted(page); - __free_pages(page, order); + if (loop + 1 < nr_pages) + prefetchw(p + 1); + __ClearPageReserved(p); + set_page_count(p, 0); } + + set_page_refcounted(page); + __free_pages(page, order); } @@ -746,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -1211,6 +1257,19 @@ out: } /* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_page_free_batched(page, cold); + free_hot_cold_page(page, cold); + } +} + +/* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] * Each sub-page must be freed individually. @@ -1408,7 +1467,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) static int __init fail_page_alloc_debugfs(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, @@ -1457,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; int o; - free_pages -= (1 << order) + 1; + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) @@ -1667,6 +1726,35 @@ zonelist_scan: if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + /* + * When allocating a page cache page for writing, we + * want to get it from a zone that is within its dirty + * limit, such that no single zone holds more than its + * proportional share of globally allowed dirty pages. + * The dirty limits take into account the zone's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * This may look like it could increase pressure on + * lower zones by failing allocations in higher zones + * before they are full. But the pages that do spill + * over are limited as the lower zones are protected + * by this very same mechanism. It should not become + * a practical burden to them. + * + * XXX: For now, allow allocations to potentially + * exceed the per-zone dirty limit in the slowpath + * (ALLOC_WMARK_LOW unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * zones are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ + if ((alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + goto this_zone_full; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { @@ -1756,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; /* @@ -1795,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. + */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@ -1819,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. - */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@ -1886,14 +1981,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -1921,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -1933,8 +2040,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2084,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2164,12 +2273,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2218,7 +2337,8 @@ rebalance: /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; @@ -2232,8 +2352,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } @@ -2328,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { @@ -3407,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) if (page_to_nid(page) != zone_to_nid(zone)) continue; - /* Blocks with reserved pages will never free, skip them. */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - block_migratetype = get_pageblock_migratetype(page); - /* If this block is reserved, account for it */ - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } + /* Only test what is necessary when the reserves are not met */ + if (reserve > 0) { + /* + * Blocks with reserved pages will never free, skip + * them. + */ + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) + continue; - /* Suitable for reserving if this block is movable */ - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, MIGRATE_RESERVE); - move_freepages_block(zone, page, MIGRATE_RESERVE); - reserve--; - continue; + /* If this block is reserved, account for it */ + if (block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, + MIGRATE_RESERVE); + move_freepages_block(zone, page, + MIGRATE_RESERVE); + reserve--; + continue; + } } /* @@ -3737,35 +3856,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP -/* - * Basic iterator support. Return the first range of PFNs for a node - * Note: nid == MAX_NUMNODES returns first region regardless of node - */ -static int __meminit first_active_region_index_in_nid(int nid) -{ - int i; - - for (i = 0; i < nr_nodemap_entries; i++) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the next active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit next_active_region_index_in_nid(int index, int nid) -{ - for (index = index + 1; index < nr_nodemap_entries; index++) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. @@ -3775,15 +3866,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) */ int __meminit __early_pfn_to_nid(unsigned long pfn) { - int i; - - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long start_pfn = early_node_map[i].start_pfn; - unsigned long end_pfn = early_node_map[i].end_pfn; + unsigned long start_pfn, end_pfn; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) if (start_pfn <= pfn && pfn < end_pfn) - return early_node_map[i].nid; - } + return nid; /* This is a memory hole */ return -1; } @@ -3812,11 +3900,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) } #endif -/* Basic iterator support to walk early_node_map[] */ -#define for_each_active_range_index_in_nid(i, nid) \ - for (i = first_active_region_index_in_nid(nid); i != -1; \ - i = next_active_region_index_in_nid(i, nid)) - /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3826,122 +3909,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * add_active_ranges() contain no holes and may be freed, this * this function may be used instead of calling free_bootmem() manually. */ -void __init free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn) -{ - int i; - - for_each_active_range_index_in_nid(i, nid) { - unsigned long size_pages = 0; - unsigned long end_pfn = early_node_map[i].end_pfn; - - if (early_node_map[i].start_pfn >= max_low_pfn) - continue; - - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; - - size_pages = end_pfn - early_node_map[i].start_pfn; - free_bootmem_node(NODE_DATA(early_node_map[i].nid), - PFN_PHYS(early_node_map[i].start_pfn), - size_pages << PAGE_SHIFT); - } -} - -#ifdef CONFIG_HAVE_MEMBLOCK -/* - * Basic iterator support. Return the last range of PFNs for a node - * Note: nid == MAX_NUMNODES returns last region regardless of node - */ -static int __meminit last_active_region_index_in_nid(int nid) +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { - int i; - - for (i = nr_nodemap_entries - 1; i >= 0; i--) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the previous active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit previous_active_region_index_in_nid(int index, int nid) -{ - for (index = index - 1; index >= 0; index--) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - -#define for_each_active_range_index_in_nid_reverse(i, nid) \ - for (i = last_active_region_index_in_nid(nid); i != -1; \ - i = previous_active_region_index_in_nid(i, nid)) - -u64 __init find_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - int i; - - /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid_reverse(i, nid) { - u64 addr; - u64 ei_start, ei_last; - u64 final_start, final_end; - - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - - final_start = max(ei_start, goal); - final_end = min(ei_last, limit); - - if (final_start >= final_end) - continue; - - addr = memblock_find_in_range(final_start, final_end, size, align); + unsigned long start_pfn, end_pfn; + int i, this_nid; - if (addr == MEMBLOCK_ERROR) - continue; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { + start_pfn = min(start_pfn, max_low_pfn); + end_pfn = min(end_pfn, max_low_pfn); - return addr; + if (start_pfn < end_pfn) + free_bootmem_node(NODE_DATA(this_nid), + PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT); } - - return MEMBLOCK_ERROR; } -#endif int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { + unsigned long start_pfn, end_pfn; int i; - u64 start, end; /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - start = early_node_map[i].start_pfn; - end = early_node_map[i].end_pfn; - nr_range = add_range(range, az, nr_range, start, end); - } + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) + nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); return nr_range; } -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) -{ - int i; - int ret; - - for_each_active_range_index_in_nid(i, nid) { - ret = work_fn(early_node_map[i].start_pfn, - early_node_map[i].end_pfn, data); - if (ret) - break; - } -} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -3952,12 +3947,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) */ void __init sparse_memory_present_with_active_regions(int nid) { - int i; + unsigned long start_pfn, end_pfn; + int i, this_nid; - for_each_active_range_index_in_nid(i, nid) - memory_present(early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); } /** @@ -3974,13 +3968,15 @@ void __init sparse_memory_present_with_active_regions(int nid) void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { + unsigned long this_start_pfn, this_end_pfn; int i; + *start_pfn = -1UL; *end_pfn = 0; - for_each_active_range_index_in_nid(i, nid) { - *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); - *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { + *start_pfn = min(*start_pfn, this_start_pfn); + *end_pfn = max(*end_pfn, this_end_pfn); } if (*start_pfn == -1UL) @@ -4083,46 +4079,16 @@ unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { - int i = 0; - unsigned long prev_end_pfn = 0, hole_pages = 0; - unsigned long start_pfn; - - /* Find the end_pfn of the first active range of pfns in the node */ - i = first_active_region_index_in_nid(nid); - if (i == -1) - return 0; - - prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - - /* Account for ranges before physical memory on this node */ - if (early_node_map[i].start_pfn > range_start_pfn) - hole_pages = prev_end_pfn - range_start_pfn; - - /* Find all holes for the zone within the node */ - for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { - - /* No need to continue if prev_end_pfn is outside the zone */ - if (prev_end_pfn >= range_end_pfn) - break; - - /* Make sure the end of the zone is not within the hole */ - start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - prev_end_pfn = max(prev_end_pfn, range_start_pfn); + unsigned long nr_absent = range_end_pfn - range_start_pfn; + unsigned long start_pfn, end_pfn; + int i; - /* Update the hole size cound and move on */ - if (start_pfn > range_start_pfn) { - BUG_ON(prev_end_pfn > start_pfn); - hole_pages += start_pfn - prev_end_pfn; - } - prev_end_pfn = early_node_map[i].end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + nr_absent -= end_pfn - start_pfn; } - - /* Account for ranges past physical memory on this node */ - if (range_end_pfn > prev_end_pfn) - hole_pages += range_end_pfn - - max(range_start_pfn, prev_end_pfn); - - return hole_pages; + return nr_absent; } /** @@ -4143,14 +4109,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); - zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], - node_start_pfn); - zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], - node_end_pfn); + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, @@ -4158,7 +4124,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -#else +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) @@ -4176,7 +4142,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4296,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list l; + enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4346,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) - INIT_LIST_HEAD(&zone->lru[l].list); + for_each_lru(lru) + INIT_LIST_HEAD(&zone->lruvec.lists[lru]); zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; @@ -4399,10 +4365,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ @@ -4427,7 +4393,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 /* @@ -4449,170 +4415,6 @@ static inline void setup_nr_node_ids(void) #endif /** - * add_active_range - Register a range of PFNs backed by physical memory - * @nid: The node ID the range resides on - * @start_pfn: The start PFN of the available physical memory - * @end_pfn: The end PFN of the available physical memory - * - * These ranges are stored in an early_node_map[] and later used by - * free_area_init_nodes() to calculate zone sizes and holes. If the - * range spans a memory hole, it is up to the architecture to ensure - * the memory is not freed by the bootmem allocator. If possible - * the range being registered will be merged with existing ranges. - */ -void __init add_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i; - - mminit_dprintk(MMINIT_TRACE, "memory_register", - "Entering add_active_range(%d, %#lx, %#lx) " - "%d entries of %d used\n", - nid, start_pfn, end_pfn, - nr_nodemap_entries, MAX_ACTIVE_REGIONS); - - mminit_validate_memmodel_limits(&start_pfn, &end_pfn); - - /* Merge with existing active regions if possible */ - for (i = 0; i < nr_nodemap_entries; i++) { - if (early_node_map[i].nid != nid) - continue; - - /* Skip if an existing region covers this new one */ - if (start_pfn >= early_node_map[i].start_pfn && - end_pfn <= early_node_map[i].end_pfn) - return; - - /* Merge forward if suitable */ - if (start_pfn <= early_node_map[i].end_pfn && - end_pfn > early_node_map[i].end_pfn) { - early_node_map[i].end_pfn = end_pfn; - return; - } - - /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].start_pfn && - end_pfn >= early_node_map[i].start_pfn) { - early_node_map[i].start_pfn = start_pfn; - return; - } - } - - /* Check that early_node_map is large enough */ - if (i >= MAX_ACTIVE_REGIONS) { - printk(KERN_CRIT "More than %d memory regions, truncating\n", - MAX_ACTIVE_REGIONS); - return; - } - - early_node_map[i].nid = nid; - early_node_map[i].start_pfn = start_pfn; - early_node_map[i].end_pfn = end_pfn; - nr_nodemap_entries = i + 1; -} - -/** - * remove_active_range - Shrink an existing registered range of PFNs - * @nid: The node id the range is on that should be shrunk - * @start_pfn: The new PFN of the range - * @end_pfn: The new PFN of the range - * - * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept near the end physical page range that has already been - * registered. This function allows an arch to shrink an existing registered - * range. - */ -void __init remove_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i, j; - int removed = 0; - - printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn <= end_pfn) { - /* clear it */ - early_node_map[i].start_pfn = 0; - early_node_map[i].end_pfn = 0; - removed = 1; - continue; - } - if (early_node_map[i].start_pfn < start_pfn && - early_node_map[i].end_pfn > start_pfn) { - unsigned long temp_end_pfn = early_node_map[i].end_pfn; - early_node_map[i].end_pfn = start_pfn; - if (temp_end_pfn > end_pfn) - add_active_range(nid, end_pfn, temp_end_pfn); - continue; - } - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn > end_pfn && - early_node_map[i].start_pfn < end_pfn) { - early_node_map[i].start_pfn = end_pfn; - continue; - } - } - - if (!removed) - return; - - /* remove the blank ones */ - for (i = nr_nodemap_entries - 1; i > 0; i--) { - if (early_node_map[i].nid != nid) - continue; - if (early_node_map[i].end_pfn) - continue; - /* we found it, get rid of it */ - for (j = i; j < nr_nodemap_entries - 1; j++) - memcpy(&early_node_map[j], &early_node_map[j+1], - sizeof(early_node_map[j])); - j = nr_nodemap_entries - 1; - memset(&early_node_map[j], 0, sizeof(early_node_map[j])); - nr_nodemap_entries--; - } -} - -/** - * remove_all_active_ranges - Remove all currently registered regions - * - * During discovery, it may be found that a table like SRAT is invalid - * and an alternative discovery method must be used. This function removes - * all currently registered regions. - */ -void __init remove_all_active_ranges(void) -{ - memset(early_node_map, 0, sizeof(early_node_map)); - nr_nodemap_entries = 0; -} - -/* Compare two active node_active_regions */ -static int __init cmp_node_active_region(const void *a, const void *b) -{ - struct node_active_region *arange = (struct node_active_region *)a; - struct node_active_region *brange = (struct node_active_region *)b; - - /* Done this way to avoid overflows */ - if (arange->start_pfn > brange->start_pfn) - return 1; - if (arange->start_pfn < brange->start_pfn) - return -1; - - return 0; -} - -/* sort the node_map by start_pfn */ -void __init sort_node_map(void) -{ - sort(early_node_map, (size_t)nr_nodemap_entries, - sizeof(struct node_active_region), - cmp_node_active_region, NULL); -} - -/** * node_map_pfn_alignment - determine the maximum internode alignment * * This function should be called after node map is populated and sorted. @@ -4634,15 +4436,11 @@ void __init sort_node_map(void) unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; + unsigned long start, end, mask; int last_nid = -1; - int i; - - for_each_active_range_index_in_nid(i, MAX_NUMNODES) { - int nid = early_node_map[i].nid; - unsigned long start = early_node_map[i].start_pfn; - unsigned long end = early_node_map[i].end_pfn; - unsigned long mask; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (!start || last_nid < 0 || last_nid == nid) { last_nid = nid; last_end = end; @@ -4669,12 +4467,12 @@ unsigned long __init node_map_pfn_alignment(void) /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { - int i; unsigned long min_pfn = ULONG_MAX; + unsigned long start_pfn; + int i; - /* Assuming a sorted map, the first range found has the starting pfn */ - for_each_active_range_index_in_nid(i, nid) - min_pfn = min(min_pfn, early_node_map[i].start_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) + min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { printk(KERN_WARNING @@ -4703,15 +4501,16 @@ unsigned long __init find_min_pfn_with_active_regions(void) */ static unsigned long __init early_calculate_totalpages(void) { - int i; unsigned long totalpages = 0; + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + unsigned long pages = end_pfn - start_pfn; - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long pages = early_node_map[i].end_pfn - - early_node_map[i].start_pfn; totalpages += pages; if (pages) - node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); + node_set_state(nid, N_HIGH_MEMORY); } return totalpages; } @@ -4766,6 +4565,8 @@ restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long start_pfn, end_pfn; + /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested @@ -4782,13 +4583,10 @@ restart: kernelcore_remaining = kernelcore_node; /* Go through each range of PFNs within this node */ - for_each_active_range_index_in_nid(i, nid) { - unsigned long start_pfn, end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; - start_pfn = max(early_node_map[i].start_pfn, - zone_movable_pfn[nid]); - end_pfn = early_node_map[i].end_pfn; + start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; @@ -4869,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat) for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) + if (zone->present_pages) { node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + break; + } } #endif } @@ -4890,11 +4690,8 @@ static void check_for_regular_memory(pg_data_t *pgdat) */ void __init free_area_init_nodes(unsigned long *max_zone_pfn) { - unsigned long nid; - int i; - - /* Sort early_node_map as initialisation assumes it is sorted */ - sort_node_map(); + unsigned long start_pfn, end_pfn; + int i, nid; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -4941,11 +4738,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) } /* Print out the early_node_map[] */ - printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); - for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + printk("Early memory PFN ranges\n"); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4998,7 +4793,7 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** * set_dma_reserve - set the specified number of pages reserved in the first zone @@ -5082,8 +4877,19 @@ static void calculate_totalreserve_pages(void) if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } |