summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorTony Lindgren <tony@atomide.com>2016-03-30 20:36:06 +0300
committerTony Lindgren <tony@atomide.com>2016-03-30 20:36:06 +0300
commit1809de7e7d37c585e01a1bcc583ea92b78fc759d (patch)
tree76c5b35c2b04eafce86a1a729c02ab705eba44bc /mm/page_alloc.c
parentebf24414809200915b9ddf7f109bba7c278c8210 (diff)
parent3ca4a238106dedc285193ee47f494a6584b6fd2f (diff)
downloadlinux-1809de7e7d37c585e01a1bcc583ea92b78fc759d.tar.xz
Merge tag 'for-v4.6-rc/omap-fixes-a' of git://git.kernel.org/pub/scm/linux/kernel/git/pjw/omap-pending into omap-for-v4.6/fixes
ARM: OMAP2+: first hwmod fix for v4.6-rc Fix a longstanding bug in the hwmod code that could cause hardware SYSCONFIG register values to not match the kernel's idea of what they should be, and that could result in lower performance during IP block idle entry. Basic build, boot, and PM test logs are available here: http://www.pwsan.com/omap/testlogs/omap-hwmod-fixes-a-for-v4.6-rc/20160326231727/
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c540
1 files changed, 423 insertions, 117 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
+ unsigned long max_initialise;
+
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
- /* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > max_initialise) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason,
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason,
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
- printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- unsigned int max_order = MAX_ORDER;
+ unsigned int max_order;
+
+ max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
- if (is_migrate_isolate(migratetype)) {
- /*
- * We restrict max order of merging to prevent merge
- * between freepages on isolate pageblock and normal
- * pageblock. Without this, pageblock isolation
- * could cause incorrect freepage accounting.
- */
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
- } else {
+ if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- }
- page_idx = pfn & ((1 << max_order) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
+continue_merging:
while (order < max_order - 1) {
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
- break;
+ goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page,
page_idx = combined_idx;
order++;
}
+ if (max_order < MAX_ORDER) {
+ /* If we are here, it means order is >= pageblock_order.
+ * We want to prevent merge between freepages on isolate
+ * pageblock and normal pageblock. Without this, pageblock
+ * isolation could cause incorrect freepage or CMA accounting.
+ *
+ * We don't want to hit this code for the more frequent
+ * low-order merging.
+ */
+ if (unlikely(has_isolate_pageblock(zone))) {
+ int buddy_mt;
+
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
+ buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (is_migrate_isolate(migratetype) ||
+ is_migrate_isolate(buddy_mt)))
+ goto done_merging;
+ }
+ max_order++;
+ goto continue_merging;
+ }
+
+done_merging:
set_page_order(page, order);
/*
@@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
@@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
@@ -1254,9 +1376,13 @@ free_range:
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page)
return 0;
}
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled() && poisoned;
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
int i;
+ bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return 1;
+ if (poisoned)
+ poisoned &= page_is_poisoned(p);
}
set_page_private(page, 0);
@@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
list_del(&page->lru);
pcp->count--;
} else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
page = NULL;
@@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
- /*
- * If this allocation cannot block and it is for a specific node, then
- * fail early. There's no need to wakeup kswapd or retry for a
- * speculative node-specific allocation.
- */
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
- goto nopage;
-
retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3372,7 +3498,7 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(size - 1, &page->_count);
+ page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3384,7 +3510,7 @@ refill:
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3392,7 +3518,7 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
+ set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
@@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
@@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s)
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
- printk(KERN_WARNING
- "Ignoring invalid numa_zonelist_order value: "
- "%s\n", s);
+ pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
@@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
+ * from zone_movable_pfn[nid] to end of each node should be
+ * ZONE_MOVABLE not ZONE_NORMAL. skip it.
+ */
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check given memblock attribute by firmware which can affect
+ * kernel memory layout. If zone==ZONE_MOVABLE but memory is
+ * mirrored, it's an overlapped memmap init. skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
@@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
@@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
@@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
@@ -5210,13 +5454,22 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&pgdat->split_queue_lock);
+ INIT_LIST_HEAD(&pgdat->split_queue);
+ pgdat->split_queue_len = 0;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5235,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
@@ -5285,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
@@ -5353,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5443,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
- printk(KERN_WARNING
- "Could not find start_pfn for node %d\n", nid);
+ pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -5524,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5783,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
@@ -5880,22 +6168,21 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- pr_info("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
+ ", %luK highmem"
#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
- totalcma_pages << (PAGE_SHIFT-10),
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
+ totalhigh_pages << (PAGE_SHIFT - 10),
#endif
- str ? ", " : "", str ? str : "");
+ str ? ", " : "", str ? str : "");
}
/**
@@ -6070,8 +6357,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6212,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -6403,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename,
- (1UL << log2qty),
- ilog2(size) - PAGE_SHIFT,
- size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
if (_hash_shift)
*_hash_shift = log2qty;
@@ -6558,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
- if (!atomic_read(&page->_count)) {
+ if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
@@ -6615,7 +6923,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
@@ -6908,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
+ pr_info("remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
#endif
list_del(&page->lru);
rmv_page_order(page);
@@ -6922,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -6941,4 +7248,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif