summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c181
1 files changed, 120 insertions, 61 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..a873e61e312e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
{
unsigned long page_idx;
unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
- combined_idx = __find_combined_index(page_idx, order);
- higher_page = page + combined_idx - page_idx;
- higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0; i < (1 << order); i++) {
- struct page *pg = page + i;
-
- if (PageAnon(pg))
- pg->mapping = NULL;
- bad += free_pages_check(pg);
- }
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page + i);
if (bad)
return false;
@@ -1095,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ pcp->count = 0;
+ }
local_irq_restore(flags);
}
}
@@ -1460,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -1793,15 +1807,18 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
struct page *page;
if (!order || compaction_deferred(preferred_zone))
return NULL;
+ current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask);
+ nodemask, sync_migration);
+ current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1854,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
return NULL;
}
@@ -1852,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
@@ -1920,19 +1937,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
@@ -1948,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1978,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2012,6 +2036,14 @@ restart:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+ first_zones_zonelist(zonelist, high_zoneidx, NULL,
+ &preferred_zone);
+
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2034,21 +2066,26 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /* Try direct compaction */
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ migratetype, &did_some_progress,
+ sync_migration);
if (page)
goto got_pg;
+ sync_migration = true;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2139,27 @@ rebalance:
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
show_mem();
}
@@ -2151,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
get_mems_allowed();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+ first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
if (!preferred_zone) {
put_mems_allowed();
return NULL;
@@ -2442,7 +2495,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -2585,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -5517,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = {
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
- {1UL << PG_buddy, "buddy" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
@@ -5565,7 +5624,7 @@ void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_count(page), page_mapcount(page),
+ page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
}