diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 342 |
1 files changed, 209 insertions, 133 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3e0c69a97b7..c21b33668133 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,7 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> +#include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> @@ -714,7 +715,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. @@ -729,9 +730,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -787,9 +785,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +799,16 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +822,9 @@ continue_merging: zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +839,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -865,12 +863,12 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -3007,18 +3005,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3033,6 +3025,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter, nodemask); +} + +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3041,11 +3047,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + + cpuset_print_current_mems_allowed(); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask, nodemask); +} + +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; } static inline struct page * @@ -3083,47 +3114,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. - */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3192,6 +3218,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3213,8 +3242,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3226,8 +3257,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3236,12 +3269,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * @@ -3464,6 +3500,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3474,8 +3512,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for @@ -3555,6 +3596,14 @@ retry_cpuset: no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or @@ -3566,14 +3615,6 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - - /* - * The fast path uses conservative alloc_flags to succeed only until - * kswapd needs to be woken up, and to avoid the cost of setting up - * alloc_flags precisely. So we do that now. - */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3650,35 +3691,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. - */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3702,14 +3729,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3738,6 +3757,10 @@ retry: if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3755,7 +3778,48 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + + cond_resched(); + goto retry; + } +fail: + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; @@ -4252,20 +4316,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4306,7 +4370,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4314,7 +4378,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4348,6 +4412,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" @@ -4397,7 +4464,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4462,7 +4529,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); @@ -5083,8 +5150,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) |