diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 233 |
1 files changed, 138 insertions, 95 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bb3484563ed..ac1fc986af44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +bool deferred_struct_pages __meminitdata; + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once @@ -443,15 +445,15 @@ static inline bool deferred_pages_enabled(void) return static_branch_unlikely(&deferred_pages); } -/* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __meminit early_page_uninitialised(unsigned long pfn) +/* Returns true if the struct page for the pfn is initialised */ +static inline bool __meminit early_page_initialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; + return false; - return false; + return true; } /* @@ -498,9 +500,9 @@ static inline bool deferred_pages_enabled(void) return false; } -static inline bool early_page_uninitialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn) { - return false; + return true; } static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) @@ -775,11 +777,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -805,7 +809,7 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + enum compound_dtor_id dtor = folio->_folio_dtor; VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); @@ -1291,6 +1295,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1306,16 +1311,16 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(head_compound_mapcount(head_page))) { - bad_page(page, "nonzero compound_mapcount"); + if (unlikely(folio_entire_mapcount(folio))) { + bad_page(page, "nonzero entire_mapcount"); goto out; } - if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { - bad_page(page, "nonzero subpages_mapcount"); + if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + bad_page(page, "nonzero nr_pages_mapped"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; @@ -1356,6 +1361,8 @@ out: * see the comment next to it. * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, * see the comment next to it. + * 4. The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages @@ -1403,7 +1410,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); page_table_check_free(page, order); @@ -1434,7 +1441,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free && free_page_is_bad(page)) bad++; @@ -1641,7 +1648,7 @@ static void __meminit init_reserved_page(unsigned long pfn) pg_data_t *pgdat; int nid, zid; - if (!early_page_uninitialised(pfn)) + if (early_page_initialised(pfn)) return; nid = early_pfn_to_nid(pfn); @@ -1804,7 +1811,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (early_page_uninitialised(pfn)) + if (!early_page_initialised(pfn)) return; if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ @@ -2468,7 +2475,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool reset_tags = true; int i; set_page_private(page, 0); @@ -2491,30 +2499,43 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed (which happens only when memory - * should be initialized as well). + * If memory tags should be zeroed + * (which happens only when memory should be initialized as well). */ - if (init_tags) { - /* Initialize both memory and tags. */ + if (zero_tags) { + /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); - /* Note that memory is already initialized by the loop above. */ + /* Take note that memory was initialized by the loop above. */ init = false; } if (!should_skip_kasan_unpoison(gfp_flags)) { - /* Unpoison shadow memory or set memory tags. */ - kasan_unpoison_pages(page, order, init); - - /* Note that memory is already initialized by KASAN. */ - if (kasan_has_integrated_init()) - init = false; - } else { - /* Ensure page_address() dereferencing does not fault. */ + /* Try unpoisoning (or setting tags) and initializing memory. */ + if (kasan_unpoison_pages(page, order, init)) { + /* Take note that memory was initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + /* Take note that memory tags were set by KASAN. */ + reset_tags = false; + } else { + /* + * KASAN decided to exclude this allocation from being + * (un)poisoned due to sampling. Make KASAN skip + * poisoning when the allocation is freed. + */ + SetPageSkipKASanPoison(page); + } + } + /* + * If memory tags have not been set by KASAN, reset the page tags to + * ensure page_address() dereferencing does not fault. + */ + if (reset_tags) { for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); } - /* If memory is still not initialized, do it now. */ + /* If memory is still not initialized, initialize it now. */ if (init) kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ @@ -2582,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, }; #ifdef CONFIG_CMA @@ -2844,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; *can_steal = false; - for (i = 0;; i++) { + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_TYPES) - break; - if (free_area_empty(area, fallback_mt)) continue; @@ -3706,10 +3724,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) + if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* + * If the allocation fails, allow OOM handling access + * to HIGHATOMIC reserves as failing now is worse than + * failing a high-order atomic allocation in the + * future. + */ + if (!page && (alloc_flags & ALLOC_OOM)) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { spin_unlock_irqrestore(&zone->lock, flags); return NULL; @@ -3939,15 +3967,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); long unusable_free = (1 << order) - 1; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. + * If the caller does not have rights to reserves below the min + * watermark then subtract the high-atomic reserves. This will + * over-estimate the size of the atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!(alloc_flags & ALLOC_RESERVES))) unusable_free += z->nr_reserved_highatomic; #ifdef CONFIG_CMA @@ -3971,25 +3998,37 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_HIGH) - min -= min / 2; + if (unlikely(alloc_flags & ALLOC_RESERVES)) { + /* + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ + if (alloc_flags & ALLOC_MIN_RESERVE) { + min -= min / 2; + + /* + * Non-blocking allocations (e.g. GFP_ATOMIC) can + * access more reserves than just __GFP_HIGH. Other + * non-blocking allocations requests such as GFP_NOWAIT + * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get + * access to the min reserve. + */ + if (alloc_flags & ALLOC_NON_BLOCK) + min -= min / 4; + } - if (unlikely(alloc_harder)) { /* - * OOM victims can try even harder than normal ALLOC_HARDER + * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; - else - min -= min / 4; } /* @@ -4023,8 +4062,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) + if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && + !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; + } } return false; } @@ -4064,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4244,7 +4286,7 @@ retry: * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4286,14 +4328,14 @@ try_this_zone: * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) reserve_highatomic_pageblock(page, zone, order); return page; } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4813,41 +4855,48 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, } static inline unsigned int -gfp_to_alloc_flags(gfp_t gfp_mask) +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* - * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD * to save two branches. */ - BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) - alloc_flags |= ALLOC_HARDER; + if (!(gfp_mask & __GFP_NOMEMALLOC)) { + alloc_flags |= ALLOC_NON_BLOCK; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + /* - * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed(). + * Ignore cpuset mems for non-blocking __GFP_HIGH (probably + * GFP_ATOMIC) rather than fail, see the comment for + * __cpuset_node_allowed(). */ - alloc_flags &= ~ALLOC_CPUSET; + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_MIN_RESERVE; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5028,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; @@ -5048,7 +5089,7 @@ restart: * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + alloc_flags = gfp_to_alloc_flags(gfp_mask, order); /* * We need to recalculate the starting point for the zonelist iterator @@ -5276,12 +5317,13 @@ nopage: WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* - * Help non-failing allocations by giving them access to memory - * reserves but do not use ALLOC_NO_WATERMARKS because this + * Help non-failing allocations by giving some access to memory + * reserves normally used for high priority non-blocking + * allocations but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make - * the situation worse + * the situation worse. */ - page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); if (page) goto got_pg; @@ -5390,7 +5432,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto out; /* Bulk allocator does not support memcg accounting. */ - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) goto failed; /* Use the single page allocator for one page. */ @@ -5562,7 +5604,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; @@ -6764,8 +6806,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, zone_end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) { + deferred_struct_pages = true; break; + } } page = pfn_to_page(pfn); @@ -7929,6 +7973,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); + lru_gen_init_pgdat(pgdat); } static void __init free_area_init_memoryless_node(int nid) @@ -8363,11 +8408,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Allocator not initialized yet */ pgdat = arch_alloc_nodedata(nid); - if (!pgdat) { - pr_err("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - continue; - } + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); free_area_init_memoryless_node(nid); @@ -8571,7 +8614,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); - mlock_page_drain_remote(cpu); + mlock_drain_remote(cpu); drain_pages(cpu); /* |