diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 1046 |
1 files changed, 297 insertions, 749 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b..7d3460c7a480 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,21 +18,14 @@ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/swapops.h> #include <linux/interrupt.h> -#include <linux/pagemap.h> #include <linux/jiffies.h> -#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/kmsan.h> #include <linux/module.h> #include <linux/suspend.h> -#include <linux/pagevec.h> -#include <linux/blkdev.h> -#include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/oom.h> #include <linux/topology.h> @@ -41,19 +34,8 @@ #include <linux/cpuset.h> #include <linux/memory_hotplug.h> #include <linux/nodemask.h> -#include <linux/vmalloc.h> #include <linux/vmstat.h> -#include <linux/mempolicy.h> -#include <linux/memremap.h> -#include <linux/stop_machine.h> -#include <linux/random.h> -#include <linux/sort.h> -#include <linux/pfn.h> -#include <linux/backing-dev.h> #include <linux/fault-inject.h> -#include <linux/page-isolation.h> -#include <linux/debugobjects.h> -#include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <trace/events/oom.h> @@ -61,26 +43,19 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/hugetlb.h> -#include <linux/sched/rt.h> #include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/page_table_check.h> -#include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/lockdep.h> -#include <linux/nmi.h> #include <linux/psi.h> #include <linux/khugepaged.h> #include <linux/delayacct.h> -#include <asm/sections.h> -#include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" #include "page_reporting.h" -#include "swap.h" /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ typedef int __bitwise fpi_t; @@ -227,18 +202,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -atomic_long_t _totalram_pages __read_mostly; -EXPORT_SYMBOL(_totalram_pages); -unsigned long totalreserve_pages __read_mostly; -unsigned long totalcma_pages __read_mostly; - -int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); -EXPORT_SYMBOL(init_on_alloc); - -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); -EXPORT_SYMBOL(init_on_free); /* * A cached value of the page's pageblock's migratetype, used when the page is @@ -258,44 +222,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) page->index = migratetype; } -#ifdef CONFIG_PM_SLEEP -/* - * The following functions are used by the suspend/hibernate code to temporarily - * change gfp_allowed_mask in order to avoid using I/O during memory allocations - * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with system_transition_mutex held - * (gfp_allowed_mask also should only be modified with system_transition_mutex - * held, unless the suspend/hibernate code is guaranteed not to run in parallel - * with that modification). - */ - -static gfp_t saved_gfp_mask; - -void pm_restore_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } -} - -void pm_restrict_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); - saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); -} - -bool pm_suspended_storage(void) -{ - if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) - return false; - return true; -} -#endif /* CONFIG_PM_SLEEP */ - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -314,7 +240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { +static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA [ZONE_DMA] = 256, #endif @@ -358,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { +static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE @@ -371,10 +297,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; -int watermark_scale_factor = 10; - -bool mirrored_kernelcore __initdata_memblock; +static int watermark_boost_factor __read_mostly = 15000; +static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -387,6 +311,12 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +static bool page_contains_unaccepted(struct page *page, unsigned int order); +static void accept_page(struct page *page, unsigned int order); +static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static inline bool has_unaccepted_memory(void); +static bool __free_unaccepted(struct page *page); + int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -550,13 +480,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) return ret; } -static int page_is_consistent(struct zone *zone, struct page *page) -{ - if (zone != page_zone(page)) - return 0; - - return 1; -} /* * Temporary debugging check for pages not lying within a given zone. */ @@ -564,7 +487,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; - if (!page_is_consistent(zone, page)) + if (zone != page_zone(page)) return 1; return 0; @@ -704,75 +627,6 @@ void destroy_large_folio(struct folio *folio) compound_page_dtors[dtor](&folio->page); } -#ifdef CONFIG_DEBUG_PAGEALLOC -unsigned int _debug_guardpage_minorder; - -bool _debug_pagealloc_enabled_early __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); -EXPORT_SYMBOL(_debug_pagealloc_enabled_early); -DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -EXPORT_SYMBOL(_debug_pagealloc_enabled); - -DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static int __init early_debug_pagealloc(char *buf) -{ - return kstrtobool(buf, &_debug_pagealloc_enabled_early); -} -early_param("debug_pagealloc", early_debug_pagealloc); - -static int __init debug_guardpage_minorder_setup(char *buf) -{ - unsigned long res; - - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); - return 0; - } - _debug_guardpage_minorder = res; - pr_info("Setting debug_guardpage_minorder to %lu\n", res); - return 0; -} -early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); - -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return false; - - if (order >= debug_guardpage_minorder()) - return false; - - __SetPageGuard(page); - INIT_LIST_HEAD(&page->buddy_list); - set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); - - return true; -} - -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return; - - __ClearPageGuard(page); - - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); -} -#else -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} -#endif - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -879,7 +733,7 @@ static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { return list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + struct page, buddy_list); } /* @@ -1131,6 +985,11 @@ static inline bool free_page_is_bad(struct page *page) return true; } +static inline bool is_check_pages_enabled(void) +{ + return static_branch_unlikely(&check_pages_enabled); +} + static int free_tail_page_prepare(struct page *head_page, struct page *page) { struct folio *folio = (struct folio *)head_page; @@ -1142,7 +1001,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) */ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - if (!static_branch_unlikely(&check_pages_enabled)) { + if (!is_check_pages_enabled()) { ret = 0; goto out; } @@ -1481,6 +1340,13 @@ void __free_pages_core(struct page *page, unsigned int order) atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + if (page_contains_unaccepted(page, order)) { + if (order == MAX_ORDER && __free_unaccepted(page)) + return; + + accept_page(page, order); + } + /* * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. @@ -1521,7 +1387,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, /* end_pfn is one past the range we are checking */ end_pfn--; - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + if (!pfn_valid(end_pfn)) return NULL; start_page = pfn_to_online_page(start_pfn); @@ -1540,33 +1406,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, return start_page; } -void set_zone_contiguous(struct zone *zone) -{ - unsigned long block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - -void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - /* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression @@ -2501,61 +2340,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -#ifdef CONFIG_HIBERNATION - -/* - * Touch the watchdog for every WD_PAGE_COUNT pages. - */ -#define WD_PAGE_COUNT (128*1024) - -void mark_free_pages(struct zone *zone) -{ - unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; - unsigned long flags; - unsigned int order, t; - struct page *page; - - if (zone_is_empty(zone)) - return; - - spin_lock_irqsave(&zone->lock, flags); - - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - - if (page_zone(page) != zone) - continue; - - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } - - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { - unsigned long i; - - pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) { - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - swsusp_set_page_free(pfn_to_page(pfn + i)); - } - } - } - spin_unlock_irqrestore(&zone->lock, flags); -} -#endif /* CONFIG_PM */ - static bool free_unref_page_prepare(struct page *page, unsigned long pfn, unsigned int order) { @@ -3052,7 +2836,8 @@ struct page *rmqueue(struct zone *preferred_zone, out: /* Separate test+clear to avoid unnecessary atomics */ - if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { + if ((alloc_flags & ALLOC_KSWAPD) && + unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } @@ -3061,80 +2846,6 @@ out: return page; } -#ifdef CONFIG_FAIL_PAGE_ALLOC - -static struct { - struct fault_attr attr; - - bool ignore_gfp_highmem; - bool ignore_gfp_reclaim; - u32 min_order; -} fail_page_alloc = { - .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_reclaim = true, - .ignore_gfp_highmem = true, - .min_order = 1, -}; - -static int __init setup_fail_page_alloc(char *str) -{ - return setup_fault_attr(&fail_page_alloc.attr, str); -} -__setup("fail_page_alloc=", setup_fail_page_alloc); - -static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - int flags = 0; - - if (order < fail_page_alloc.min_order) - return false; - if (gfp_mask & __GFP_NOFAIL) - return false; - if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return false; - if (fail_page_alloc.ignore_gfp_reclaim && - (gfp_mask & __GFP_DIRECT_RECLAIM)) - return false; - - /* See comment in __should_failslab() */ - if (gfp_mask & __GFP_NOWARN) - flags |= FAULT_NOWARN; - - return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -static int __init fail_page_alloc_debugfs(void) -{ - umode_t mode = S_IFREG | 0600; - struct dentry *dir; - - dir = fault_create_debugfs_attr("fail_page_alloc", NULL, - &fail_page_alloc.attr); - - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim); - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - - return 0; -} - -late_initcall(fail_page_alloc_debugfs); - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - -#else /* CONFIG_FAIL_PAGE_ALLOC */ - -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - return false; -} - -#endif /* CONFIG_FAIL_PAGE_ALLOC */ - noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); @@ -3159,6 +2870,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z, if (!(alloc_flags & ALLOC_CMA)) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + unusable_free += zone_page_state(z, NR_UNACCEPTED); +#endif return unusable_free; } @@ -3458,6 +3172,11 @@ retry: gfp_mask)) { int ret; + if (has_unaccepted_memory()) { + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can @@ -3510,6 +3229,11 @@ try_this_zone: return page; } else { + if (has_unaccepted_memory()) { + if (try_to_accept_memory(zone, order)) + goto try_this_zone; + } + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { @@ -3768,56 +3492,41 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (fatal_signal_pending(current)) return false; - if (compaction_made_progress(compact_result)) - (*compaction_retries)++; - - /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the - * failure could be caused by insufficient priority - */ - if (compaction_failed(compact_result)) - goto check_priority; - /* - * compaction was skipped because there are not enough order-0 pages - * to work with, so we retry only if it looks like reclaim can help. + * Compaction was skipped due to a lack of free order-0 + * migration targets. Continue if reclaim can help. */ - if (compaction_needs_reclaim(compact_result)) { + if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But the next retry should use a higher priority if allowed, so - * we don't just keep bailing out endlessly. + * Compaction managed to coalesce some page blocks, but the + * allocation failed presumably due to a race. Retry some. */ - if (compaction_withdrawn(compact_result)) { - goto check_priority; - } + if (compact_result == COMPACT_SUCCESS) { + /* + * !costly requests are much more important than + * __GFP_RETRY_MAYFAIL costly ones because they are de + * facto nofail and invoke OOM killer to move on while + * costly can fail and users are ready to cope with + * that. 1/4 retries is rather arbitrary but we would + * need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; - /* - * !costly requests are much more important than __GFP_RETRY_MAYFAIL - * costly ones because they are de facto nofail and invoke OOM - * killer to move on while costly can fail and users are ready - * to cope with that. 1/4 retries is rather arbitrary but we - * would need much more detailed feedback from compaction to - * make a better decision. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - max_retries /= 4; - if (*compaction_retries <= max_retries) { - ret = true; - goto out; + if (++(*compaction_retries) <= max_retries) { + ret = true; + goto out; + } } /* - * Make sure there are attempts at the highest priority if we exhausted - * all retries or failed at the lower priorities. + * Compaction failed. Retry with increasing priority. */ -check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; @@ -5137,383 +4846,6 @@ unsigned long nr_free_buffer_pages(void) } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -static inline void show_node(struct zone *zone) -{ - if (IS_ENABLED(CONFIG_NUMA)) - printk("Node %d ", zone_to_nid(zone)); -} - -long si_mem_available(void) -{ - long available; - unsigned long pagecache; - unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; - unsigned long reclaimable; - struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); - - for_each_zone(zone) - wmark_low += low_wmark_pages(zone); - - /* - * Estimate the amount of memory available for userspace allocations, - * without causing swapping or OOM. - */ - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; - - /* - * Not all the page cache can be freed, otherwise the system will - * start swapping or thrashing. Assume at least half of the page - * cache, or the low watermark worth of cache, needs to stay. - */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; - pagecache -= min(pagecache / 2, wmark_low); - available += pagecache; - - /* - * Part of the reclaimable slab and other kernel memory consists of - * items that are in use, and cannot be freed. Cap this estimate at the - * low watermark. - */ - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - available += reclaimable - min(reclaimable / 2, wmark_low); - - if (available < 0) - available = 0; - return available; -} -EXPORT_SYMBOL_GPL(si_mem_available); - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = totalram_pages(); - val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_zone_page_state(NR_FREE_PAGES); - val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages(); - val->freehigh = nr_free_highpages(); - val->mem_unit = PAGE_SIZE; -} - -EXPORT_SYMBOL(si_meminfo); - -#ifdef CONFIG_NUMA -void si_meminfo_node(struct sysinfo *val, int nid) -{ - int zone_type; /* needs to be signed */ - unsigned long managed_pages = 0; - unsigned long managed_highpages = 0; - unsigned long free_highpages = 0; - pg_data_t *pgdat = NODE_DATA(nid); - - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - struct zone *zone = &pgdat->node_zones[zone_type]; - - if (is_highmem(zone)) { - managed_highpages += zone_managed_pages(zone); - free_highpages += zone_page_state(zone, NR_FREE_PAGES); - } - } - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif - val->mem_unit = PAGE_SIZE; -} -#endif - -/* - * Determine whether the node should be displayed or not, depending on whether - * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). - */ -static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) -{ - if (!(flags & SHOW_MEM_FILTER_NODES)) - return false; - - /* - * no node mask - aka implicit memory numa policy. Do not bother with - * the synchronization - read_mems_allowed_begin - because we do not - * have to be precise here. - */ - if (!nodemask) - nodemask = &cpuset_current_mems_allowed; - - return !node_isset(nid, *nodemask); -} - -static void show_migration_types(unsigned char type) -{ - static const char types[MIGRATE_TYPES] = { - [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RECLAIMABLE] = 'E', - [MIGRATE_HIGHATOMIC] = 'H', -#ifdef CONFIG_CMA - [MIGRATE_CMA] = 'C', -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = 'I', -#endif - }; - char tmp[MIGRATE_TYPES + 1]; - char *p = tmp; - int i; - - for (i = 0; i < MIGRATE_TYPES; i++) { - if (type & (1 << i)) - *p++ = types[i]; - } - - *p = '\0'; - printk(KERN_CONT "(%s) ", tmp); -} - -static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) -{ - int zone_idx; - for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) - if (zone_managed_pages(pgdat->node_zones + zone_idx)) - return true; - return false; -} - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - * - * Bits in @filter: - * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's - * cpuset. - */ -void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) -{ - unsigned long free_pcp = 0; - int cpu, nid; - struct zone *zone; - pg_data_t *pgdat; - - for_each_populated_zone(zone) { - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - } - - printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" - " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu\n" - " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu\n" - " sec_pagetables:%lu bounce:%lu\n" - " kernel_misc_reclaimable:%lu\n" - " free:%lu free_pcp:%lu free_cma:%lu\n", - global_node_page_state(NR_ACTIVE_ANON), - global_node_page_state(NR_INACTIVE_ANON), - global_node_page_state(NR_ISOLATED_ANON), - global_node_page_state(NR_ACTIVE_FILE), - global_node_page_state(NR_INACTIVE_FILE), - global_node_page_state(NR_ISOLATED_FILE), - global_node_page_state(NR_UNEVICTABLE), - global_node_page_state(NR_FILE_DIRTY), - global_node_page_state(NR_WRITEBACK), - global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), - global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), - global_node_page_state(NR_FILE_MAPPED), - global_node_page_state(NR_SHMEM), - global_node_page_state(NR_PAGETABLE), - global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), - global_zone_page_state(NR_FREE_PAGES), - free_pcp, - global_zone_page_state(NR_FREE_CMA_PAGES)); - - for_each_online_pgdat(pgdat) { - if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) - continue; - if (!node_has_managed_zones(pgdat, max_zone_idx)) - continue; - - printk("Node %d" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" - " mapped:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " shmem:%lukB" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" -#endif - " writeback_tmp:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " shadow_call_stack:%lukB" -#endif - " pagetables:%lukB" - " sec_pagetables:%lukB" - " all_unreclaimable? %s" - "\n", - pgdat->node_id, - K(node_page_state(pgdat, NR_ACTIVE_ANON)), - K(node_page_state(pgdat, NR_INACTIVE_ANON)), - K(node_page_state(pgdat, NR_ACTIVE_FILE)), - K(node_page_state(pgdat, NR_INACTIVE_FILE)), - K(node_page_state(pgdat, NR_UNEVICTABLE)), - K(node_page_state(pgdat, NR_ISOLATED_ANON)), - K(node_page_state(pgdat, NR_ISOLATED_FILE)), - K(node_page_state(pgdat, NR_FILE_MAPPED)), - K(node_page_state(pgdat, NR_FILE_DIRTY)), - K(node_page_state(pgdat, NR_WRITEBACK)), - K(node_page_state(pgdat, NR_SHMEM)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS)), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), - K(node_page_state(pgdat, NR_ANON_THPS)), -#endif - K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - node_page_state(pgdat, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - node_page_state(pgdat, NR_KERNEL_SCS_KB), -#endif - K(node_page_state(pgdat, NR_PAGETABLE)), - K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? - "yes" : "no"); - } - - for_each_populated_zone(zone) { - int i; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - - free_pcp = 0; - for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; - - show_node(zone); - printk(KERN_CONT - "%s" - " free:%lukB" - " boost:%lukB" - " min:%lukB" - " low:%lukB" - " high:%lukB" - " reserved_highatomic:%luKB" - " active_anon:%lukB" - " inactive_anon:%lukB" - " active_file:%lukB" - " inactive_file:%lukB" - " unevictable:%lukB" - " writepending:%lukB" - " present:%lukB" - " managed:%lukB" - " mlocked:%lukB" - " bounce:%lukB" - " free_pcp:%lukB" - " local_pcp:%ukB" - " free_cma:%lukB" - "\n", - zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->watermark_boost), - K(min_wmark_pages(zone)), - K(low_wmark_pages(zone)), - K(high_wmark_pages(zone)), - K(zone->nr_reserved_highatomic), - K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), - K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), - K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), - K(zone->present_pages), - K(zone_managed_pages(zone)), - K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), - K(free_pcp), - K(this_cpu_read(zone->per_cpu_pageset->count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES))); - printk("lowmem_reserve[]:"); - for (i = 0; i < MAX_NR_ZONES; i++) - printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); - printk(KERN_CONT "\n"); - } - - for_each_populated_zone(zone) { - unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; - - if (zone_idx(zone) > max_zone_idx) - continue; - if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) - continue; - show_node(zone); - printk(KERN_CONT "%s: ", zone->name); - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[order]; - int type; - - nr[order] = area->nr_free; - total += nr[order] << order; - - types[order] = 0; - for (type = 0; type < MIGRATE_TYPES; type++) { - if (!free_area_empty(area, type)) - types[order] |= 1 << type; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { - printk(KERN_CONT "%lu*%lukB ", - nr[order], K(1UL) << order); - if (nr[order]) - show_migration_types(types[order]); - } - printk(KERN_CONT "= %lukB\n", K(total)); - } - - for_each_online_node(nid) { - if (show_mem_node_skip(filter, nid, nodemask)) - continue; - hugetlb_show_meminfo_node(nid); - } - - printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); - - show_swap_cache_info(); -} - static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; @@ -5560,12 +4892,12 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -char numa_zonelist_order[] = "Node"; - +static char numa_zonelist_order[] = "Node"; +#define NUMA_ZONELIST_ORDER_LEN 16 /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(struct ctl_table *table, int write, +static int numa_zonelist_order_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -5573,7 +4905,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, return proc_dostring(table, write, buffer, length, ppos); } - static int node_load[MAX_NUMNODES]; /** @@ -5976,6 +5307,7 @@ static int zone_batchsize(struct zone *zone) #endif } +static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU @@ -6505,7 +5837,7 @@ postcore_initcall(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, +static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6521,7 +5853,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, +static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6551,7 +5883,7 @@ static void setup_min_unmapped_ratio(void) } -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6578,7 +5910,7 @@ static void setup_min_slab_ratio(void) sysctl_min_slab_ratio) / 100; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -6602,8 +5934,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) +static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) { int i; @@ -6623,7 +5955,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, +static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -6656,9 +5988,83 @@ out: return ret; } +static struct ctl_table page_alloc_sysctl_table[] = { + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE_THOUSAND, + }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif + {} +}; + +void __init page_alloc_sysctl_init(void) +{ + register_sysctl_init("vm", page_alloc_sysctl_table); +} + #ifdef CONFIG_CONTIG_ALLOC -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) { @@ -6672,11 +6078,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list) dump_page(page, "migration failure"); } } -#else -static inline void alloc_contig_dump_pages(struct list_head *page_list) -{ -} -#endif /* [start, end) must belong to a single zone. */ int __alloc_contig_migrate_range(struct compact_control *cc, @@ -7215,3 +6616,150 @@ bool has_managed_dma(void) return false; } #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_UNACCEPTED_MEMORY + +/* Counts number of zones with unaccepted pages. */ +static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); + +static bool lazy_accept = true; + +static int __init accept_memory_parse(char *p) +{ + if (!strcmp(p, "lazy")) { + lazy_accept = true; + return 0; + } else if (!strcmp(p, "eager")) { + lazy_accept = false; + return 0; + } else { + return -EINVAL; + } +} +early_param("accept_memory", accept_memory_parse); + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ + phys_addr_t start = page_to_phys(page); + phys_addr_t end = start + (PAGE_SIZE << order); + + return range_contains_unaccepted_memory(start, end); +} + +static void accept_page(struct page *page, unsigned int order) +{ + phys_addr_t start = page_to_phys(page); + + accept_memory(start, start + (PAGE_SIZE << order)); +} + +static bool try_to_accept_memory_one(struct zone *zone) +{ + unsigned long flags; + struct page *page; + bool last; + + if (list_empty(&zone->unaccepted_pages)) + return false; + + spin_lock_irqsave(&zone->lock, flags); + page = list_first_entry_or_null(&zone->unaccepted_pages, + struct page, lru); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return false; + } + + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + accept_page(page, MAX_ORDER); + + __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); + + if (last) + static_branch_dec(&zones_with_unaccepted_pages); + + return true; +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + long to_accept; + int ret = false; + + /* How much to accept to get to high watermark? */ + to_accept = high_wmark_pages(zone) - + (zone_page_state(zone, NR_FREE_PAGES) - + __zone_watermark_unusable_free(zone, order, 0)); + + /* Accept at least one page */ + do { + if (!try_to_accept_memory_one(zone)) + break; + ret = true; + to_accept -= MAX_ORDER_NR_PAGES; + } while (to_accept > 0); + + return ret; +} + +static inline bool has_unaccepted_memory(void) +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + +static bool __free_unaccepted(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + bool first = false; + + if (!lazy_accept) + return false; + + spin_lock_irqsave(&zone->lock, flags); + first = list_empty(&zone->unaccepted_pages); + list_add_tail(&page->lru, &zone->unaccepted_pages); + __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + spin_unlock_irqrestore(&zone->lock, flags); + + if (first) + static_branch_inc(&zones_with_unaccepted_pages); + + return true; +} + +#else + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ + return false; +} + +static void accept_page(struct page *page, unsigned int order) +{ +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ + return false; +} + +static inline bool has_unaccepted_memory(void) +{ + return false; +} + +static bool __free_unaccepted(struct page *page) +{ + BUILD_BUG(); + return false; +} + +#endif /* CONFIG_UNACCEPTED_MEMORY */ |