author     Linus Torvalds <torvalds@linux-foundation.org>   2020-06-04 06:24:15 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-06-04 06:24:15 +0300
commit     ee01c4d72adffb7d424535adf630f2955748fa8b (patch)
tree       9ea9f40473e105e936e7477ab7dc7248d899af21 /mm
parent     c444eb564fb16645c172d550359cb3d75fe8a040 (diff)
parent     09587a09ada2ed7c39aedfa2681152b5ac5641ee (diff)
download   linux-ee01c4d72adffb7d424535adf630f2955748fa8b.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"More mm/ work, plenty more to come
Subsystems affected by this patch series: slub, memcg, gup, kasan,
pagealloc, hugetlb, vmscan, tools, mempolicy, memblock, hugetlbfs,
thp, mmap, kconfig"
* akpm: (131 commits)
arm64: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
x86: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
riscv: support DEBUG_WX
mm: add DEBUG_WX support
drivers/base/memory.c: cache memory blocks in xarray to accelerate lookup
mm/thp: rename pmd_mknotpresent() as pmd_mkinvalid()
powerpc/mm: drop platform defined pmd_mknotpresent()
mm: thp: don't need to drain lru cache when splitting and mlocking THP
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
sparc32: register memory occupied by kernel as memblock.memory
include/linux/memblock.h: fix minor typo and unclear comment
mm, mempolicy: fix up gup usage in lookup_node
tools/vm/page_owner_sort.c: filter out unneeded line
mm: swap: memcg: fix memcg stats for huge pages
mm: swap: fix vmstats for huge pages
mm: vmscan: limit the range of LRU type balancing
mm: vmscan: reclaim writepage is IO cost
mm: vmscan: determine anon/file pressure balance at the reclaim root
mm: balance LRU lists based on relative thrashing
mm: only count actual rotations as LRU reclaim cost
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    9
-rw-r--r--  mm/Kconfig.debug      |   32
-rw-r--r--  mm/compaction.c       |   70
-rw-r--r--  mm/filemap.c          |   43
-rw-r--r--  mm/gup.c              |  191
-rw-r--r--  mm/huge_memory.c      |  270
-rw-r--r--  mm/hugetlb.c          |  208
-rw-r--r--  mm/internal.h         |   23
-rw-r--r--  mm/khugepaged.c       |  262
-rw-r--r--  mm/memblock.c         |   19
-rw-r--r--  mm/memcontrol.c       |  532
-rw-r--r--  mm/memory.c           |   52
-rw-r--r--  mm/memory_hotplug.c   |   10
-rw-r--r--  mm/mempolicy.c        |    5
-rw-r--r--  mm/migrate.c          |   20
-rw-r--r--  mm/oom_kill.c         |    4
-rw-r--r--  mm/page_alloc.c       |  589
-rw-r--r--  mm/page_owner.c       |    7
-rw-r--r--  mm/pgtable-generic.c  |    2
-rw-r--r--  mm/rmap.c             |   53
-rw-r--r--  mm/shmem.c            |  110
-rw-r--r--  mm/slab.c             |    4
-rw-r--r--  mm/slub.c             |    8
-rw-r--r--  mm/swap.c             |  135
-rw-r--r--  mm/swap_cgroup.c      |    6
-rw-r--r--  mm/swap_state.c       |  104
-rw-r--r--  mm/swapfile.c         |   25
-rw-r--r--  mm/userfaultfd.c      |    5
-rw-r--r--  mm/vmscan.c           |  278
-rw-r--r--  mm/vmstat.c           |   16
-rw-r--r--  mm/workingset.c       |   21
31 files changed, 1478 insertions(+), 1635 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 5c0362bd8d56..e3490ecac839 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool @@ -750,13 +747,13 @@ config DEFERRED_STRUCT_PAGE_INIT depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT + select PADATA help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up - a subset of memmap at boot and then initialise the rest in parallel - by starting one-off "pgdatinitX" kernel thread for each node X. This - has a potential performance impact on processes running early in the + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the lifetime of the system until these kthreads finish the initialisation. diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 0271b22e063f..2409f7fc1567 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -118,6 +118,38 @@ config DEBUG_RODATA_TEST ---help--- This option enables a testcase for the setting rodata read-only. +config ARCH_HAS_DEBUG_WX + bool + +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on ARCH_HAS_DEBUG_WX + depends on MMU + select PTDUMP_CORE + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving W+X + mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + <arch>/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config GENERIC_PTDUMP bool diff --git a/mm/compaction.c b/mm/compaction.c index 99b73e31afd7..14d2fe231ea4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1409,7 +1409,9 @@ fast_isolate_freepages(struct compact_control *cc) cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { - page = pfn_to_page(min_pfn); + page = pageblock_pfn_to_page(min_pfn, + pageblock_end_pfn(min_pfn), + cc->zone); cc->free_pfn = min_pfn; } } @@ -1966,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1979,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. 
*/ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1989,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2000,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2009,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2055,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. 
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2069,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2098,9 +2100,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2291,7 +2293,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2303,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2359,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2370,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2505,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2532,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; 
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2590,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2607,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2621,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2642,7 +2644,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/filemap.c b/mm/filemap.c index 3430280df607..455990621989 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; + mem_cgroup_migrate(old, new); + xas_lock_irqsave(&xas, flags); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. 
*/ if (!PageHuge(old)) - __dec_node_page_state(new, NR_FILE_PAGES); + __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); + __inc_lruvec_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(old)) - __dec_node_page_state(new, NR_SHMEM); + __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); + __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - mem_cgroup_migrate(old, new); if (freepage) freepage(old); put_page(old); @@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page, { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); - struct mem_cgroup *memcg; int error; void *old; @@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(PageSwapBacked(page), page); mapping_set_update(&xas, mapping); - if (!huge) { - error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg, false); - if (error) - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + if (!huge) { + error = mem_cgroup_charge(page, current->mm, gfp_mask); + if (error) + goto error; + } + do { xas_lock_irq(&xas); old = xas_load(&xas); @@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); + __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); - if (xas_error(&xas)) + if (xas_error(&xas)) { + error = xas_error(&xas); goto error; + } - if (!huge) - mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return xas_error(&xas); + return error; } ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); @@ -2636,7 +2633,7 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, NULL, page)) + if (alloc_set_pte(vmf, page)) goto unlock; unlock_page(page); goto next; @@ -989,6 +989,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * @@ -1265,6 +1266,10 @@ retry: } EXPORT_SYMBOL_GPL(fixup_user_fault); +/* + * Please note that this function, unlike __get_user_pages will not + * return 0 for nr_pages > 0 without FOLL_NOWAIT + */ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, @@ -2703,72 +2708,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -/* - * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * - * If the architecture does not support this function, simply return with no - * pages pinned. - * - * Careful, careful! 
COW breaking can go either way, so a non-write - * access can get ambiguous page results. If you call this function without - * 'write' set, you'd better be sure that you're ok with that ambiguity. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - unsigned long len, end; - unsigned long flags; - int nr_pinned = 0; - /* - * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, - * because gup fast is always a "pin with a +1 page refcount" request. - */ - unsigned int gup_flags = FOLL_GET; - - if (write) - gup_flags |= FOLL_WRITE; - - start = untagged_addr(start) & PAGE_MASK; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) - return 0; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * Disable interrupts. We use the nested form as we can already have - * interrupts disabled by get_futex_key. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - * - * NOTE! We allow read-only gup_fast() here, but you'd better be - * careful about possible COW pages. You'll get _a_ COW page, but - * not necessarily the one you intended to get depending on what - * COW event happens after this. COW may break the page copy in a - * random direction. - */ - - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); - local_irq_restore(flags); - } - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { @@ -2797,12 +2736,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; + unsigned long flags; int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN | FOLL_GET))) + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY))) return -EINVAL; + if (!(gup_flags & FOLL_FAST_ONLY)) + might_lock_read(¤t->mm->mmap_sem); + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; @@ -2819,16 +2763,36 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * order to avoid confusing the normal COW routines. So only * targets that are already writable are safe to do by just * looking at the page tables. + * + * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, + * because there is no slow path to fall back on. But you'd + * better be careful about possible COW pages - you'll get _a_ + * COW page, but not necessarily the one you intended to get + * depending on what COW event happens after this. COW may break + * the page copy in a random direction. + * + * Disable interrupts. The nested form is used, in order to allow + * full, general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. 
*/ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_disable(); - gup_pgd_range(addr, end, gup_flags | FOLL_WRITE, pages, &nr_pinned); - local_irq_enable(); + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + unsigned long fast_flags = gup_flags; + if (!(gup_flags & FOLL_FAST_ONLY)) + fast_flags |= FOLL_WRITE; + + local_irq_save(flags); + gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); + local_irq_restore(flags); ret = nr_pinned; } - if (nr_pinned < nr_pages) { + if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { /* Try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; @@ -2848,6 +2812,51 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + int nr_pinned; + /* + * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, + * because gup fast is always a "pin with a +1 page refcount" request. + * + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; + + if (write) + gup_flags |= FOLL_WRITE; + + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + + /* + * As specified in the API description above, this routine is not + * allowed to return negative values. However, the common core + * routine internal_get_user_pages_fast() *can* return -errno. + * Therefore, correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -2916,6 +2925,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the + * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * + * The API rules are the same, too: no negative values may be returned. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + + /* + * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API + * rules require returning 0, rather than -errno: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return 0; + /* + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * This routine is not allowed to return negative values. However, + * internal_get_user_pages_fast() *can* return -errno. 
Therefore, + * correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dddc863b3cbc..e8669885232f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -658,7 +656,6 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return ret; @@ -1255,263 +1252,63 @@ unlock: spin_unlock(vmf->ptl); } -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, - pmd_t orig_pmd, struct page *page) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mem_cgroup *memcg; - pgtable_t pgtable; - pmd_t _pmd; - int i; - vm_fault_t ret = 0; - struct page **pages; - struct mmu_notifier_range range; - - pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), - GFP_KERNEL); - if (unlikely(!pages)) { - ret |= VM_FAULT_OOM; - goto out; - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, - vmf->address, page_to_nid(page)); - if (unlikely(!pages[i] || - mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, - GFP_KERNEL, &memcg, false))) { - if (pages[i]) - put_page(pages[i]); - while (--i >= 0) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, - false); - put_page(pages[i]); - } - kfree(pages); - ret |= VM_FAULT_OOM; - goto out; - } - set_page_private(pages[i], (unsigned long)memcg); - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - copy_user_highpage(pages[i], page + i, - haddr + PAGE_SIZE * i, vma); - __SetPageUptodate(pages[i]); - cond_resched(); - } - - mmu_notifier_range_init(&range, 
MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_free_pages; - VM_BUG_ON_PAGE(!PageHead(page), page); - - /* - * Leave pmd empty until pte is filled note we must notify here as - * concurrent CPU thread might write to new page before the call to - * mmu_notifier_invalidate_range_end() happens which can lead to a - * device seeing memory write in different order than CPU. - * - * See Documentation/vm/mmu_notifier.rst - */ - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); - pmd_populate(vma->vm_mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false, false); - lru_cache_add_active_or_unevictable(pages[i], vma); - vmf->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*vmf->pte)); - set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); - pte_unmap(vmf->pte); - } - kfree(pages); - - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, vmf->pmd, pgtable); - page_remove_rmap(page, true); - spin_unlock(vmf->ptl); - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); - - ret |= VM_FAULT_WRITE; - put_page(page); - -out: - return ret; - -out_free_pages: - spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(&range); - for (i = 0; i < HPAGE_PMD_NR; i++) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, false); - put_page(pages[i]); - } - kfree(pages); - goto out; -} - vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *new_page; - struct mem_cgroup *memcg; + struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mmu_notifier_range range; - gfp_t huge_gfp; /* for allocation and charge */ - vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + if (is_huge_zero_pmd(orig_pmd)) - goto alloc; + goto fallback; + spin_lock(vmf->ptl); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_unlock; + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. - */ + + /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); lock_page(page); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); unlock_page(page); put_page(page); - goto out_unlock; + return 0; } put_page(page); } + + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. 
+ */ if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - ret |= VM_FAULT_WRITE; unlock_page(page); - goto out_unlock; - } - unlock_page(page); - get_page(page); - spin_unlock(vmf->ptl); -alloc: - if (__transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); - } else - new_page = NULL; - - if (likely(new_page)) { - prep_transhuge_page(new_page); - } else { - if (!page) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } else { - ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); - if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } - put_page(page); - } - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { - put_page(new_page); - split_huge_pmd(vma, vmf->pmd, vmf->address); - if (page) - put_page(page); - ret |= VM_FAULT_FALLBACK; - count_vm_event(THP_FAULT_FALLBACK); - count_vm_event(THP_FAULT_FALLBACK_CHARGE); - goto out; - } - - count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); - - if (!page) - clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); - else - copy_user_huge_page(new_page, page, vmf->address, - vma, HPAGE_PMD_NR); - __SetPageUptodate(new_page); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - spin_lock(vmf->ptl); - if (page) - put_page(page); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(new_page, memcg, true); - put_page(new_page); - goto out_mn; - } else { - pmd_t entry; - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - if (!page) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - } else { - VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page, true); - put_page(page); - } - ret |= VM_FAULT_WRITE; + return VM_FAULT_WRITE; } + + unlock_page(page); spin_unlock(vmf->ptl); -out_mn: - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. 
- */ - mmu_notifier_invalidate_range_only_end(&range); -out: - return ret; -out_unlock: - spin_unlock(vmf->ptl); - return ret; +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; } /* @@ -1581,7 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto skip_mlock; if (!trylock_page(page)) goto skip_mlock; - lru_add_drain(); if (page->mapping && !PageDoubleMap(page)) mlock_vma_page(page); unlock_page(page); @@ -2359,15 +2155,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_inc(&page[i]._mapcount); } + lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) atomic_dec(&page[i]._mapcount); } } + unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2808,7 +2606,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - bool mlocked; unsigned long flags; pgoff_t end; @@ -2867,14 +2664,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - mlocked = PageMlocked(head); unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); - /* Make sure the page is not on per-CPU pagevec as it takes pin */ - if (mlocked) - lru_add_drain(); - /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irqsave(&pgdata->lru_lock, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9a97320e1de..ac0d7bbc0692 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -59,8 +59,8 @@ __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; -static unsigned long __initdata default_hstate_size; static bool __initdata parsed_valid_hugepagesz = true; +static bool __initdata parsed_default_hugepagesz; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -3060,7 +3060,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("Hugetlb: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s", h->name); } } @@ -3164,7 +3164,7 @@ static void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; @@ -3212,23 +3212,41 @@ static int __init hugetlb_init(void) { int i; - if (!hugepages_supported()) + if (!hugepages_supported()) { + if (hugetlb_max_hstate || default_hstate_max_huge_pages) + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; + } - if (!size_to_hstate(default_hstate_size)) { - if (default_hstate_size != 0) { - pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", - default_hstate_size, HPAGE_SIZE); + /* + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some + * architectures depend on setup being done here. 
+ */ + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + if (!parsed_default_hugepagesz) { + /* + * If we did not parse a default huge page size, set + * default_hstate_idx to HPAGE_SIZE hstate. And, if the + * number of huge pages for this default size was implicitly + * specified, set that here as well. + * Note that the implicit setting will overwrite an explicit + * setting. A warning will be printed in this case. + */ + default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); + if (default_hstate_max_huge_pages) { + if (default_hstate.max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(&default_hstate), + 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", + default_hstate.max_huge_pages, buf); + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", + default_hstate_max_huge_pages); + } + default_hstate.max_huge_pages = + default_hstate_max_huge_pages; } - - default_hstate_size = HPAGE_SIZE; - if (!size_to_hstate(default_hstate_size)) - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - } - default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); - if (default_hstate_max_huge_pages) { - if (!default_hstate.max_huge_pages) - default_hstate.max_huge_pages = default_hstate_max_huge_pages; } hugetlb_cma_check(); @@ -3256,10 +3274,10 @@ static int __init hugetlb_init(void) } subsys_initcall(hugetlb_init); -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_bad_size(void) +/* Overwritten by architectures with more huge page sizes */ +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { - parsed_valid_hugepagesz = false; + return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) @@ -3268,7 +3286,6 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -3289,20 +3306,29 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; } -static int __init hugetlb_nrpages_setup(char *s) +/* + * hugepages command line processing + * hugepages normally follows a valid hugepagsz or default_hugepagsz + * specification. If not, ignore the hugepages value. hugepages can also + * be the first huge page command line option in which case it implicitly + * specifies the number of huge pages for the default size. + */ +static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; if (!parsed_valid_hugepagesz) { - pr_warn("hugepages = %s preceded by " - "an unsupported hugepagesz, ignoring\n", s); + pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 1; + return 0; } + /* - * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter + * yet, so this hugepages= parameter goes to the "default hstate". + * Otherwise, it goes with the previously parsed hugepagesz or + * default_hugepagesz. 
*/ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; @@ -3310,8 +3336,8 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); - return 1; + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); + return 0; } if (sscanf(s, "%lu", mhp) <= 0) @@ -3329,14 +3355,102 @@ static int __init hugetlb_nrpages_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_nrpages_setup); +__setup("hugepages=", hugepages_setup); -static int __init hugetlb_default_setup(char *s) +/* + * hugepagesz command line processing + * A specific huge page size can only be specified once with hugepagesz. + * hugepagesz is followed by hugepages on the command line. The global + * variable 'parsed_valid_hugepagesz' is used to determine if prior + * hugepagesz argument was valid. + */ +static int __init hugepagesz_setup(char *s) { - default_hstate_size = memparse(s, &s); + unsigned long size; + struct hstate *h; + + parsed_valid_hugepagesz = false; + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); + return 0; + } + + h = size_to_hstate(size); + if (h) { + /* + * hstate for this size already exists. This is normally + * an error, but is allowed if the existing hstate is the + * default hstate. More specifically, it is only allowed if + * the number of huge pages for the default hstate was not + * previously specified. + */ + if (!parsed_default_hugepagesz || h != &default_hstate || + default_hstate.max_huge_pages) { + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); + return 0; + } + + /* + * No need to call hugetlb_add_hstate() as hstate already + * exists. But, do set parsed_hstate so that a following + * hugepages= parameter will be applied to this hstate. + */ + parsed_hstate = h; + parsed_valid_hugepagesz = true; + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; return 1; } -__setup("default_hugepagesz=", hugetlb_default_setup); +__setup("hugepagesz=", hugepagesz_setup); + +/* + * default_hugepagesz command line input + * Only one instance of default_hugepagesz allowed on command line. + */ +static int __init default_hugepagesz_setup(char *s) +{ + unsigned long size; + + parsed_valid_hugepagesz = false; + if (parsed_default_hugepagesz) { + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); + return 0; + } + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); + return 0; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + parsed_default_hugepagesz = true; + default_hstate_idx = hstate_index(size_to_hstate(size)); + + /* + * The number of default huge pages (for this size) could have been + * specified as the first hugetlb parameter: hugepages=X. If so, + * then default_hstate_max_huge_pages is set. If the default huge + * page size is gigantic (>= MAX_ORDER), then the pages must be + * allocated here from bootmem allocator. 
+ */ + if (default_hstate_max_huge_pages) { + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (hstate_is_gigantic(&default_hstate)) + hugetlb_hstate_alloc_pages(&default_hstate); + default_hstate_max_huge_pages = 0; + } + + return 1; +} +__setup("default_hugepagesz=", default_hugepagesz_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { @@ -5354,8 +5468,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * - * Return: Pointer to page table or swap entry (PUD or PMD) for - * address @addr, or NULL if a p*d_none() entry is encountered and the + * Return: Pointer to page table entry (PUD or PMD) for + * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ @@ -5364,8 +5478,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, { pgd_t *pgd; p4d_t *p4d; - pud_t *pud, pud_entry; - pmd_t *pmd, pmd_entry; + pud_t *pud; + pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -5375,22 +5489,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(p4d, addr); - pud_entry = READ_ONCE(*pud); - if (sz != PUD_SIZE && pud_none(pud_entry)) - return NULL; - /* hugepage or swap? */ - if (pud_huge(pud_entry) || !pud_present(pud_entry)) + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ return (pte_t *)pud; - - pmd = pmd_offset(pud, addr); - pmd_entry = READ_ONCE(*pmd); - if (sz != PMD_SIZE && pmd_none(pmd_entry)) + if (!pud_present(*pud)) return NULL; - /* hugepage or swap? */ - if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry)) - return (pte_t *)pmd; + /* must have a valid entry and size to go further */ - return NULL; + pmd = pmd_offset(pud, addr); + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/internal.h b/mm/internal.h index f762a34b0c57..9117bca90f4b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -127,10 +127,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole strucure * by a const pointer. @@ -140,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. 
+ */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -224,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ @@ -529,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cd280afb246e..3f032487825b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,8 @@ enum scan_result { SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, @@ -47,7 +49,6 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, }; @@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, khugepaged_max_ptes_swap_store); +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, + khugepaged_max_ptes_shared_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -359,6 +389,7 @@ int __init khugepaged_init(void) khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + 
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -512,27 +543,52 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -compound_nr(page)); unlock_page(page); putback_lru_page(page); } -static void release_pte_pages(pte_t *pte, pte_t *_pte) +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) { + struct page *page, *tmp; + while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); + + page = pte_page(pteval); + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && + !PageCompound(page)) + release_pte_page(page); + } + + list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { + list_del(&page->lru); + release_pte_page(page); } } +static bool is_refcount_suitable(struct page *page) +{ + int expected_refcount; + + expected_refcount = total_mapcount(page); + if (PageSwapCache(page)) + expected_refcount += compound_nr(page); + + return page_count(page) == expected_refcount; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) + pte_t *pte, + struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; @@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + VM_BUG_ON_PAGE(!PageAnon(page), page); + + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out; } - VM_BUG_ON_PAGE(!PageAnon(page), page); + if (PageCompound(page)) { + struct page *p; + page = compound_head(page); + + /* + * Check if we have dealt with the compound page + * already + */ + list_for_each_entry(p, compound_pagelist, lru) { + if (page == p) + goto next; + } + } /* * We can do it before isolate_lru_page because the @@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * The page table that maps the page has been already unlinked + * from the page table tree and this process cannot get + * an additinal pin on the page. + * + * New pins can come later if the page is shared across fork, + * but not from this process. The other process cannot write to + * the page, only trigger CoW. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } + if (!pte_write(pteval) && PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { /* - * Page is not in the swap cache. It can be collapsed - * into a THP. + * Page is in the swap cache and cannot be re-used. 
+ * It cannot be collapsed into a THP. */ + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; } /* @@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + compound_nr(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + if (PageCompound(page)) + list_add_tail(&page->lru, compound_pagelist); +next: /* There should be enough young pte to collapse the page */ if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (pte_write(pteval)) + writable = true; } if (likely(writable)) { if (likely(referenced)) { @@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } out: - release_pte_pages(pte, _pte); + release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; @@ -643,13 +722,14 @@ out: static void __collapse_huge_page_copy(pte_t *pte, struct page *page, struct vm_area_struct *vma, unsigned long address, - spinlock_t *ptl) + spinlock_t *ptl, + struct list_head *compound_pagelist) { + struct page *src_page, *tmp; pte_t *_pte; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; - struct page *src_page; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); @@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, free_page_and_swap_cache(src_page); } } + + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); + } } static void khugepaged_alloc_sleep(void) @@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pgoff = linear_page_index(vma, address), }; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { @@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.pte--; pte_unmap(vmf.pte); + + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced) + int node, int referenced, int unmapped) { + LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; - 
struct mem_cgroup *memcg; struct vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; @@ -974,15 +1059,15 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); down_read(&mm->mmap_sem); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -990,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1000,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm, * If it fails, we release mmap_sem and jump out_nolock. * Continuing to collapse causes inconsistency. */ - if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced)) { up_read(&mm->mmap_sem); goto out_nolock; } @@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); if (unlikely(!isolated)) { @@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1102,10 +1186,11 @@ static void collapse_huge_page(struct mm_struct *mm, out_up_write: up_write(&mm->mmap_sem); out_nolock: + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int ret = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; spinlock_t *ptl; @@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } + page = compound_head(page); + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. 
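For readers tracking the memcg changes threaded through this series: the collapse_huge_page() hunk above is one instance of a tree-wide conversion from the three-step mem_cgroup_try_charge() / mem_cgroup_commit_charge() / mem_cgroup_cancel_charge() protocol to a single mem_cgroup_charge() call, with failure paths simply calling mem_cgroup_uncharge(). A minimal caller-side sketch of the shape before and after - illustrative only, condensed from conversions visible elsewhere in this diff (e.g. do_anonymous_page() in mm/memory.c); the sketch_* function names are hypothetical and locking/error handling is simplified:

/* Old protocol: try, set up rmap, then commit (or cancel on failure). */
static vm_fault_t sketch_fault_old(struct page *page, struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   unsigned long addr, gfp_t gfp)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge(page, mm, gfp, &memcg, false))
		return VM_FAULT_OOM;
	page_add_new_anon_rmap(page, vma, addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}

/* New protocol: one call up front; rmap/LRU code no longer carries memcg. */
static vm_fault_t sketch_fault_new(struct page *page, struct mm_struct *mm,
				   struct vm_area_struct *vma,
				   unsigned long addr, gfp_t gfp)
{
	if (mem_cgroup_charge(page, mm, gfp))
		return VM_FAULT_OOM;
	page_add_new_anon_rmap(page, vma, addr, false);
	lru_cache_add_active_or_unevictable(page, vma);
	/* A later failure path would just call mem_cgroup_uncharge(page). */
	return 0;
}

The same pattern is why collapse_huge_page() and collapse_file() now uncharge *hpage in their out paths instead of cancelling a pending charge.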
@@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * Here the check is racy: it may see total_mapcount > refcount + * in some cases. + * For example, one process with one forked child process. + * The parent has the PMD split due to MADV_DONTNEED, then + * the child is trying to unmap the whole PMD, but khugepaged + * may be scanning the parent between the child clearing the + * PageDoubleMap flag and decrementing the mapcount. So + * khugepaged may see total_mapcount > refcount. + * + * But such a case is ephemeral; we could always retry the collapse + * later. However, it may report a false positive if the page + * has excessive GUP pins (i.e. 512). Anyway, the same check + * will be done again later, so the risk seems low. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, mmu_notifier_test_young(vma->vm_mm, address)) referenced++; } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_LACK_REFERENCED_PAGE; - } - } else { + if (!writable) { result = SCAN_PAGE_RO; + } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, node, referenced); + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm, struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; - struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* This will be less messy when we use multi-index entries */ do { @@ -1547,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm, break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg, true); result = SCAN_FAIL; goto out; } @@ -1741,12 +1839,9 @@ out_unlock: } if (nr_none) { - struct zone *zone = page_zone(new_page); - - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_node_page_state(zone->zone_pgdat, - NR_SHMEM, nr_none); + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } xa_locked: @@ -1784,15 +1879,9 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - mem_cgroup_commit_charge(new_page, memcg, false, true); - - if (is_shmem) { + if (is_shmem) set_page_dirty(new_page); - lru_cache_add_anon(new_page); - } else { - lru_cache_add_file(new_page); - } - count_memcg_events(memcg, THP_COLLAPSE_ALLOC,
1); + lru_cache_add(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1839,13 +1928,14 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg, true); new_page->mapping = NULL; } unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } @@ -2084,6 +2174,8 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ + lru_add_drain_all(); + while (progress < pages) { if (!khugepaged_prealloc_page(&hpage, &wait)) break; diff --git a/mm/memblock.c b/mm/memblock.c index c79ba6f9920c..743659d88fc4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). */ @@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, { struct memblock_type *type = &memblock.memory; struct memblock_region *r; + int r_nid; while (++*idx < type->cnt) { r = &type->regions[*idx]; + r_nid = memblock_get_region_node(r); if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r->nid) + if (nid == MAX_NUMNODES || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (out_end_pfn) *out_end_pfn = PFN_DOWN(r->base + r->size); if (out_nid) - *out_nid = r->nid; + *out_nid = r_nid; } /** @@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); - return type->regions[mid].nid; + return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memcontrol.c 
b/mm/memcontrol.c index f973a025569b..5381afb23d58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -int do_swap_account __read_mostly; +bool cgroup_memory_noswap __read_mostly; #else -#define do_swap_account 0 +#define cgroup_memory_noswap 1 #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } #define THRESHOLDS_EVENTS_TARGET 128 @@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool compound, int nr_pages) + int nr_pages) { - /* - * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is - * counted as CACHE even if it's on ANON LRU. - */ - if (PageAnon(page)) - __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - else { - __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); - if (PageSwapBacked(page)) - __mod_memcg_state(memcg, NR_SHMEM, nr_pages); - } - - if (compound) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); - } - /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); @@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, * @page: the page * @pgdat: pgdat of the page * - * This function is only safe when following the LRU page isolation - * and putback protocol: the LRU lock must be held, and the page must - * either be PageLRU() or the caller must have isolated/allocated it. + * This function relies on page->mem_cgroup being stable - see the + * access rules in commit_charge(). */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { @@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg) */ seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * + (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * + (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * @@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. 
- */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_ANON_THPS) * + HPAGE_PMD_SIZE); +#endif for (i = 0; i < NR_LRU_LISTS; i++) seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), @@ -1981,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) */ struct mem_cgroup *lock_page_memcg(struct page *page) { + struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -2000,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = page->mem_cgroup; + memcg = head->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2008,7 +1987,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page->mem_cgroup) { + if (memcg != head->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2051,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) */ void unlock_page_memcg(struct page *page) { - __unlock_page_memcg(page->mem_cgroup); + struct page *head = compound_head(page); + + __unlock_page_memcg(head->mem_cgroup); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2659,6 +2640,7 @@ done_restock: return 0; } +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) @@ -2670,70 +2652,20 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } +#endif -static void lock_page_lru(struct page *page, int *isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - *isolated = 1; - } else - *isolated = 0; -} - -static void unlock_page_lru(struct page *page, int isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - if (isolated) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&pgdat->lru_lock); -} - -static void commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - int isolated; - VM_BUG_ON_PAGE(page->mem_cgroup, page); - /* - * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page - * may already be on some other mem_cgroup's LRU. Take care of it. 
- */ - if (lrucare) - lock_page_lru(page, &isolated); - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point: - * - * - the page is uncharged + * Any of the following ensures page->mem_cgroup stability: * - * - the page is off-LRU - * - * - an anonymous fault has exclusive page access, except for - * a locked page table - * - * - a page cache insertion, a swapin fault, or a migration - * have the page locked + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference */ page->mem_cgroup = memcg; - - if (lrucare) - unlock_page_lru(page, isolated); } #ifdef CONFIG_MEMCG_KMEM @@ -3070,8 +3002,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - - __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3354,8 +3284,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - val = memcg_page_state(memcg, MEMCG_CACHE) + - memcg_page_state(memcg, MEMCG_RSS); + val = memcg_page_state(memcg, NR_FILE_PAGES) + + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); } else { @@ -3743,7 +3673,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) + int nid, unsigned int lru_mask, bool tree) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; @@ -3754,13 +3684,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + if (tree) + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + else + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) + unsigned int lru_mask, + bool tree) { unsigned long nr = 0; enum lru_list lru; @@ -3768,7 +3702,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + if (tree) + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + else + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3788,34 +3725,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) }; const struct numa_stat *stat; int nid; - unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); - seq_printf(m, "%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + seq_printf(m, "%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + false)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, false)); seq_putc(m, '\n'); } for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - struct mem_cgroup *iter; - - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); - 
seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_node_nr_lru_pages( - iter, nid, stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + + seq_printf(m, "hierarchical_%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + true)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, true)); seq_putc(m, '\n'); } @@ -3824,9 +3755,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, + NR_FILE_PAGES, + NR_ANON_MAPPED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif NR_SHMEM, NR_FILE_MAPPED, NR_FILE_DIRTY, @@ -3837,7 +3770,9 @@ static const unsigned int memcg1_stats[] = { static const char *const memcg1_stat_names[] = { "cache", "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE "rss_huge", +#endif "shmem", "mapped_file", "dirty", @@ -3863,11 +3798,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state_local(memcg, memcg1_stats[i]) * - PAGE_SIZE); + nr = memcg_page_state_local(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -3913,23 +3853,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) { pg_data_t *pgdat; struct mem_cgroup_per_node *mz; - struct zone_reclaim_stat *rstat; - unsigned long recent_rotated[2] = {0, 0}; - unsigned long recent_scanned[2] = {0, 0}; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; } - seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); - seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); - seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); - seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); } #endif @@ -5364,8 +5298,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. 
*/ page = find_get_page(swap_address_space(ent), swp_offset(ent)); - if (do_memsw_account()) - entry->val = ent.val; + entry->val = ent.val; return page; } @@ -5399,8 +5332,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_memsw_account()) - *entry = swp; + *entry = swp; page = find_get_page(swap_address_space(swp), swp_offset(swp)); } @@ -5431,10 +5363,8 @@ static int mem_cgroup_move_account(struct page *page, { struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned long flags; unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; - bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5452,30 +5382,47 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; - anon = PageAnon(page); - pgdat = page_pgdat(page); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - spin_lock_irqsave(&from->move_lock, flags); + lock_page_memcg(page); - if (!anon && page_mapped(page)) { - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); - } + if (PageAnon(page)) { + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (PageTransHuge(page)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } - /* - * move_lock grabbed above and caller set from->moving_account, so - * mod_memcg_page_state will serialize updates to PageDirty. - * So mapping should be stable for dirty pages. - */ - if (!anon && PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + } + } else { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); + + if (PageSwapBacked(page)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } + + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); + } + + if (PageDirty(page)) { + struct address_space *mapping = page_mapping(page); - if (mapping_cap_account_dirty(mapping)) { - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages); + if (mapping_cap_account_dirty(mapping)) { + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, + -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, + nr_pages); + } } } @@ -5485,22 +5432,30 @@ static int mem_cgroup_move_account(struct page *page, } /* + * All state has been migrated, let's switch to the new memcg. + * * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. + * is referenced, charged, isolated, and locked: we can't race + * with (un)charging, migration, LRU putback, or anything else + * that would rely on a stable page->mem_cgroup. + * + * Note that lock_page_memcg is a memcg lock, not a page lock, + * to save space. As soon as we switch page->mem_cgroup to a + * new memcg that isn't locked, the above state can change + * concurrently again. Make sure we're truly done with it. 
*/ + smp_mb(); - /* caller should have done css_get */ - page->mem_cgroup = to; + page->mem_cgroup = to; /* caller should have done css_get */ - spin_unlock_irqrestore(&from->move_lock, flags); + __unlock_page_memcg(from); ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, compound, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, compound, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -6486,125 +6441,63 @@ out: } /** - * mem_cgroup_try_charge - try charging a page + * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @memcgp: charged memcg return - * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * - * Returns 0 on success, with *@memcgp pointing to the charged memcg. - * Otherwise, an error code is returned. - * - * After page->mapping has been set up, the caller must finalize the - * charge with mem_cgroup_commit_charge(). Or abort the transaction - * with mem_cgroup_cancel_charge() in case page instantiation fails. + * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) goto out; if (PageSwapCache(page)) { + swp_entry_t ent = { .val = page_private(page), }; + unsigned short id; + /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which + * already charged pages, too. page->mem_cgroup is protected + * by the page lock, which serializes swap cache removal, which * in turn serializes uncharging. 
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound_head(page)->mem_cgroup) goto out; - if (do_swap_account) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id = lookup_swap_cgroup_id(ent); - - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } + id = lookup_swap_cgroup_id(ent); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); } if (!memcg) memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); + if (ret) + goto out_put; - css_put(&memcg->css); -out: - *memcgp = memcg; - return ret; -} - -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) -{ - struct mem_cgroup *memcg; - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); - memcg = *memcgp; - mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); - return ret; -} - -/** - * mem_cgroup_commit_charge - commit a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @lrucare: page might be on LRU already - * @compound: charge the page as compound or small page - * - * Finalize a charge transaction started by mem_cgroup_try_charge(), - * after page->mapping has been set up. This must happen atomically - * as part of the page instantiation, i.e. under the page table lock - * for anonymous pages, under the page lock for page and swap cache. - * - * In addition, the page must not be on the LRU during the commit, to - * prevent racing with task migration. If it might be, use @lrucare. - * - * Use mem_cgroup_cancel_charge() to cancel the transaction instead. - */ -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - VM_BUG_ON_PAGE(!page->mapping, page); - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - commit_charge(page, memcg, lrucare); + commit_charge(page, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_memsw_account() && PageSwapCache(page)) { + if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -6613,42 +6506,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, */ mem_cgroup_uncharge_swap(entry, nr_pages); } -} -/** - * mem_cgroup_cancel_charge - cancel a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @compound: charge the page as compound or small page - * - * Cancel a charge transaction started by mem_cgroup_try_charge(). - */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. 
But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - cancel_charge(memcg, nr_pages); +out_put: + css_put(&memcg->css); +out: + return ret; } struct uncharge_gather { struct mem_cgroup *memcg; + unsigned long nr_pages; unsigned long pgpgout; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_kmem; - unsigned long nr_huge; - unsigned long nr_shmem; struct page *dummy_page; }; @@ -6659,37 +6528,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, nr_pages); + page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); - __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); - __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); - __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); if (!mem_cgroup_is_root(ug->memcg)) - css_put_many(&ug->memcg->css, nr_pages); + css_put_many(&ug->memcg->css, ug->nr_pages); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) { + unsigned long nr_pages; + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && - !PageHWPoison(page) , page); if (!page->mem_cgroup) return; @@ -6708,23 +6572,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->memcg = page->mem_cgroup; } - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; + nr_pages = compound_nr(page); + ug->nr_pages += nr_pages; - if (PageTransHuge(page)) { - nr_pages = compound_nr(page); - ug->nr_huge += nr_pages; - } - if (PageAnon(page)) - ug->nr_anon += nr_pages; - else { - ug->nr_file += nr_pages; - if (PageSwapBacked(page)) - ug->nr_shmem += nr_pages; - } + if (!PageKmemcg(page)) { ug->pgpgout++; } else { - ug->nr_kmem += compound_nr(page); + ug->nr_kmem += nr_pages; __ClearPageKmemcg(page); } @@ -6761,8 +6615,7 @@ static void uncharge_list(struct list_head *page_list) * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_try_charge() and - * mem_cgroup_commit_charge(). + * Uncharge a page previously charged with mem_cgroup_charge(). */ void mem_cgroup_uncharge(struct page *page) { @@ -6785,7 +6638,7 @@ void mem_cgroup_uncharge(struct page *page) * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + * mem_cgroup_charge(). 
*/ void mem_cgroup_uncharge_list(struct list_head *page_list) { @@ -6838,11 +6691,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, false); + commit_charge(newpage, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), - nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } @@ -7030,7 +6882,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_memsw_account()) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; memcg = page->mem_cgroup; @@ -7059,7 +6911,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg != swap_memcg) { + if (!cgroup_memory_noswap && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7072,8 +6924,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), - -nr_entries); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) @@ -7096,7 +6947,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; memcg = page->mem_cgroup; @@ -7112,7 +6963,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!mem_cgroup_is_root(memcg) && + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7140,14 +6991,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) { + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7163,7 +7011,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7180,7 +7028,7 @@ bool mem_cgroup_swap_full(struct page *page) if (vm_swap_full()) return true; - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = page->mem_cgroup; @@ -7198,22 +7046,15 @@ bool mem_cgroup_swap_full(struct page *page) return false; } -/* for remember boot option*/ -#ifdef 
CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - -static int __init enable_swap_account(char *s) +static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - really_do_swap_account = 1; + cgroup_memory_noswap = 0; else if (!strcmp(s, "0")) - really_do_swap_account = 0; + cgroup_memory_noswap = 1; return 1; } -__setup("swapaccount=", enable_swap_account); +__setup("swapaccount=", setup_swap_account); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -7310,7 +7151,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -static struct cftype memsw_cgroup_files[] = { +static struct cftype memsw_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -7339,13 +7180,16 @@ static struct cftype memsw_cgroup_files[] = { static int __init mem_cgroup_swap_init(void) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, - swap_files)); - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); - } + /* No memory control -> no swap control */ + if (mem_cgroup_disabled()) + cgroup_memory_noswap = true; + + if (cgroup_memory_noswap) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); + return 0; } subsys_initcall(mem_cgroup_swap_init); diff --git a/mm/memory.c b/mm/memory.c index a0e21e2c571e..7b70398f76a0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2645,7 +2645,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - struct mem_cgroup *memcg; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) @@ -2676,8 +2675,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); @@ -2712,7 +2712,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2752,7 +2751,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) page_copied = 1; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); - mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -3092,7 +3090,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; - struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; @@ -3133,10 +3130,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { + int err; + __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); - lru_cache_add_anon(page); + + /* Tell memcg to use swap ownership records */ + SetPageSwapCache(page); + err = mem_cgroup_charge(page, vma->vm_mm, + GFP_KERNEL); + ClearPageSwapCache(page); + if (err) + goto out_page; + + lru_cache_add(page); swap_readpage(page, true); } } else { @@ -3197,11 +3205,7 @@ vm_fault_t 
do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = VM_FAULT_OOM; - goto out_page; - } + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. @@ -3249,11 +3253,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } @@ -3289,7 +3291,6 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3310,7 +3311,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; @@ -3365,9 +3365,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, - false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that @@ -3395,14 +3395,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3413,7 +3411,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); goto unlock; oom_free_page: @@ -3618,7 +3615,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * mapping. If needed, the fucntion allocates page table or use pre-allocated. * * @vmf: fault environment - * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on @@ -3629,8 +3625,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Return: %0 on success, %VM_FAULT_ code in case of error. */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page) +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3638,9 +3633,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - /* THP on COW? 
*/ - VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -3667,7 +3659,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -3716,7 +3707,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) - ret = alloc_set_pte(vmf, vmf->memcg, page); + ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3876,11 +3867,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &vmf->memcg, false)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } + cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3898,7 +3889,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..926ec704e835 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -879,13 +879,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -1372,11 +1372,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 48ba9729062e..1965e2681877 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -927,10 +927,7 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err == 0) { - /* E.g. GUP interrupted by fatal signal */ - err = -EFAULT; - } else if (err > 0) { + if (err > 0) { err = page_to_nid(p); put_page(p); } diff --git a/mm/migrate.c b/mm/migrate.c index 846af96b84a5..7bfd0962149e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping, * are mapped to swap space. 
*/ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); + __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __dec_lruvec_state(old_lruvec, NR_SHMEM); + __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); @@ -2733,7 +2740,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2780,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -2826,7 +2832,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); @@ -2848,7 +2853,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..4daedf7b91f6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 843728e60f22..07ae77d97952 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include <linux/lockdep.h> #include <linux/nmi.h> #include <linux/psi.h> +#include <linux/padata.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; @@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long 
nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -639,10 +637,6 @@ static void bad_page(struct page *page, const char *reason, pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page, return true; } -static void free_pages_check_bad(struct page *page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + return bad_reason; +} + +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int free_pages_check(struct page *page) +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } @@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 
0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; @@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_check(page); + return check_free_page(page); else return false; } @@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; -int __meminit early_pfn_to_nid(unsigned long pfn) +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { - static DEFINE_SPINLOCK(early_pfn_lock); + unsigned long start_pfn, end_pfn; int nid; - spin_lock(&early_pfn_lock); - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid < 0) - nid = first_online_node; - spin_unlock(&early_pfn_lock); + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } return nid; } -#endif +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0 && nid != node) - return false; - return true; -} + if (nid < 0) + nid = first_online_node; + spin_unlock(&early_pfn_lock); -#else -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; + return nid; } -#endif - +#endif /* CONFIG_NEED_MULTIPLE_NODES */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -1692,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1722,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, continue; } else if (!page || !(pfn & 
nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1816,16 +1816,43 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, return nr_pages; } +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn; + struct zone *zone = arg; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator. + */ + while (spfn < end_pfn) { + deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } +} + +/* An arch may override for more concurrency. */ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long spfn = 0, epfn = 0; unsigned long first_init_pfn, flags; unsigned long start = jiffies; struct zone *zone; - int zid; + int zid, max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1845,6 +1872,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1857,21 +1891,30 @@ static int __init deferred_init_memmap(void *data) first_init_pfn)) goto zone_empty; - /* - * Initialize and free pages in MAX_ORDER sized increments so - * that we can avoid introducing any issues with the buddy - * allocator. - */ - while (spfn < epfn) - nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); -zone_empty: - pgdat_resize_unlock(pgdat, &flags); + max_threads = deferred_page_init_max_threads(cpumask); + while (spfn < epfn) { + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_chunk, + .fn_arg = zone, + .start = spfn, + .size = epfn_align - spfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + }; + + padata_do_multithreaded(&job); + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + epfn_align); + } +zone_empty: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", - pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); + pr_info("node %d deferred pages initialised in %ums\n", + pgdat->node_id, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1909,17 +1952,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. 
- */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. */ @@ -1947,6 +1979,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) first_deferred_pfn = spfn; nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); /* We should only stop along section boundaries */ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) @@ -2092,31 +2125,14 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; - - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason, bad_flags); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* @@ -2609,7 +2625,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -2768,6 +2784,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; +#ifdef CONFIG_CMA + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (migratetype == MIGRATE_MOVABLE && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; + } +#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -3464,7 +3494,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3509,7 +3539,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. 
*/ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3542,14 +3572,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3567,22 +3598,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3659,8 +3691,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3715,7 +3747,7 @@ retry: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3748,7 +3780,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3907,7 +3939,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4110,10 +4142,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. 
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4237,12 +4269,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4285,7 +4317,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4377,8 +4409,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4392,7 +4424,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4511,7 +4543,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4598,7 +4630,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4732,10 +4764,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; @@ -4771,7 +4803,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. 
*/ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -5682,14 +5714,13 @@ static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { @@ -5901,7 +5932,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5917,27 +5947,9 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } -#ifdef CONFIG_SPARSEMEM -/* Skip PFNs that belong to non-present sections */ -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - const unsigned long section_nr = pfn_to_section_nr(++pfn); - - if (present_section_nr(section_nr)) - return pfn; - return section_nr_to_pfn(next_present_section_nr(section_nr)); -} -#else -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - return pfn++; -} -#endif - /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5977,14 +5989,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn); - continue; - } - if (!early_pfn_in_nid(pfn, nid)) { - pfn++; - continue; - } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6100,9 +6104,23 @@ static void __meminit zone_init_free_lists(struct zone *zone) } void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) + unsigned long zone, + unsigned long range_start_pfn) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); + unsigned long start_pfn, end_pfn; + unsigned long range_end_pfn = range_start_pfn + size; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + + if (end_pfn > start_pfn) { + size = end_pfn - start_pfn; + memmap_init_zone(size, nid, zone, start_pfn, + MEMMAP_EARLY, NULL); + } + } } static int zone_batchsize(struct zone *zone) @@ -6254,10 +6272,25 @@ void __init setup_per_cpu_pageset(void) { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. 
+ */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif + for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); @@ -6300,57 +6333,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 
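The reworked memmap_init() earlier in this page_alloc.c hunk drops the early_pfn_valid()/early_pfn_in_nid() filtering and instead walks each memblock range on the node, clamping it to the zone's PFN window and initialising only the intersection. A rough userspace sketch of that clamp-and-skip pattern follows (the ranges, zone bounds and helper names are invented for illustration; this is not part of the patch):

#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) in PFNs */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* hypothetical memory ranges present on one node */
	struct range ranges[] = { { 0x000, 0x100 }, { 0x180, 0x300 }, { 0x400, 0x500 } };
	unsigned long zone_start = 0x080, zone_end = 0x450;	/* zone being initialised */

	for (unsigned int i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		unsigned long start = clamp_ul(ranges[i].start, zone_start, zone_end);
		unsigned long end = clamp_ul(ranges[i].end, zone_start, zone_end);

		if (end > start)	/* only the intersecting part is initialised */
			printf("init PFNs [%#lx, %#lx)\n", start, end);
	}
	return 0;
}

With the sample numbers this prints three clamped intervals ([0x80, 0x100), [0x180, 0x300) and [0x400, 0x450)); a range that misses the zone entirely would produce an empty intersection and simply be skipped, which is what lets the kernel drop the per-PFN validity checks.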
@@ -6463,8 +6445,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6528,8 +6509,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6576,45 +6556,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __init zone_absent_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6622,17 +6566,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6932,10 +6880,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6952,30 +6898,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long 
node_start_pfn, - unsigned long *zholes_size) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6983,6 +6924,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_memoryless_node(int nid) +{ + free_area_init_node(nid); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7066,8 +7012,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7131,24 +7075,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * @@ -7157,7 +7083,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* @@ -7210,7 +7136,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -7231,7 +7157,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); @@ -7246,7 +7172,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) } if (mem_below_4gb_not_mirrored) - pr_warn("This configuration results in unmirrored kernel memory."); + pr_warn("This configuration results in unmirrored kernel memory.\n"); goto out2; } @@ -7411,8 +7337,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. 
For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7424,10 +7359,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7436,14 +7372,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } @@ -7496,8 +7438,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7562,8 +7503,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); @@ -7686,13 +7625,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) -{ - init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); -} - static int page_alloc_cpu_dead(unsigned int cpu) { @@ -7810,9 +7742,10 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; + if (!sysctl_lowmem_reserve_ratio[idx] || + !zone_managed_pages(lower_zone)) { lower_zone->lowmem_reserve[j] = 0; + continue; } else { lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; @@ -7877,9 +7810,9 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); + zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -8064,7 +7997,15 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int 
write, void *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; } @@ -8392,7 +8333,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - unsigned long nr_reclaimed; + unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3d7c01e76efc..d18f0e1b6792 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } diff --git a/mm/rmap.c b/mm/rmap.c index f79a206b271a..ad4a0fdcc94c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first; + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) - __inc_node_page_state(page, NR_ANON_THPS); - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) - return; - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) { + unlock_page_memcg(page); + return; + } /* address might be in next vma when migration races vma_adjust */ if (first) @@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) int i, nr = 1; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); - goto out; + return; } /* page still mapped by someone else? */ @@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; if (PageSwapBacked(page)) __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; } /* @@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); -out: - unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } /** @@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) - return page_remove_file_rmap(page, compound); + lock_page_memcg(page); - if (compound) - return page_remove_anon_compound_rmap(page); + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ - __dec_node_page_state(page, NR_ANON_MAPPED); + __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ +out: + unlock_page_memcg(page); } /* diff --git a/mm/shmem.c b/mm/shmem.c index bd8840082c94..ea95a3e46fbb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,11 +605,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -621,6 +623,18 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + if (!PageSwapCache(page)) { + error = mem_cgroup_charge(page, charge_mm, gfp); + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + cgroup_throttle_swaprate(page, gfp); + do { void *entry; xas_lock_irq(&xas); @@ -641,19 +655,22 @@ next: __inc_node_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); + return error; } /* @@ -670,8 +687,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -1578,8 +1595,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1591,8 +1609,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1619,7 +1636,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : current->mm; - struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1664,31 +1680,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1734,7 +1731,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; @@ -1859,25 +1855,12 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) { - if (PageTransHuge(page)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced += compound_nr(page); @@ -2314,7 +2297,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; spinlock_t *ptl; void *page_kaddr; struct page *page; @@ -2364,16 +2346,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); + goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2394,13 +2370,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; + goto out_release_unlock; ret = -EEXIST; if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_release_unlock; - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced++; @@ -2419,12 +2395,10 @@ static 
int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = 0; out: return ret; -out_release_uncharge_unlock: +out_release_unlock: pte_unmap_unlock(dst_pte, ptl); ClearPageDirty(page); delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); out_release: unlock_page(page); put_page(page); diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slub.c b/mm/slub.c index 336be3224092..d52487919278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1938,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1967,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); @@ -5835,8 +5835,10 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) + if (err) { + kobject_put(&s->kobj); goto out; + } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) diff --git a/mm/swap.c b/mm/swap.c index 0ac463d44cff..dbcab84c6fce 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -102,8 +102,6 @@ static void __put_single_page(struct page *page) static void __put_compound_page(struct page *page) { - compound_page_dtor *dtor; - /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. 
This is because hugetlb page does never have PageLRU set @@ -112,8 +110,7 @@ static void __put_compound_page(struct page *page) */ if (!PageHuge(page)) __page_cache_release(page); - dtor = get_compound_page_dtor(page); - (*dtor)(page); + destroy_compound_page(page); } void __put_page(struct page *page) @@ -244,7 +241,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec, page_lru(page)); - (*pgmoved)++; + (*pgmoved) += hpage_nr_pages(page); } } @@ -281,22 +278,49 @@ void rotate_reclaimable_page(struct page *page) } } -static void update_page_reclaim_stat(struct lruvec *lruvec, - int file, int rotated) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + do { + unsigned long lrusize; + + /* Record cost event */ + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; - reclaim_stat->recent_scanned[file]++; - if (rotated) - reclaim_stat->recent_rotated[file]++; + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. + */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + } while ((lruvec = parent_lruvec(lruvec))); +} + +void lru_note_cost_page(struct page *page) +{ + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + page_is_file_lru(page), hpage_nr_pages(page)); } static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); @@ -304,8 +328,9 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, lru); trace_mm_lru_activate(page); - __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(lruvec, file, 1); + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, + nr_pages); } } @@ -426,37 +451,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -static void __lru_cache_add(struct page *page) -{ - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); - get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) - __pagevec_lru_add(pvec); - local_unlock(&lru_pvecs.lock); -} - -/** - * lru_cache_add_anon - add a page to the page lists - * @page: the page to add - */ -void lru_cache_add_anon(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} - -void lru_cache_add_file(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} -EXPORT_SYMBOL(lru_cache_add_file); - /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. 
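The new lru_note_cost() above charges reclaim cost to the anon or file LRU and, to keep a floating average, halves both counters once their sum exceeds a quarter of the LRU size, so recent cost outweighs old cost. A simplified userspace model of that decay rule (single lruvec only, no memcg parent walk; the sizes and events below are invented):

#include <stdio.h>

struct lru_cost {
	unsigned long anon_cost;
	unsigned long file_cost;
};

/* Record nr_pages of reclaim cost and decay old history, lru_note_cost()-style. */
static void note_cost(struct lru_cost *c, int file, unsigned long nr_pages,
		      unsigned long lru_size)
{
	if (file)
		c->file_cost += nr_pages;
	else
		c->anon_cost += nr_pages;

	/* Floating average: once the sum passes lru_size/4, halve both costs. */
	if (c->anon_cost + c->file_cost > lru_size / 4) {
		c->anon_cost /= 2;
		c->file_cost /= 2;
	}
}

int main(void)
{
	struct lru_cost c = { 0, 0 };
	unsigned long lru_size = 1000;		/* hypothetical LRU size in pages */

	note_cost(&c, 1, 200, lru_size);	/* file reclaim cost */
	note_cost(&c, 0, 100, lru_size);	/* anon cost pushes the sum past 250 */
	printf("anon=%lu file=%lu\n", c.anon_cost, c.file_cost);
	return 0;
}

This prints anon=50 file=100: as soon as the combined cost crosses lru_size/4 both sides are halved, so an old burst of file cost steadily loses weight against newer events.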
@@ -468,10 +462,19 @@ EXPORT_SYMBOL(lru_cache_add_file); */ void lru_cache_add(struct page *page) { + struct pagevec *pvec; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); - __lru_cache_add(page); + + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); + local_unlock(&lru_pvecs.lock); } +EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_active_or_unevictable @@ -527,8 +530,9 @@ void lru_cache_add_active_or_unevictable(struct page *page, static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { - int lru, file; + int lru; bool active; + int nr_pages = hpage_nr_pages(page); if (!PageLRU(page)) return; @@ -541,7 +545,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -562,28 +565,31 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * We moves tha page into tail of inactive. */ add_page_to_lru_list_tail(page, lruvec, lru); - __count_vm_event(PGROTATED); + __count_vm_events(PGROTATED, nr_pages); } - if (active) - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + if (active) { + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); + } } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec, lru); - __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); } } @@ -593,6 +599,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); @@ -606,9 +613,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); - count_memcg_page_event(page, PGLAZYFREE); - update_page_reclaim_stat(lruvec, 1, 0); + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, + nr_pages); } } @@ -932,8 +939,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - const int file = 0; - VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); @@ -959,9 +964,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, add_page_to_lru_list_tail(page_tail, lruvec, page_lru(page_tail)); } - - if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* 
CONFIG_TRANSPARENT_HUGEPAGE */ @@ -970,6 +972,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, { enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); + int nr_pages = hpage_nr_pages(page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1004,16 +1007,14 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_lru(page), - PageActive(page)); if (was_unevictable) - count_vm_event(UNEVICTABLE_PGRESCUED); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) - count_vm_event(UNEVICTABLE_PGCULLED); + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } add_page_to_lru_list(page, lruvec, lru); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 45affaef3bc6..7f34343c075a 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); @@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return; - mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; diff --git a/mm/swap_state.c b/mm/swap_state.c index 8238954ae781..9d20b00627af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,12 +360,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page = NULL, *new_page = NULL; struct swap_info_struct *si; - int err; + struct page *page; + *new_page_allocated = false; - do { + for (;;) { + int err; /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling @@ -373,12 +374,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ si = get_swap_device(entry); if (!si) - break; - found_page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + return NULL; + page = find_get_page(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (found_page) - break; + if (page) + return page; /* * Just skip read ahead for unused swap slot. @@ -389,54 +390,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - break; + return NULL; /* - * Get a new page to read into from swap. + * Get a new page to read into from swap. Allocate it now, + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will + * cause any racers to loop around until we add it to cache. */ - if (!new_page) { - new_page = alloc_page_vma(gfp_mask, vma, addr); - if (!new_page) - break; /* Out of memory */ - } + page = alloc_page_vma(gfp_mask, vma, addr); + if (!page) + return NULL; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { - /* - * We might race against get_swap_page() and stumble - * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet. - */ - cond_resched(); - continue; - } else if (err) /* swp entry is obsolete ? 
*/ + if (!err) break; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (likely(!err)) { - /* Initiate read into locked page */ - SetPageWorkingset(new_page); - lru_cache_add_anon(new_page); - *new_page_allocated = true; - return new_page; - } - __ClearPageLocked(new_page); + put_page(page); + if (err != -EEXIST) + return NULL; + /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. + * We might race against __delete_from_swap_cache(), and + * stumble across a swap_map entry whose SWAP_HAS_CACHE + * has not yet been cleared. Or race against another + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE + * in swap_map, but not yet added its page to swap cache. */ - put_swap_page(new_page, entry); - } while (err != -ENOMEM); + cond_resched(); + } + + /* + * The swap entry is ours to swap in. Prepare the new page. + */ + + __SetPageLocked(page); + __SetPageSwapBacked(page); + + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + put_swap_page(page, entry); + goto fail_unlock; + } + + if (mem_cgroup_charge(page, NULL, gfp_mask)) { + delete_from_swap_cache(page); + goto fail_unlock; + } + + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + + /* Caller will initiate read into locked page */ + SetPageWorkingset(page); + lru_cache_add(page); + *new_page_allocated = true; + return page; - if (new_page) - put_page(new_page); - return found_page; +fail_unlock: + unlock_page(page); + put_page(page); + return NULL; } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 63ac67208453..a3d191e205f2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1892,7 +1892,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { struct page *swapcache; - struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1902,15 +1901,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = -ENOMEM; - goto out_nolock; - } - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1922,10 +1914,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1936,7 +1926,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); -out_nolock: if (page != swapcache) { unlock_page(page); put_page(page); @@ -3799,11 +3788,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask) 
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; - if (!(gfp_mask & __GFP_IO) || !memcg) + int nid = page_to_nid(page); + + if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) @@ -3817,11 +3807,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], - avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], + avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), - true); + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); break; } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 512576e171ce..7f5194046b01 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; void *page_kaddr; @@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -138,7 +136,6 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg, false); out_release: put_page(page); goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index b2f5deb3603c..3792dd19788c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,12 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + /* Can active pages be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -161,7 +167,7 @@ struct scan_control { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 200. Higher means more swappy. 
*/ int vm_swappiness = 60; /* @@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - enum ttu_flags ttu_flags, - struct reclaim_stat *stat, - bool ignore_references) +static unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + enum ttu_flags ttu_flags, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - unsigned nr_reclaimed = 0; - unsigned pgactivate = 0; + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && PageSwapBacked(page)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } @@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: + stat->nr_pageout += hpage_nr_pages(page); + if (PageWriteback(page)) goto keep; if (PageDirty(page)) @@ -1438,7 +1450,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; @@ -1483,7 +1495,7 @@ keep: return nr_reclaimed; } -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list) { struct scan_control sc = { @@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - struct reclaim_stat dummy_stat; - unsigned long ret; + struct reclaim_stat stat; + unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); - return ret; + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. 
+ */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -stat.nr_lazyfree_fail); + return nr_reclaimed; } /* @@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, if (!nr_zone_taken[zid]) continue; - __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#endif + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } @@ -1859,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1899,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, { LIST_HEAD(page_list); unsigned long nr_scanned; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { @@ -1929,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1945,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); + move_pages_to_lru(lruvec, &page_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; - - move_pages_to_lru(lruvec, &page_list); - - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&pgdat->lru_lock); @@ -2001,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2015,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); @@ -2042,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { - nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -2053,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { + nr_rotated += hpage_nr_pages(page); list_add(&page->lru, &l_active); continue; } @@ -2067,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * Move pages back to the lru list. */ spin_lock_irq(&pgdat->lru_lock); - /* - * Count referenced pages from currently used mappings as rotated, - * even though only some of them are actually re-activated. This - * helps balance scan pressure between file and anonymous pages in - * get_scan_count. - */ - reclaim_stat->recent_rotated[file] += nr_rotated; nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2095,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long reclaim_pages(struct list_head *page_list) { int nid = NUMA_NO_NODE; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; @@ -2229,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); + unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file; unsigned long ap, fp; enum lru_list lru; @@ -2286,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - - /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = swappiness; - file_prio = 200 - anon_prio; - /* - * OK, so we have swap space and a fair amount of page cache - * pages. 
We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). * - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. * - * anon in [0], file in [1] + * With swappiness at 100, anon and file have equal IO cost. */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; - spin_lock_irq(&pgdat->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; - } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. - */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; - - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&pgdat->lru_lock); + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; - denominator = ap + fp + 1; + denominator = ap + fp; out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); @@ -2566,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator - * calls try_to_compact_zone() that it will have enough free pages to succeed. + * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, @@ -2697,6 +2680,14 @@ again: nr_scanned = sc->nr_scanned; /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. 
*/ @@ -3131,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3385,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3397,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. */ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3411,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3423,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3461,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3483,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3547,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3575,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. 
*/ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3593,7 +3585,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3623,7 +3615,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3723,7 +3715,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3738,7 +3730,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3756,22 +3748,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3788,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3801,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. 
*/ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3827,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. */ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3869,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3893,22 +3886,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3929,9 +3924,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3949,7 +3945,7 @@ kswapd_try_sleep: * needed. 
*/ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3961,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3974,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3984,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 03987490ea96..a7db29f7e5f7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1203,6 +1203,10 @@ const char * const vmstat_text[] = { "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", @@ -1592,6 +1596,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone_managed_pages(zone)); + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); @@ -1599,12 +1609,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_putc(m, ')'); - /* If unpopulated, no other information is useful */ - if (!populated_zone(zone)) { - seq_putc(m, '\n'); - return; - } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", zone_stat_name(i), zone_page_state(zone, i)); diff --git a/mm/workingset.c b/mm/workingset.c index 474186b76ced..d481ea452eeb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow) struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; + unsigned long workingset_size; struct pglist_data *pgdat; - unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; @@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->inactive_age); - active_file = lruvec_page_state(eviction_lruvec, 
NR_ACTIVE_FILE);
 /*
 * Calculate the refault distance
@@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow)
 /*
 * Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the page cache. Whether
+ * cache can compete with anon or not depends on having swap.
 */
- if (refault_distance > active_file)
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ }
+ if (refault_distance > workingset_size)
 goto out;
 SetPageActive(page);
@@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow)
 /* Page was active prior to eviction */
 if (workingset) {
 SetPageWorkingset(page);
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
 inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
 }
 out:
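
The reclaim_clean_pages_from_list() hunk in mm/vmscan.c above compensates the isolated-page counters when a lazyfree (MADV_FREE) page fails to be discarded and becomes swap-backed again. A minimal userspace sketch of that bookkeeping follows; struct node_counters and its fields are invented stand-ins for the per-node NR_ISOLATED_ANON/NR_ISOLATED_FILE vmstat items, and the numbers in main() are made up.

#include <stdio.h>

/* Invented stand-ins for the per-node NR_ISOLATED_* vmstat counters. */
struct node_counters {
        long nr_isolated_anon;
        long nr_isolated_file;
};

/*
 * Lazyfree pages are isolated from the file LRU, but when try_to_unmap()
 * finds them dirty they are marked swap-backed again and will be put back
 * on the anon LRU.  Move their isolated accounting from file to anon so
 * neither counter is mismatched when the pages are returned.
 */
static void account_clean_reclaim(struct node_counters *nc,
                                  unsigned int nr_reclaimed,
                                  unsigned int nr_lazyfree_fail)
{
        nc->nr_isolated_file -= nr_reclaimed;
        nc->nr_isolated_anon += nr_lazyfree_fail;
        nc->nr_isolated_file -= nr_lazyfree_fail;
}

int main(void)
{
        struct node_counters nc = { .nr_isolated_anon = 0, .nr_isolated_file = 32 };

        account_clean_reclaim(&nc, 20, 4);
        printf("isolated anon %ld, isolated file %ld\n",
               nc.nr_isolated_anon, nc.nr_isolated_file);
        return 0;
}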
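
The get_scan_count() rework above derives the anon/file scan split from the relative reclaim cost of each list (sc->anon_cost, sc->file_cost) instead of the old recent_scanned/recent_rotated averages. Below is a small userspace sketch of just that arithmetic, assuming nothing beyond the formula visible in the hunk; the cost values and swappiness in main() are made-up inputs.

#include <stdio.h>

/*
 * Mirror of the fraction computation in the get_scan_count() hunk above:
 * pressure on each LRU is inversely proportional to its reclaim cost,
 * weighted by swappiness (0..200).  Padding each cost with the combined
 * total keeps at least roughly a third of the pressure on either list,
 * before swappiness is applied.
 */
static void scan_fractions(unsigned long anon_cost, unsigned long file_cost,
                           unsigned int swappiness,
                           unsigned long long *ap, unsigned long long *fp)
{
        unsigned long long total_cost, anon, file;

        total_cost = (unsigned long long)anon_cost + file_cost;
        anon = total_cost + anon_cost;
        file = total_cost + file_cost;
        total_cost = anon + file;

        *ap = swappiness * (total_cost + 1);
        *ap /= anon + 1;

        *fp = (200 - swappiness) * (total_cost + 1);
        *fp /= file + 1;
}

int main(void)
{
        unsigned long long ap, fp;

        /* pretend file reclaim has lately cost three times as much as anon */
        scan_fractions(1000, 3000, 60, &ap, &fp);
        printf("scan anon %llu/%llu, file %llu/%llu of the requested pages\n",
               ap, ap + fp, fp, ap + fp);
        return 0;
}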
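
Throughout the kswapd hunks, kswapd_classzone_idx() becomes kswapd_highest_zoneidx(), with MAX_NR_ZONES doubling as a "no pending request" sentinel. A tiny model of that selection is sketched below; the MAX_NR_ZONES value is illustrative only, the real count depends on the architecture and zone configuration.

#include <stdio.h>

#define MAX_NR_ZONES 5  /* illustrative; the real value is an arch/Kconfig detail */

/*
 * If the waker left MAX_NR_ZONES in place, no new request arrived and
 * kswapd keeps working on the zone index of its previous reclaim cycle.
 */
static int kswapd_highest_zoneidx(int requested, int prev_highest_zoneidx)
{
        return requested == MAX_NR_ZONES ? prev_highest_zoneidx : requested;
}

int main(void)
{
        printf("%d\n", kswapd_highest_zoneidx(MAX_NR_ZONES, 2)); /* prints 2 */
        printf("%d\n", kswapd_highest_zoneidx(1, 2));            /* prints 1 */
        return 0;
}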
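
Finally, the mm/workingset.c hunk widens the refault-distance comparison: without swap a refaulting cache page only competes with the active file list, while with swap available it also competes with the inactive and active anon lists. A hedged sketch of that decision in plain C; struct lruvec_sizes, refault_should_activate() and the counts in main() are all invented for illustration and are not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the lruvec page-state counters read in workingset_refault(). */
struct lruvec_sizes {
        unsigned long nr_active_file;
        unsigned long nr_inactive_anon;
        unsigned long nr_active_anon;
};

/*
 * Activate a refaulting page only if it could plausibly have stayed
 * resident: compare the refault distance against the active file list,
 * and also against the anon lists when the memcg has swap to push
 * anonymous pages out with.
 */
static bool refault_should_activate(unsigned long refault_distance,
                                    const struct lruvec_sizes *ls,
                                    bool memcg_has_swap)
{
        unsigned long workingset_size = ls->nr_active_file;

        if (memcg_has_swap)
                workingset_size += ls->nr_inactive_anon + ls->nr_active_anon;

        return refault_distance <= workingset_size;
}

int main(void)
{
        struct lruvec_sizes ls = { 4096, 2048, 2048 };

        printf("no swap: %s\n",
               refault_should_activate(5000, &ls, false) ? "activate" : "keep inactive");
        printf("swap:    %s\n",
               refault_should_activate(5000, &ls, true) ? "activate" : "keep inactive");
        return 0;
}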