diff options
Diffstat (limited to 'mm')
57 files changed, 7798 insertions, 4697 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3e2daef3c946..c0837845c17c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -440,6 +440,14 @@ choice endchoice # +# We don't deposit page tables on file THP mapping, +# but Power makes use of them to address MMU quirk. +# +config TRANSPARENT_HUGE_PAGECACHE + def_bool y + depends on TRANSPARENT_HUGEPAGE && !PPC + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM @@ -673,7 +681,7 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" if EXPERT + bool "Device memory (pmem, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP diff --git a/mm/Makefile b/mm/Makefile index 78c6f7dedb83..fc059666c760 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -74,7 +74,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ed173b8ae8f2..efe237742074 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout) EXPORT_SYMBOL(congestion_wait); /** - * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes - * @zone: A zone to check if it is heavily congested + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes + * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * In the event of a congested backing_dev (any backing_dev) and the given - * @zone has experienced recent congestion, this waits for up to @timeout + * @pgdat has experienced recent congestion, this waits for up to @timeout * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield + * In the absence of pgdat congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ -long wait_iff_congested(struct zone *zone, int sync, long timeout) +long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) /* * If there is no congestion, or heavy congestion is not being - * encountered in the current zone, yield if necessary instead + * encountered in the current pgdat, yield if necessary instead * of sleeping on the congestion queue */ if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(ZONE_CONGESTED, &zone->flags)) { + !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { cond_resched(); + /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 57b3e9bd6bc5..da91df50ba31 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) */ if (trylock_page(page)) { #ifdef CONFIG_BALLOON_COMPACTION - if (!PagePrivate(page)) { + if (PageIsolated(page)) { /* raced with isolation */ unlock_page(page); continue; @@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -static inline void __isolate_balloon_page(struct page *page) +bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - ClearPagePrivate(page); list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + return true; } -static inline void __putback_balloon_page(struct page *page) +void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - SetPagePrivate(page); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } -/* __isolate_lru_page() counterpart for a ballooned page */ -bool balloon_page_isolate(struct page *page) -{ - /* - * Avoid burning cycles with pages that are yet under __free_pages(), - * or just got freed under us. - * - * In case we 'win' a race for a balloon page being freed under us and - * raise its refcount preventing __free_pages() from doing its job - * the put_page() at the end of this block will take care of - * release this page, thus avoiding a nasty leakage. - */ - if (likely(get_page_unless_zero(page))) { - /* - * As balloon pages are not isolated from LRU lists, concurrent - * compaction threads can race against page migration functions - * as well as race against the balloon driver releasing a page. - * - * In order to avoid having an already isolated balloon page - * being (wrongly) re-isolated while it is under migration, - * or to avoid attempting to isolate pages being released by - * the balloon driver, lets be sure we have the page lock - * before proceeding with the balloon page isolation steps. - */ - if (likely(trylock_page(page))) { - /* - * A ballooned page, by default, has PagePrivate set. - * Prevent concurrent compaction threads from isolating - * an already isolated balloon page by clearing it. - */ - if (balloon_page_movable(page)) { - __isolate_balloon_page(page); - unlock_page(page); - return true; - } - unlock_page(page); - } - put_page(page); - } - return false; -} - -/* putback_lru_page() counterpart for a ballooned page */ -void balloon_page_putback(struct page *page) -{ - /* - * 'lock_page()' stabilizes the page and prevents races against - * concurrent isolation threads attempting to re-isolate it. - */ - lock_page(page); - - if (__is_movable_balloon_page(page)) { - __putback_balloon_page(page); - /* drop the extra ref count taken for page isolation */ - put_page(page); - } else { - WARN_ON(1); - dump_page(page, "not movable balloon page"); - } - unlock_page(page); -} /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) +int balloon_page_migrate(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); - int rc = -EAGAIN; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - if (WARN_ON(!__is_movable_balloon_page(page))) { - dump_page(page, "not movable balloon page"); - return rc; - } + return balloon->migratepage(balloon, newpage, page, mode); +} - if (balloon && balloon->migratepage) - rc = balloon->migratepage(balloon, newpage, page, mode); +const struct address_space_operations balloon_aops = { + .migratepage = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; +EXPORT_SYMBOL_GPL(balloon_aops); - return rc; -} #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index 79bfe0e06907..9affb2908304 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -15,11 +15,11 @@ #include <linux/backing-dev.h> #include <linux/sysctl.h> #include <linux/sysfs.h> -#include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/page_owner.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -65,13 +65,27 @@ static unsigned long release_freepages(struct list_head *freelist) static void map_pages(struct list_head *list) { - struct page *page; + unsigned int i, order, nr_pages; + struct page *page, *next; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + + order = page_private(page); + nr_pages = 1 << order; - list_for_each_entry(page, list, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); - kasan_alloc_pages(page, 0); + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } } + + list_splice(&tmp_list, list); } static inline bool migrate_async_suitable(int migratetype) @@ -81,6 +95,44 @@ static inline bool migrate_async_suitable(int migratetype) #ifdef CONFIG_COMPACTION +int PageMovable(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (!__PageMovable(page)) + return 0; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) + return 1; + + return 0; +} +EXPORT_SYMBOL(PageMovable); + +void __SetPageMovable(struct page *page, struct address_space *mapping) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__SetPageMovable); + +void __ClearPageMovable(struct page *page) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + /* + * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE + * flag so that VM can catch up released page by driver after isolation. + * With it, VM migration doesn't try to put it back. + */ + page->mapping = (void *)((unsigned long)page->mapping & + PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__ClearPageMovable); + /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -279,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, { if (cc->mode == MIGRATE_ASYNC) { if (!spin_trylock_irqsave(lock, *flags)) { - cc->contended = COMPACT_CONTENDED_LOCK; + cc->contended = true; return false; } } else { @@ -313,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock, } if (fatal_signal_pending(current)) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } cond_resched(); @@ -342,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc) /* async compaction aborts if contended */ if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } @@ -368,12 +420,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; + unsigned int order; cursor = pfn_to_page(blockpfn); /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { - int isolated, i; + int isolated; struct page *page = cursor; /* @@ -439,17 +492,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, goto isolate_fail; } - /* Found a free page, break it into order-0 pages */ - isolated = split_free_page(page); + /* Found a free page, will break it into order-0 pages */ + order = page_order(page); + isolated = __isolate_free_page(page, order); if (!isolated) break; + set_page_private(page, order); total_isolated += isolated; cc->nr_freepages += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; - } + list_add_tail(&page->lru, freelist); + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; break; @@ -568,7 +621,7 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(&freelist); if (pfn < end_pfn) { @@ -593,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -602,12 +655,12 @@ static bool too_many_isolated(struct zone *zone) { unsigned long active, inactive, isolated; - inactive = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_ANON); - active = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_FILE) + - zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); + active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + + node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -670,7 +723,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { - bool is_lru; if (skip_on_failure && low_pfn >= next_skip_pfn) { /* @@ -700,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * if contended. */ if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&zone->lru_lock, flags, + && compact_unlock_should_abort(zone_lru_lock(zone), flags, &locked, cc)) break; @@ -733,21 +785,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Check may be lockless but that's ok as we recheck later. - * It's possible to migrate LRU pages and balloon pages - * Skip any other type of page - */ - is_lru = PageLRU(page); - if (!is_lru) { - if (unlikely(balloon_page_movable(page))) { - if (balloon_page_isolate(page)) { - /* Successfully isolated */ - goto isolate_success; - } - } - } - - /* * Regardless of being on LRU, compound pages such as THP and * hugetlbfs are not to be compacted. We can potentially save * a lot of iterations if we skip them at once. The check is @@ -763,8 +800,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } - if (!is_lru) + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU and non-lru movable pages. + * Skip any other type of page + */ + if (!PageLRU(page)) { + /* + * __PageMovable can return false positive so we need + * to verify it under page_lock. + */ + if (unlikely(__PageMovable(page)) && + !PageIsolated(page)) { + if (locked) { + spin_unlock_irqrestore(zone_lru_lock(zone), + flags); + locked = false; + } + + if (isolate_movable_page(page, isolate_mode)) + goto isolate_success; + } + goto isolate_fail; + } /* * Migration will fail if an anonymous page is pinned in memory, @@ -777,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_trylock_irqsave(&zone->lru_lock, + locked = compact_trylock_irqsave(zone_lru_lock(zone), &flags, cc); if (!locked) break; @@ -797,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) @@ -840,7 +899,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } acct_isolated(zone, cc); @@ -868,7 +927,7 @@ isolate_fail: low_pfn = end_pfn; if (locked) - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); /* * Update the pageblock-skip information and cached scanner pfn, @@ -1009,8 +1068,6 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; - /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need @@ -1034,40 +1091,34 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); - /* If isolation failed early, do not continue needlessly */ - if (!isolated && isolate_start_pfn < block_end_pfn && - cc->nr_migratepages > cc->nr_freepages) - break; + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, + freelist, false); /* - * If we isolated enough freepages, or aborted due to async - * compaction being contended, terminate the loop. - * Remember where the free scanner should restart next time, - * which is where isolate_freepages_block() left off. - * But if it scanned the whole pageblock, isolate_start_pfn - * now points at block_end_pfn, which is the start of the next - * pageblock. - * In that case we will however want to restart at the start - * of the previous pageblock. + * If we isolated enough freepages, or aborted due to lock + * contention, terminate. */ if ((cc->nr_freepages >= cc->nr_migratepages) || cc->contended) { - if (isolate_start_pfn >= block_end_pfn) + if (isolate_start_pfn >= block_end_pfn) { + /* + * Restart at previous pageblock if more + * freepages can be isolated next time. + */ isolate_start_pfn = block_start_pfn - pageblock_nr_pages; + } break; - } else { + } else if (isolate_start_pfn < block_end_pfn) { /* - * isolate_freepages_block() should not terminate - * prematurely unless contended, or isolated enough + * If isolation failed early, do not continue + * needlessly. */ - VM_BUG_ON(isolate_start_pfn < block_end_pfn); + break; } } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(freelist); /* @@ -1149,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | - (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); + (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* * Start at where we last stopped, or beginning of the zone as @@ -1568,14 +1619,11 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); - if (ret == COMPACT_CONTENDED) - ret = COMPACT_PARTIAL; - return ret; } static enum compact_result compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, int *contended, + gfp_t gfp_mask, enum compact_priority prio, unsigned int alloc_flags, int classzone_idx) { enum compact_result ret; @@ -1585,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .order = order, .gfp_mask = gfp_mask, .zone = zone, - .mode = mode, + .mode = (prio == COMPACT_PRIO_ASYNC) ? + MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, @@ -1598,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *contended = cc.contended; return ret; } @@ -1611,50 +1659,38 @@ int sysctl_extfrag_threshold = 500; * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration - * @contended: Return value that determines if compaction was aborted due to - * need_resched() or lock contention * * This is the main entry point for direct page compaction. */ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + enum compact_priority prio) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ - - *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ - if (!order || !may_enter_fs || !may_perform_io) + if (!may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; - trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { enum compact_result status; - int zone_contended; if (compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } - status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, - ac_classzone_idx(ac)); + status = compact_zone_order(zone, order, gfp_mask, prio, + alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* - * It takes at least one zone that wasn't lock contended - * to clear all_zones_contended. - */ - all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), @@ -1666,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * succeeds in this zone. */ compaction_defer_reset(zone, order, false); - /* - * It is possible that async compaction aborted due to - * need_resched() and the watermarks were ok thanks to - * somebody else freeing memory. The allocation can - * however still fail so we better signal the - * need_resched() contention anyway (this will not - * prevent the allocation attempt). - */ - if (zone_contended == COMPACT_CONTENDED_SCHED) - *contended = COMPACT_CONTENDED_SCHED; - goto break_loop; + break; } - if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || - status == COMPACT_PARTIAL_SKIPPED)) { + if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up * succeeding after all, it will be reset. */ defer_compaction(zone, order); - } /* * We might have stopped compacting due to need_resched() in * async compaction, or due to a fatal signal detected. In that - * case do not try further zones and signal need_resched() - * contention. - */ - if ((zone_contended == COMPACT_CONTENDED_SCHED) - || fatal_signal_pending(current)) { - *contended = COMPACT_CONTENDED_SCHED; - goto break_loop; - } - - continue; -break_loop: - /* - * We might not have tried all the zones, so be conservative - * and assume they are not all lock contended. + * case do not try further zones */ - all_zones_contended = 0; - break; + if ((prio == COMPACT_PRIO_ASYNC && need_resched()) + || fatal_signal_pending(current)) + break; } - /* - * If at least one zone wasn't deferred or skipped, we report if all - * zones that were tried were lock contended. - */ - if (rc > COMPACT_INACTIVE && all_zones_contended) - *contended = COMPACT_CONTENDED_LOCK; - return rc; } diff --git a/mm/filemap.c b/mm/filemap.c index 6d92935dcf71..3083ded98b15 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -95,8 +95,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) - * ->zone.lru_lock (follow_page->mark_page_accessed) - * ->zone.lru_lock (check_pte_range->isolate_lru_page) + * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) + * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) @@ -114,14 +114,14 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; + int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); - VM_BUG_ON(!PageLocked(page)); - - node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, - shadow); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(nr != 1 && shadow, page); if (shadow) { - mapping->nrexceptional++; + mapping->nrexceptional += nr; /* * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing @@ -130,31 +130,38 @@ static void page_cache_tree_delete(struct address_space *mapping, */ smp_wmb(); } - mapping->nrpages--; + mapping->nrpages -= nr; - if (!node) - return; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) + for (i = 0; i < nr; i++) { + node = radix_tree_replace_clear_tags(&mapping->page_tree, + page->index + i, shadow); + if (!node) { + VM_BUG_ON_PAGE(nr != 1, page); return; + } - /* - * Track node that only contains shadow entries. DAX mappings contain - * no shadow entries and may contain other exceptional entries so skip - * those. - * - * Avoid acquiring the list_lru lock if already tracked. The - * list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, &node->private_list); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + continue; + + /* + * Track node that only contains shadow entries. DAX mappings + * contain no shadow entries and may contain other exceptional + * entries so skip those. + * + * Avoid acquiring the list_lru lock if already tracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!dax_mapping(mapping) && !workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, + &node->private_list); + } } } @@ -166,6 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping, void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; + int nr = hpage_nr_pages(page); trace_mm_filemap_delete_from_page_cache(page); /* @@ -178,6 +186,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_mapped(page), page); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { int mapcount; @@ -209,9 +218,14 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(page)) - __dec_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __dec_zone_page_state(page, NR_SHMEM); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageSwapBacked(page)) { + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + if (PageTransHuge(page)) + __dec_node_page_state(page, NR_SHMEM_THPS); + } else { + VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); + } /* * At this point page must be either written or cleaned by truncate. @@ -235,9 +249,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ void delete_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); @@ -250,7 +263,13 @@ void delete_from_page_cache(struct page *page) if (freepage) freepage(page); - put_page(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } } EXPORT_SYMBOL(delete_from_page_cache); @@ -550,9 +569,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) * hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(new)) - __inc_zone_page_state(new, NR_FILE_PAGES); + __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) - __inc_zone_page_state(new, NR_SHMEM); + __inc_node_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); @@ -659,7 +678,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) - __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_node_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); @@ -1054,7 +1073,7 @@ EXPORT_SYMBOL(page_cache_prev_hole); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; - struct page *page; + struct page *head, *page; rcu_read_lock(); repeat: @@ -1074,16 +1093,24 @@ repeat: */ goto out; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* * Has the page moved? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - put_page(page); + put_page(head); goto repeat; } } @@ -1119,12 +1146,12 @@ repeat: if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(page_mapping(page) != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); } return page; } @@ -1256,7 +1283,7 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1273,12 +1300,20 @@ repeat: */ goto export; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } export: @@ -1319,7 +1354,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1338,12 +1373,19 @@ repeat: continue; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1380,7 +1422,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); /* The hole, there no reason to continue */ @@ -1400,12 +1442,19 @@ repeat: break; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1414,7 +1463,7 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page->index != iter.index) { + if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { put_page(page); break; } @@ -1452,7 +1501,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1477,12 +1526,19 @@ repeat: continue; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -1526,7 +1582,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { - struct page *page; + struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1544,12 +1600,20 @@ repeat: */ goto export; } - if (!page_cache_get_speculative(page)) + + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } export: @@ -2129,21 +2193,21 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = vma->vm_file; + struct file *file = fe->vma->vm_file; struct address_space *mapping = file->f_mapping; + pgoff_t last_pgoff = start_pgoff; loff_t size; - struct page *page; - unsigned long address = (unsigned long) vmf->virtual_address; - unsigned long addr; - pte_t *pte; + struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { - if (iter.index > vmf->max_pgoff) + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start_pgoff) { + if (iter.index > end_pgoff) break; repeat: page = radix_tree_deref_slot(slot); @@ -2157,12 +2221,19 @@ repeat: goto next; } - if (!page_cache_get_speculative(page)) + head = compound_head(page); + if (!page_cache_get_speculative(head)) goto repeat; + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } + /* Has the page moved? */ if (unlikely(page != *slot)) { - put_page(page); + put_page(head); goto repeat; } @@ -2180,14 +2251,15 @@ repeat: if (page->index >= size >> PAGE_SHIFT) goto unlock; - pte = vmf->pte + page->index - vmf->pgoff; - if (!pte_none(*pte)) - goto unlock; - if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + + fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (fe->pte) + fe->pte += iter.index - last_pgoff; + last_pgoff = iter.index; + if (alloc_set_pte(fe, NULL, page)) + goto unlock; unlock_page(page); goto next; unlock: @@ -2195,7 +2267,10 @@ unlock: skip: put_page(page); next: - if (iter.index == vmf->max_pgoff) + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*fe->pmd)) + break; + if (iter.index == end_pgoff) break; } rcu_read_unlock(); diff --git a/mm/frontswap.c b/mm/frontswap.c index 27a9924caf61..fec8b5044040 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -20,6 +20,8 @@ #include <linux/frontswap.h> #include <linux/swapfile.h> +DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); + /* * frontswap_ops are added by frontswap_register_ops, and provide the * frontswap "backend" implementation functions. Multiple implementations @@ -139,6 +141,8 @@ void frontswap_register_ops(struct frontswap_ops *ops) ops->next = frontswap_ops; } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + static_branch_inc(&frontswap_enabled_key); + spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (si->frontswap_map) @@ -189,7 +193,7 @@ void __frontswap_init(unsigned type, unsigned long *map) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - BUG_ON(sis == NULL); + VM_BUG_ON(sis == NULL); /* * p->frontswap is a bitmap that we MUST have to figure out which page @@ -248,15 +252,9 @@ int __frontswap_store(struct page *page) pgoff_t offset = swp_offset(entry); struct frontswap_ops *ops; - /* - * Return if no backend registed. - * Don't need to inc frontswap_failed_stores here. - */ - if (!frontswap_ops) - return -1; - - BUG_ON(!PageLocked(page)); - BUG_ON(sis == NULL); + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); /* * If a dup, we must remove the old page first; we can't leave the @@ -303,11 +301,10 @@ int __frontswap_load(struct page *page) pgoff_t offset = swp_offset(entry); struct frontswap_ops *ops; - if (!frontswap_ops) - return -1; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); - BUG_ON(!PageLocked(page)); - BUG_ON(sis == NULL); if (!__frontswap_test(sis, offset)) return -1; @@ -337,10 +334,9 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - if (!frontswap_ops) - return; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); - BUG_ON(sis == NULL); if (!__frontswap_test(sis, offset)) return; @@ -360,10 +356,9 @@ void __frontswap_invalidate_area(unsigned type) struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; - if (!frontswap_ops) - return; + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); - BUG_ON(sis == NULL); if (sis->frontswap_map == NULL) return; @@ -279,6 +279,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, spin_unlock(ptl); ret = 0; split_huge_pmd(vma, pmd, address); + if (pmd_trans_unstable(pmd)) + ret = -EBUSY; } else { get_page(page); spin_unlock(ptl); @@ -286,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, ret = split_huge_page(page); unlock_page(page); put_page(page); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); } return ret ? ERR_PTR(ret) : @@ -350,7 +354,6 @@ unmap: static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { - struct mm_struct *mm = vma->vm_mm; unsigned int fault_flags = 0; int ret; @@ -375,7 +378,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_TRIED; } - ret = handle_mm_fault(mm, vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -690,7 +693,7 @@ retry: if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; - ret = handle_mm_fault(mm, vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ed58530f695..2373f0a7d340 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,7 +18,6 @@ #include <linux/mm_inline.h> #include <linux/swapops.h> #include <linux/dax.h> -#include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/pfn_t.h> @@ -30,39 +29,12 @@ #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" -enum scan_result { - SCAN_FAIL, - SCAN_SUCCEED, - SCAN_PMD_NULL, - SCAN_EXCEED_NONE_PTE, - SCAN_PTE_NON_PRESENT, - SCAN_PAGE_RO, - SCAN_NO_REFERENCED_PAGE, - SCAN_PAGE_NULL, - SCAN_SCAN_ABORT, - SCAN_PAGE_COUNT, - SCAN_PAGE_LRU, - SCAN_PAGE_LOCK, - SCAN_PAGE_ANON, - SCAN_PAGE_COMPOUND, - SCAN_ANY_PROCESS, - SCAN_VMA_NULL, - SCAN_VMA_CHECK, - SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, - SCAN_DEL_PAGE_LRU, - SCAN_ALLOC_HUGE_PAGE_FAIL, - SCAN_CGROUP_CHARGE_FAIL -}; - -#define CREATE_TRACE_POINTS -#include <trace/events/huge_memory.h> - /* * By default transparent hugepage support is disabled in order that avoid * to risk increase the memory footprint of applications without a guaranteed @@ -82,127 +54,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); -/* default scan 8*512 pte (or vmas) every 30 second */ -static unsigned int khugepaged_pages_to_scan __read_mostly; -static unsigned int khugepaged_pages_collapsed; -static unsigned int khugepaged_full_scans; -static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; -/* during fragmentation poll the hugepage allocator once every minute */ -static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; -static unsigned long khugepaged_sleep_expire; -static struct task_struct *khugepaged_thread __read_mostly; -static DEFINE_MUTEX(khugepaged_mutex); -static DEFINE_SPINLOCK(khugepaged_mm_lock); -static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); -/* - * default collapse hugepages if there is at least one pte mapped like - * it would have happened if the vma was large enough during page - * fault. - */ -static unsigned int khugepaged_max_ptes_none __read_mostly; - -static int khugepaged(void *none); -static int khugepaged_slab_init(void); -static void khugepaged_slab_exit(void); - -#define MM_SLOTS_HASH_BITS 10 -static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); - -static struct kmem_cache *mm_slot_cache __read_mostly; - -/** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for - */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; -}; - -/** - * struct khugepaged_scan - cursor for scanning - * @mm_head: the head of the mm list to scan - * @mm_slot: the current mm_slot we are scanning - * @address: the next address inside that to be scanned - * - * There is only the one khugepaged_scan instance of this cursor structure. - */ -struct khugepaged_scan { - struct list_head mm_head; - struct mm_slot *mm_slot; - unsigned long address; -}; -static struct khugepaged_scan khugepaged_scan = { - .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), -}; - static struct shrinker deferred_split_shrinker; -static void set_recommended_min_free_kbytes(void) -{ - struct zone *zone; - int nr_zones = 0; - unsigned long recommended_min; - - for_each_populated_zone(zone) - nr_zones++; - - /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ - recommended_min = pageblock_nr_pages * nr_zones * 2; - - /* - * Make sure that on average at least two pageblocks are almost free - * of another type, one for a migratetype to fall back to and a - * second to avoid subsequent fallbacks of other types There are 3 - * MIGRATE_TYPES we care about. - */ - recommended_min += pageblock_nr_pages * nr_zones * - MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; - - /* don't ever allow to reserve more than 5% of the lowmem */ - recommended_min = min(recommended_min, - (unsigned long) nr_free_buffer_pages() / 20); - recommended_min <<= (PAGE_SHIFT-10); - - if (recommended_min > min_free_kbytes) { - if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", - min_free_kbytes, recommended_min); - - min_free_kbytes = recommended_min; - } - setup_per_zone_wmarks(); -} - -static int start_stop_khugepaged(void) -{ - int err = 0; - if (khugepaged_enabled()) { - if (!khugepaged_thread) - khugepaged_thread = kthread_run(khugepaged, NULL, - "khugepaged"); - if (IS_ERR(khugepaged_thread)) { - pr_err("khugepaged: kthread_run(khugepaged) failed\n"); - err = PTR_ERR(khugepaged_thread); - khugepaged_thread = NULL; - goto fail; - } - - if (!list_empty(&khugepaged_scan.mm_head)) - wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); - } else if (khugepaged_thread) { - kthread_stop(khugepaged_thread); - khugepaged_thread = NULL; - } -fail: - return err; -} - static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; @@ -328,12 +181,7 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err; - - mutex_lock(&khugepaged_mutex); - err = start_stop_khugepaged(); - mutex_unlock(&khugepaged_mutex); - + int err = start_stop_khugepaged(); if (err) ret = err; } @@ -343,7 +191,7 @@ static ssize_t enabled_store(struct kobject *kobj, static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); -static ssize_t single_flag_show(struct kobject *kobj, +ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { @@ -351,7 +199,7 @@ static ssize_t single_flag_show(struct kobject *kobj, !!test_bit(flag, &transparent_hugepage_flags)); } -static ssize_t single_flag_store(struct kobject *kobj, +ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) @@ -406,13 +254,13 @@ static struct kobj_attribute defrag_attr = static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return single_flag_show(kobj, attr, buf, + return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return single_flag_store(kobj, attr, buf, count, + return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static struct kobj_attribute use_zero_page_attr = @@ -421,14 +269,14 @@ static struct kobj_attribute use_zero_page_attr = static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return single_flag_show(kobj, attr, buf, + return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); } static ssize_t debug_cow_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return single_flag_store(kobj, attr, buf, count, + return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); } static struct kobj_attribute debug_cow_attr = @@ -439,6 +287,9 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, +#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) + &shmem_enabled_attr.attr, +#endif #ifdef CONFIG_DEBUG_VM &debug_cow_attr.attr, #endif @@ -449,171 +300,6 @@ static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; -static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); -} - -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) - return -EINVAL; - - khugepaged_scan_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; -} -static struct kobj_attribute scan_sleep_millisecs_attr = - __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, - scan_sleep_millisecs_store); - -static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); -} - -static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; -} -static struct kobj_attribute alloc_sleep_millisecs_attr = - __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, - alloc_sleep_millisecs_store); - -static ssize_t pages_to_scan_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_pages_to_scan); -} -static ssize_t pages_to_scan_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long pages; - - err = kstrtoul(buf, 10, &pages); - if (err || !pages || pages > UINT_MAX) - return -EINVAL; - - khugepaged_pages_to_scan = pages; - - return count; -} -static struct kobj_attribute pages_to_scan_attr = - __ATTR(pages_to_scan, 0644, pages_to_scan_show, - pages_to_scan_store); - -static ssize_t pages_collapsed_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_pages_collapsed); -} -static struct kobj_attribute pages_collapsed_attr = - __ATTR_RO(pages_collapsed); - -static ssize_t full_scans_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_full_scans); -} -static struct kobj_attribute full_scans_attr = - __ATTR_RO(full_scans); - -static ssize_t khugepaged_defrag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return single_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); -} -static ssize_t khugepaged_defrag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - return single_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); -} -static struct kobj_attribute khugepaged_defrag_attr = - __ATTR(defrag, 0644, khugepaged_defrag_show, - khugepaged_defrag_store); - -/* - * max_ptes_none controls if khugepaged should collapse hugepages over - * any unmapped ptes in turn potentially increasing the memory - * footprint of the vmas. When max_ptes_none is 0 khugepaged will not - * reduce the available free memory in the system as it - * runs. Increasing max_ptes_none will instead potentially reduce the - * free memory in the system during the khugepaged scan. - */ -static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", khugepaged_max_ptes_none); -} -static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long max_ptes_none; - - err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR-1) - return -EINVAL; - - khugepaged_max_ptes_none = max_ptes_none; - - return count; -} -static struct kobj_attribute khugepaged_max_ptes_none_attr = - __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, - khugepaged_max_ptes_none_store); - -static struct attribute *khugepaged_attr[] = { - &khugepaged_defrag_attr.attr, - &khugepaged_max_ptes_none_attr.attr, - &pages_to_scan_attr.attr, - &pages_collapsed_attr.attr, - &full_scans_attr.attr, - &scan_sleep_millisecs_attr.attr, - &alloc_sleep_millisecs_attr.attr, - NULL, -}; - -static struct attribute_group khugepaged_attr_group = { - .attrs = khugepaged_attr, - .name = "khugepaged", -}; - static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; @@ -672,8 +358,6 @@ static int __init hugepage_init(void) return -EINVAL; } - khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; - khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; /* * hugepages can't be allocated by the buddy allocator */ @@ -688,7 +372,7 @@ static int __init hugepage_init(void) if (err) goto err_sysfs; - err = khugepaged_slab_init(); + err = khugepaged_init(); if (err) goto err_slab; @@ -719,7 +403,7 @@ err_khugepaged: err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: - khugepaged_slab_exit(); + khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: @@ -765,11 +449,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) -{ - return pmd_mkhuge(mk_pmd(page, prot)); -} - static inline struct list_head *page_deferred_list(struct page *page) { /* @@ -790,26 +469,23 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - struct page *page, gfp_t gfp, - unsigned int flags) +static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, + gfp_t gfp) { + struct vm_area_struct *vma = fe->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - spinlock_t *ptl; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg, true); put_page(page); @@ -824,12 +500,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_none(*pmd))) { - spin_unlock(ptl); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); } else { pmd_t entry; @@ -837,12 +513,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(ptl); + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); - pte_free(mm, pgtable); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + pte_free(vma->vm_mm, pgtable); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -852,11 +527,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, haddr, pmd, entry); - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&mm->nr_ptes); - spin_unlock(ptl); + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&vma->vm_mm->nr_ptes); + spin_unlock(fe->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -864,29 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, } /* - * If THP is set to always then directly reclaim/compact as necessary - * If set to defer then do no reclaim and defer to khugepaged + * If THP defrag is set to always then directly reclaim/compact as necessary + * If set to defer then do only background reclaim/compact and defer to khugepaged * If set to madvise and the VMA is flagged then directly reclaim/compact + * When direct reclaim/compact is allowed, don't retry except for flagged VMA's */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - gfp_t reclaim_flags = 0; + bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && - (vma->vm_flags & VM_HUGEPAGE)) - reclaim_flags = __GFP_DIRECT_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - reclaim_flags = __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - reclaim_flags = __GFP_DIRECT_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags) && vma_madvised) + return GFP_TRANSHUGE; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - return GFP_TRANSHUGE | reclaim_flags; -} - -/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ -static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) -{ - return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); + return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ @@ -906,13 +578,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - unsigned int flags) +int do_huge_pmd_anonymous_page(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; gfp_t gfp; struct page *page; - unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -920,42 +591,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && + if (!(fe->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { - spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; int ret; - pgtable = pte_alloc_one(mm, haddr); + pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - ptl = pmd_lock(mm, pmd); + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); ret = 0; set = false; - if (pmd_none(*pmd)) { + if (pmd_none(*fe->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(ptl); - ret = handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + spin_unlock(fe->ptl); + ret = handle_userfault(fe, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { - set_huge_zero_page(pgtable, mm, vma, - haddr, pmd, - zero_page); - spin_unlock(ptl); + set_huge_zero_page(pgtable, vma->vm_mm, vma, + haddr, fe->pmd, zero_page); + spin_unlock(fe->ptl); set = true; } } else - spin_unlock(ptl); + spin_unlock(fe->ptl); if (!set) { - pte_free(mm, pgtable); + pte_free(vma->vm_mm, pgtable); put_huge_zero_page(); } return ret; @@ -967,8 +636,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, - flags); + return __do_huge_pmd_anonymous_page(fe, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1080,14 +748,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *src_page; pmd_t pmd; pgtable_t pgtable = NULL; - int ret; + int ret = -ENOMEM; - if (!vma_is_dax(vma)) { - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; - } + /* Skip if can be re-fill on fault */ + if (!vma_is_anonymous(vma)) + return 0; + + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -1095,7 +764,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } @@ -1118,16 +787,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (!vma_is_dax(vma)) { - /* thp accounting separate from pmd_devmap accounting */ - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - get_page(src_page); - page_dup_rmap(src_page, true); - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - } + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&dst_mm->nr_ptes); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); @@ -1141,38 +807,31 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) +void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; pmd_t entry; unsigned long haddr; - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); + haddr = fe->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, + fe->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); } -static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - struct page *page, - unsigned long haddr) +static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, + struct page *page) { + struct vm_area_struct *vma = fe->vma; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; - spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1189,11 +848,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, - vma, address, page_to_nid(page)); + __GFP_OTHER_NODE, vma, + fe->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg, false))) { + mem_cgroup_try_charge(pages[i], vma->vm_mm, + GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { @@ -1219,41 +878,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; + pte_t entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr, false); + page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + fe->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*fe->pte)); + set_pte_at(vma->vm_mm, haddr, fe->pte, entry); + pte_unmap(fe->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); + pmd_populate(vma->vm_mm, fe->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(ptl); + spin_unlock(fe->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1262,8 +921,8 @@ out: return ret; out_free_pages: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + spin_unlock(fe->ptl); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); @@ -1274,25 +933,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) { - spinlock_t *ptl; - int ret = 0; + struct vm_area_struct *vma = fe->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ + int ret = 0; - ptl = pmd_lockptr(mm, pmd); + fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + spin_lock(fe->ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1305,13 +962,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache_pmd(vma, address, pmd); + if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) + update_mmu_cache_pmd(vma, fe->address, fe->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1324,13 +981,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); + ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1339,14 +995,12 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, - true))) { + if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + huge_gfp, &memcg, true))) { put_page(new_page); - if (page) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, fe->pmd, fe->address); + if (page) put_page(page); - } else - split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1362,13 +1016,13 @@ alloc: mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(fe->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(ptl); + if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { + spin_unlock(fe->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1376,14 +1030,14 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache_pmd(vma, address, pmd); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { - add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1392,13 +1046,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(ptl); + spin_unlock(fe->ptl); out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); return ret; } @@ -1432,6 +1086,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * We don't mlock() pte-mapped THPs. This way we can avoid * leaking mlocked pages into non-VM_LOCKED VMAs. * + * For anon THP: + * * In most cases the pmd is the only mapping of the page as we * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for * writable private mappings in populate_vma_page_range(). @@ -1439,15 +1095,26 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * The only scenario when we have the page shared here is if we * mlocking read-only mapping shared over fork(). We skip * mlocking such pages. + * + * For file THP: + * + * We can expect PageDoubleMap() to be stable under page lock: + * for file pages we set it in page_add_file_rmap(), which + * requires page to be locked. */ - if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && - page->mapping && trylock_page(page)) { - lru_add_drain(); - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } + + if (PageAnon(page) && compound_mapcount(page) != 1) + goto skip_mlock; + if (PageDoubleMap(page) || !page->mapping) + goto skip_mlock; + if (!trylock_page(page)) + goto skip_mlock; + lru_add_drain(); + if (page->mapping && !PageDoubleMap(page)) + mlock_vma_page(page); + unlock_page(page); } +skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) @@ -1458,13 +1125,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) { - spinlock_t *ptl; + struct vm_area_struct *vma = fe->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1475,8 +1141,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* A PROT_NONE fault should not end up here */ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_same(pmd, *pmdp))) + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_same(pmd, *fe->pmd))) goto out_unlock; /* @@ -1484,9 +1150,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*pmdp))) { - page = pmd_page(*pmdp); - spin_unlock(ptl); + if (unlikely(pmd_trans_migrating(*fe->pmd))) { + page = pmd_page(*fe->pmd); + spin_unlock(fe->ptl); wait_on_page_locked(page); goto out; } @@ -1519,7 +1185,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(ptl); + spin_unlock(fe->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1530,12 +1196,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * to serialises splits */ get_page(page); - spin_unlock(ptl); + spin_unlock(fe->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(ptl); - if (unlikely(!pmd_same(pmd, *pmdp))) { + spin_lock(fe->ptl); + if (unlikely(!pmd_same(pmd, *fe->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1553,9 +1219,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(ptl); - migrated = migrate_misplaced_transhuge_page(mm, vma, - pmdp, pmd, addr, page, target_nid); + spin_unlock(fe->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, + fe->pmd, pmd, fe->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1570,41 +1236,42 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(mm, haddr, pmdp, pmd); - update_mmu_cache_pmd(vma, addr, pmdp); + set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); + update_mmu_cache_pmd(vma, fe->address, fe->pmd); unlock_page(page); out_unlock: - spin_unlock(ptl); + spin_unlock(fe->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); return 0; } -int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, +/* + * Return true if we do MADV_FREE successfully on entire pmd page. + * Otherwise, return false. + */ +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next) - { spinlock_t *ptl; pmd_t orig_pmd; struct page *page; struct mm_struct *mm = tlb->mm; - int ret = 0; + bool ret = false; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; orig_pmd = *pmd; - if (is_huge_zero_pmd(orig_pmd)) { - ret = 1; + if (is_huge_zero_pmd(orig_pmd)) goto out; - } page = pmd_page(orig_pmd); /* @@ -1624,14 +1291,9 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (next - addr != HPAGE_PMD_SIZE) { get_page(page); spin_unlock(ptl); - if (split_huge_page(page)) { - put_page(page); - unlock_page(page); - goto out_unlocked; - } + split_huge_page(page); put_page(page); unlock_page(page); - ret = 1; goto out_unlocked; } @@ -1651,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - ret = 1; + ret = true; out: spin_unlock(ptl); out_unlocked: @@ -1689,12 +1351,18 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); - pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); - atomic_long_dec(&tlb->mm->nr_ptes); + if (PageAnon(page)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + pte_free(tlb->mm, pgtable); + atomic_long_dec(&tlb->mm->nr_ptes); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); + } spin_unlock(ptl); - tlb_remove_page(tlb, page); + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -1784,7 +1452,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(!preserve_write && pmd_write(entry)); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && + pmd_write(entry)); } spin_unlock(ptl); } @@ -1793,10 +1462,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Returns true if a given pmd maps a thp, false otherwise. + * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. * - * Note that if it returns true, this routine returns without unlocking page - * table lock. So callers must unlock it. + * Note that if it returns page table lock pointer, this routine returns without + * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { @@ -1808,1040 +1477,6 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } -#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) - -int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) -{ - switch (advice) { - case MADV_HUGEPAGE: -#ifdef CONFIG_S390 - /* - * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 - * can't handle this properly after s390_enable_sie, so we simply - * ignore the madvise to prevent qemu from causing a SIGSEGV. - */ - if (mm_has_pgste(vma->vm_mm)) - return 0; -#endif - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & VM_NO_THP) - return -EINVAL; - *vm_flags &= ~VM_NOHUGEPAGE; - *vm_flags |= VM_HUGEPAGE; - /* - * If the vma become good for khugepaged to scan, - * register it here without waiting a page fault that - * may not happen any time soon. - */ - if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) - return -ENOMEM; - break; - case MADV_NOHUGEPAGE: - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & VM_NO_THP) - return -EINVAL; - *vm_flags &= ~VM_HUGEPAGE; - *vm_flags |= VM_NOHUGEPAGE; - /* - * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning - * this vma even if we leave the mm registered in khugepaged if - * it got registered before VM_NOHUGEPAGE was set. - */ - break; - } - - return 0; -} - -static int __init khugepaged_slab_init(void) -{ - mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); - if (!mm_slot_cache) - return -ENOMEM; - - return 0; -} - -static void __init khugepaged_slab_exit(void) -{ - kmem_cache_destroy(mm_slot_cache); -} - -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - -static inline int khugepaged_test_exit(struct mm_struct *mm) -{ - return atomic_read(&mm->mm_users) == 0; -} - -int __khugepaged_enter(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - int wakeup; - - mm_slot = alloc_mm_slot(); - if (!mm_slot) - return -ENOMEM; - - /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); - return 0; - } - - spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); - /* - * Insert just behind the scanning cursor, to let the area settle - * down a little. - */ - wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); - spin_unlock(&khugepaged_mm_lock); - - atomic_inc(&mm->mm_count); - if (wakeup) - wake_up_interruptible(&khugepaged_wait); - - return 0; -} - -int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_THP)) - /* khugepaged not yet working on file or special mappings */ - return 0; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart < hend) - return khugepaged_enter(vma, vm_flags); - return 0; -} - -void __khugepaged_exit(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - int free = 0; - - spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); - free = 1; - } - spin_unlock(&khugepaged_mm_lock); - - if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); - mmdrop(mm); - } else if (mm_slot) { - /* - * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_sem. - */ - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } -} - -static void release_pte_page(struct page *page) -{ - /* 0 stands for page_is_file_cache(page) == false */ - dec_zone_page_state(page, NR_ISOLATED_ANON + 0); - unlock_page(page); - putback_lru_page(page); -} - -static void release_pte_pages(pte_t *pte, pte_t *_pte) -{ - while (--_pte >= pte) { - pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); - } -} - -static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, - pte_t *pte) -{ - struct page *page = NULL; - pte_t *_pte; - int none_or_zero = 0, result = 0; - bool referenced = false, writable = false; - - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { - if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - goto out; - } - } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out; - } - page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - result = SCAN_PAGE_NULL; - goto out; - } - - VM_BUG_ON_PAGE(PageCompound(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - - /* - * We can do it before isolate_lru_page because the - * page can't be freed from under us. NOTE: PG_lock - * is needed to serialize against split_huge_page - * when invoked from the VM. - */ - if (!trylock_page(page)) { - result = SCAN_PAGE_LOCK; - goto out; - } - - /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. - */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { - unlock_page(page); - result = SCAN_PAGE_COUNT; - goto out; - } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } - /* - * Page is not in the swap cache. It can be collapsed - * into a THP. - */ - } - - /* - * Isolate the page to avoid collapsing an hugepage - * currently in use by the VM. - */ - if (isolate_lru_page(page)) { - unlock_page(page); - result = SCAN_DEL_PAGE_LRU; - goto out; - } - /* 0 stands for page_is_file_cache(page) == false */ - inc_zone_page_state(page, NR_ISOLATED_ANON + 0); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - - /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) - referenced = true; - } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { - result = SCAN_PAGE_RO; - } - -out: - release_pte_pages(pte, _pte); - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 0; -} - -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl) -{ - pte_t *_pte; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { - pte_t pteval = *_pte; - struct page *src_page; - - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - } - } else { - src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); - /* - * ptl mostly unnecessary, but preempt has to - * be disabled to update the per-cpu stats - * inside page_remove_rmap(). - */ - spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); - spin_unlock(ptl); - free_page_and_swap_cache(src_page); - } - - address += PAGE_SIZE; - page++; - } -} - -static void khugepaged_alloc_sleep(void) -{ - DEFINE_WAIT(wait); - - add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); -} - -static int khugepaged_node_load[MAX_NUMNODES]; - -static bool khugepaged_scan_abort(int nid) -{ - int i; - - /* - * If zone_reclaim_mode is disabled, then no extra effort is made to - * allocate memory locally. - */ - if (!zone_reclaim_mode) - return false; - - /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) - return false; - - for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) - continue; - if (node_distance(nid, i) > RECLAIM_DISTANCE) - return true; - } - return false; -} - -#ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) -{ - static int last_khugepaged_target_node = NUMA_NO_NODE; - int nid, target_node = 0, max_value = 0; - - /* find first node with max normal pages hit */ - for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; - target_node = nid; - } - - /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { - target_node = nid; - break; - } - - last_khugepaged_target_node = target_node; - return target_node; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - unsigned long address, int node) -{ - VM_BUG_ON_PAGE(*hpage, *hpage); - - /* - * Before allocating the hugepage, release the mmap_sem read lock. - * The allocation can take potentially a long time if it involves - * sync compaction, and we do not need to hold the mmap_sem during - * that. We will recheck the vma after taking it again in write mode. - */ - up_read(&mm->mmap_sem); - - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; - } - - prep_transhuge_page(*hpage); - count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - unsigned long address, int node) -{ - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - return false; - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_THP); -} - -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) -{ - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; - unsigned long hstart, hend; - struct mem_cgroup *memcg; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - gfp_t gfp; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; - - /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out_nolock; - } - - /* - * Prevent all access to pagetables with the exception of - * gup_fast later hanlded by the ptep_clear_flush and the VM - * handled by the anon_vma lock + PG_lock. - */ - down_write(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) { - result = SCAN_ANY_PROCESS; - goto out; - } - - vma = find_vma(mm, address); - if (!vma) { - result = SCAN_VMA_NULL; - goto out; - } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) { - result = SCAN_ADDRESS_RANGE; - goto out; - } - if (!hugepage_vma_check(vma)) { - result = SCAN_VMA_CHECK; - goto out; - } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; - goto out; - } - - anon_vma_lock_write(vma->anon_vma); - - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); - - mmun_start = address; - mmun_end = address + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ - /* - * After this gup_fast can't run anymore. This also removes - * any huge TLB entry from the CPU so we won't allow - * huge and small TLB entries for the same virtual address - * to avoid the risk of CPU bugs in that area. - */ - _pmd = pmdp_collapse_flush(vma, address, pmd); - spin_unlock(pmd_ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(pte_ptl); - - if (unlikely(!isolated)) { - pte_unmap(pte); - spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); - /* - * We can only use set_pmd_at when establishing - * hugepmds and never for establishing regular pmds that - * points to regular pagetables. Use pmd_populate for that - */ - pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(pmd_ptl); - anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; - goto out; - } - - /* - * All pages are isolated and locked so anon_vma rmap - * can't run anymore. - */ - anon_vma_unlock_write(vma->anon_vma); - - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); - pte_unmap(pte); - __SetPageUptodate(new_page); - pgtable = pmd_pgtable(_pmd); - - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. - */ - smp_wmb(); - - spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(pmd_ptl); - - *hpage = NULL; - - khugepaged_pages_collapsed++; - result = SCAN_SUCCEED; -out_up_write: - up_write(&mm->mmap_sem); - trace_mm_collapse_huge_page(mm, isolated, result); - return; - -out_nolock: - trace_mm_collapse_huge_page(mm, isolated, result); - return; -out: - mem_cgroup_cancel_charge(new_page, memcg, true); - goto out_up_write; -} - -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) -{ - pmd_t *pmd; - pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0; - struct page *page = NULL; - unsigned long _address; - spinlock_t *ptl; - int node = NUMA_NO_NODE; - bool writable = false, referenced = false; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; - goto out; - } - - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - goto out_unmap; - } - } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } - if (pte_write(pteval)) - writable = true; - - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { - result = SCAN_PAGE_NULL; - goto out_unmap; - } - - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; - goto out_unmap; - } - - /* - * Record which node the original page is from and save this - * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max - * hit record. - */ - node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { - result = SCAN_SCAN_ABORT; - goto out_unmap; - } - khugepaged_node_load[node]++; - if (!PageLRU(page)) { - result = SCAN_PAGE_LRU; - goto out_unmap; - } - if (PageLocked(page)) { - result = SCAN_PAGE_LOCK; - goto out_unmap; - } - if (!PageAnon(page)) { - result = SCAN_PAGE_ANON; - goto out_unmap; - } - - /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. - */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { - result = SCAN_PAGE_COUNT; - goto out_unmap; - } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) - referenced = true; - } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_NO_REFERENCED_PAGE; - } - } else { - result = SCAN_PAGE_RO; - } -out_unmap: - pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); - /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node); - } -out: - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, - none_or_zero, result); - return ret; -} - -static void collect_mm_slot(struct mm_slot *mm_slot) -{ - struct mm_struct *mm = mm_slot->mm; - - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); - - if (khugepaged_test_exit(mm)) { - /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); - - /* - * Not strictly needed because the mm exited already. - * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - */ - - /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); - mmdrop(mm); - } -} - -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) - __releases(&khugepaged_mm_lock) - __acquires(&khugepaged_mm_lock) -{ - struct mm_slot *mm_slot; - struct mm_struct *mm; - struct vm_area_struct *vma; - int progress = 0; - - VM_BUG_ON(!pages); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); - - if (khugepaged_scan.mm_slot) - mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, - struct mm_slot, mm_node); - khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; - } - spin_unlock(&khugepaged_mm_lock); - - mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else - vma = find_vma(mm, khugepaged_scan.address); - - progress++; - for (; vma; vma = vma->vm_next) { - unsigned long hstart, hend; - - cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { - progress++; - break; - } - if (!hugepage_vma_check(vma)) { -skip: - progress++; - continue; - } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; - if (khugepaged_scan.address > hend) - goto skip; - if (khugepaged_scan.address < hstart) - khugepaged_scan.address = hstart; - VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); - - while (khugepaged_scan.address < hend) { - int ret; - cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) - goto breakouterloop; - - VM_BUG_ON(khugepaged_scan.address < hstart || - khugepaged_scan.address + HPAGE_PMD_SIZE > - hend); - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); - /* move to next address */ - khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_sem so break loop */ - goto breakouterloop_mmap_sem; - if (progress >= pages) - goto breakouterloop; - } - } -breakouterloop: - up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ -breakouterloop_mmap_sem: - - spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); - /* - * Release the current mm_slot if this mm is about to die, or - * if we scanned all vmas of this mm. - */ - if (khugepaged_test_exit(mm) || !vma) { - /* - * Make sure that if mm_users is reaching zero while - * khugepaged runs here, khugepaged_exit will find - * mm_slot not pointing to the exiting mm. - */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); - khugepaged_scan.address = 0; - } else { - khugepaged_scan.mm_slot = NULL; - khugepaged_full_scans++; - } - - collect_mm_slot(mm_slot); - } - - return progress; -} - -static int khugepaged_has_work(void) -{ - return !list_empty(&khugepaged_scan.mm_head) && - khugepaged_enabled(); -} - -static int khugepaged_wait_event(void) -{ - return !list_empty(&khugepaged_scan.mm_head) || - kthread_should_stop(); -} - -static void khugepaged_do_scan(void) -{ - struct page *hpage = NULL; - unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; - bool wait = true; - - barrier(); /* write khugepaged_pages_to_scan to local stack */ - - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - - cond_resched(); - - if (unlikely(kthread_should_stop() || try_to_freeze())) - break; - - spin_lock(&khugepaged_mm_lock); - if (!khugepaged_scan.mm_slot) - pass_through_head++; - if (khugepaged_has_work() && - pass_through_head < 2) - progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); - else - progress = pages; - spin_unlock(&khugepaged_mm_lock); - } - - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); -} - -static bool khugepaged_should_wakeup(void) -{ - return kthread_should_stop() || - time_after_eq(jiffies, khugepaged_sleep_expire); -} - -static void khugepaged_wait_work(void) -{ - if (khugepaged_has_work()) { - const unsigned long scan_sleep_jiffies = - msecs_to_jiffies(khugepaged_scan_sleep_millisecs); - - if (!scan_sleep_jiffies) - return; - - khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; - wait_event_freezable_timeout(khugepaged_wait, - khugepaged_should_wakeup(), - scan_sleep_jiffies); - return; - } - - if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); -} - -static int khugepaged(void *none) -{ - struct mm_slot *mm_slot; - - set_freezable(); - set_user_nice(current, MAX_NICE); - - while (!kthread_should_stop()) { - khugepaged_do_scan(); - khugepaged_wait_work(); - } - - spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; - khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); - spin_unlock(&khugepaged_mm_lock); - return 0; -} - static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { @@ -2888,10 +1523,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); - if (vma_is_dax(vma)) { - pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + if (!vma_is_anonymous(vma)) { + _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); if (is_huge_zero_pmd(_pmd)) put_huge_zero_page(); + if (vma_is_dax(vma)) + return; + page = pmd_page(_pmd); + if (!PageReferenced(page) && pmd_young(_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { return __split_huge_zero_page_pmd(vma, haddr, pmd); @@ -2947,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __dec_node_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2989,7 +1632,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze) + unsigned long address, bool freeze, struct page *page) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; @@ -2997,8 +1640,17 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { - struct page *page = pmd_page(*pmd); + page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) @@ -3025,24 +1677,8 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) - return; - /* - * If caller asks to setup a migration entries, we need a page to check - * pmd against. Otherwise we can end up replacing wrong page. - */ - VM_BUG_ON(freeze && !page); - if (page && page != pmd_page(*pmd)) - return; - - /* - * Caller holds the mmap_sem write mode or the anon_vma lock, - * so a huge pmd cannot materialize from under us (khugepaged - * holds both the mmap_sem write mode and the anon_vma lock - * write mode). - */ - __split_huge_pmd(vma, pmd, address, freeze); + __split_huge_pmd(vma, pmd, address, freeze, page); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3088,12 +1724,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void freeze_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | - TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + TTU_RMAP_LOCKED; int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); + if (PageAnon(page)) + ttu_flags |= TTU_MIGRATION; + /* We only need TTU_SPLIT_HUGE_PMD once */ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { @@ -3103,7 +1742,7 @@ static void freeze_page(struct page *page) ret = try_to_unmap(page + i, ttu_flags); } - VM_BUG_ON(ret); + VM_BUG_ON_PAGE(ret, page + i - 1); } static void unfreeze_page(struct page *page) @@ -3125,15 +1764,20 @@ static void __split_huge_page_tail(struct page *head, int tail, /* * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_inc(), we - * would then run atomic_set() concurrently with + * tail_page. If we used atomic_set() below instead of atomic_inc() or + * atomic_add(), we would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_inc(). + * it's safer to use atomic_inc()/atomic_add(). */ - page_ref_inc(page_tail); + if (PageAnon(head)) { + page_ref_inc(page_tail); + } else { + /* Additional pin to radix tree */ + page_ref_add(page_tail, 2); + } page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3169,25 +1813,46 @@ static void __split_huge_page_tail(struct page *head, int tail, lru_add_page_tail(head, page_tail, lruvec, list); } -static void __split_huge_page(struct page *page, struct list_head *list) +static void __split_huge_page(struct page *page, struct list_head *list, + unsigned long flags) { struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; + pgoff_t end = -1; int i; - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(head, zone); + lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) + if (!PageAnon(page)) + end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); + + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); + /* Some pages can be beyond i_size: drop them from page cache */ + if (head[i].index >= end) { + __ClearPageDirty(head + i); + __delete_from_page_cache(head + i, NULL); + if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + shmem_uncharge(head->mapping->host, 1); + put_page(head + i); + } + } ClearPageCompound(head); - spin_unlock_irq(&zone->lru_lock); + /* See comment in __split_huge_page_tail() */ + if (PageAnon(head)) { + page_ref_inc(head); + } else { + /* Additional pin to radix tree */ + page_ref_add(head, 2); + spin_unlock(&head->mapping->tree_lock); + } + + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); @@ -3210,18 +1875,22 @@ static void __split_huge_page(struct page *page, struct list_head *list) int total_mapcount(struct page *page) { - int i, ret; + int i, compound, ret; VM_BUG_ON_PAGE(PageTail(page), page); if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - ret = compound_mapcount(page); + compound = compound_mapcount(page); if (PageHuge(page)) - return ret; + return compound; + ret = compound; for (i = 0; i < HPAGE_PMD_NR; i++) ret += atomic_read(&page[i]._mapcount) + 1; + /* File pages has compound_mapcount included in _mapcount */ + if (!PageAnon(page)) + return ret - compound * HPAGE_PMD_NR; if (PageDoubleMap(page)) ret -= HPAGE_PMD_NR; return ret; @@ -3308,36 +1977,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); - struct anon_vma *anon_vma; - int count, mapcount, ret; + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; + int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); - /* - * The caller does not necessarily hold an mmap_sem that would prevent - * the anon_vma disappearing so we first we take a reference to it - * and then lock the anon_vma for write. This is similar to - * page_lock_anon_vma_read except the write lock is taken to serialise - * against parallel split or collapse operations. - */ - anon_vma = page_get_anon_vma(head); - if (!anon_vma) { - ret = -EBUSY; - goto out; + if (PageAnon(head)) { + /* + * The caller does not necessarily hold an mmap_sem that would + * prevent the anon_vma disappearing so we first we take a + * reference to it and then lock the anon_vma for write. This + * is similar to page_lock_anon_vma_read except the write lock + * is taken to serialise against parallel split or collapse + * operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + extra_pins = 0; + mapping = NULL; + anon_vma_lock_write(anon_vma); + } else { + mapping = head->mapping; + + /* Truncated ? */ + if (!mapping) { + ret = -EBUSY; + goto out; + } + + /* Addidional pins from radix tree */ + extra_pins = HPAGE_PMD_NR; + anon_vma = NULL; + i_mmap_lock_read(mapping); } - anon_vma_lock_write(anon_vma); /* * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - 1) { + if (total_mapcount(head) != page_count(head) - extra_pins - 1) { ret = -EBUSY; goto out_unlock; } @@ -3350,35 +2037,62 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); + + if (mapping) { + void **pslot; + + spin_lock(&mapping->tree_lock); + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(head)); + /* + * Check if the head page is present in radix tree. + * We assume all tail are present too, if head is there. + */ + if (radix_tree_deref_slot_protected(pslot, + &mapping->tree_lock) != head) + goto fail; + } + /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock(&pgdata->split_queue_lock); count = page_count(head); mapcount = total_mapcount(head); - if (!mapcount && count == 1) { + if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - __split_huge_page(page, list); + if (mapping) + __dec_node_page_state(page, NR_SHMEM_THPS); + spin_unlock(&pgdata->split_queue_lock); + __split_huge_page(page, list, flags); ret = 0; - } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); } else { - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + BUG(); + } + spin_unlock(&pgdata->split_queue_lock); +fail: if (mapping) + spin_unlock(&mapping->tree_lock); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; } out_unlock: - anon_vma_unlock_write(anon_vma); - put_anon_vma(anon_vma); + if (anon_vma) { + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); + } + if (mapping) + i_mmap_unlock_read(mapping); out: count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; @@ -3501,8 +2215,7 @@ static int split_huge_pages_set(void *data, u64 val) if (zone != page_zone(page)) goto next; - if (!PageHead(page) || !PageAnon(page) || - PageHuge(page)) + if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) goto next; total++; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c1f3c0be150a..f904246a8fd5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1022,7 +1022,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) +#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \ + ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ + defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -3177,7 +3179,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - int force_flush = 0; struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; @@ -3196,19 +3197,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; -again: for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, &address, ptep)) - goto unlock; + if (huge_pmd_unshare(mm, &address, ptep)) { + spin_unlock(ptl); + continue; + } pte = huge_ptep_get(ptep); - if (huge_pte_none(pte)) - goto unlock; + if (huge_pte_none(pte)) { + spin_unlock(ptl); + continue; + } /* * Migrating hugepage or HWPoisoned hugepage is already @@ -3216,7 +3220,8 @@ again: */ if (unlikely(!pte_present(pte))) { huge_pte_clear(mm, address, ptep); - goto unlock; + spin_unlock(ptl); + continue; } page = pte_page(pte); @@ -3226,9 +3231,10 @@ again: * are about to unmap is the actual page of interest. */ if (ref_page) { - if (page != ref_page) - goto unlock; - + if (page != ref_page) { + spin_unlock(ptl); + continue; + } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than @@ -3244,30 +3250,14 @@ again: hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, true); - force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) { - address += sz; - spin_unlock(ptl); - break; - } - /* Bail out after unmapping reference page if supplied */ - if (ref_page) { - spin_unlock(ptl); - break; - } -unlock: + spin_unlock(ptl); - } - /* - * mmu_gather ran out of room to batch pages, we break out of - * the PTE lock to avoid doing the potential expensive TLB invalidate - * and page-free while holding it. - */ - if (force_flush) { - force_flush = 0; - tlb_flush_mmu(tlb); - if (address < end && !ref_page) - goto again; + tlb_remove_page_size(tlb, page, huge_page_size(h)); + /* + * Bail out after unmapping reference page if supplied + */ + if (ref_page) + break; } mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); @@ -3326,7 +3316,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - mapping = file_inode(vma->vm_file)->i_mapping; + mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -3383,7 +3373,7 @@ retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { - page_move_anon_rmap(old_page, vma, address); + page_move_anon_rmap(old_page, vma); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -4401,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, /* * This function is called from memory failure code. - * Assume the caller holds page lock of the head page. */ int dequeue_hwpoisoned_huge_page(struct page *hpage) { diff --git a/mm/internal.h b/mm/internal.h index 2524ec880e24..1501304f87a4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,6 +36,8 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +int do_swap_page(struct fault_env *fe, pte_t orig_pte); + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -76,7 +78,7 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool zone_reclaimable(struct zone *zone); +extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -150,6 +152,8 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); +extern void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags); extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -181,10 +185,7 @@ struct compact_control { const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; - int contended; /* Signal need_sched() or lock - * contention detected during - * compaction - */ + bool contended; /* Signal lock or sched contention */ }; unsigned long @@ -429,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define ZONE_RECLAIM_NOSCAN -2 -#define ZONE_RECLAIM_FULL -1 -#define ZONE_RECLAIM_SOME 0 -#define ZONE_RECLAIM_SUCCESS 1 +#define NODE_RECLAIM_NOSCAN -2 +#define NODE_RECLAIM_FULL -1 +#define NODE_RECLAIM_SOME 0 +#define NODE_RECLAIM_SUCCESS 1 extern int hwpoison_filter(struct page *p); @@ -463,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ -#define ALLOC_FAIR 0x100 /* fair zone allocation */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1548749a3d45..2976a9ee104f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o -obj-$(CONFIG_SLAB) += quarantine.o +obj-y := kasan.o report.o kasan_init.o quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6845f9294696..b6f99e81bfeb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } -#ifdef CONFIG_SLAB /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. @@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) { int redzone_adjust; - /* Make sure the adjusted size is still less than - * KMALLOC_MAX_CACHE_SIZE. - * TODO: this check is only useful for SLAB, but not SLUB. We'll need - * to skip it for SLUB when it starts using kasan_cache_create(). - */ - if (*size > KMALLOC_MAX_CACHE_SIZE - - sizeof(struct kasan_alloc_meta) - - sizeof(struct kasan_free_meta)) - return; - *flags |= SLAB_KASAN; + int orig_size = *size; + /* Add alloc meta. */ cache->kasan_info.alloc_meta_offset = *size; *size += sizeof(struct kasan_alloc_meta); @@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } redzone_adjust = optimal_redzone(cache->object_size) - (*size - cache->object_size); + if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_CACHE_SIZE, - max(*size, - cache->object_size + - optimal_redzone(cache->object_size))); + + *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + optimal_redzone(cache->object_size))); + + /* + * If the metadata doesn't fit, don't enable KASAN at all. + */ + if (*size <= cache->kasan_info.alloc_meta_offset || + *size <= cache->kasan_info.free_meta_offset) { + cache->kasan_info.alloc_meta_offset = 0; + cache->kasan_info.free_meta_offset = 0; + *size = orig_size; + return; + } + + *flags |= SLAB_KASAN; } -#endif void kasan_cache_shrink(struct kmem_cache *cache) { @@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache) quarantine_remove_cache(cache); } +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + (cache->kasan_info.free_meta_offset ? + sizeof(struct kasan_free_meta) : 0); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); alloc_info->state = KASAN_STATE_INIT; } -#endif } -#ifdef CONFIG_SLAB static inline int in_irqentry_text(unsigned long ptr) { return (ptr >= (unsigned long)&__irqentry_text_start && @@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); return (void *)object + cache->kasan_info.free_meta_offset; } -#endif void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) { @@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) bool kasan_slab_free(struct kmem_cache *cache, void *object) { -#ifdef CONFIG_SLAB /* RCU slabs could be legally used after free within the RCU period */ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return false; if (likely(cache->flags & SLAB_KASAN)) { - struct kasan_alloc_meta *alloc_info = - get_alloc_info(cache, object); - struct kasan_free_meta *free_info = - get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info; + struct kasan_free_meta *free_info; + + alloc_info = get_alloc_info(cache, object); + free_info = get_free_info(cache, object); switch (alloc_info->state) { case KASAN_STATE_ALLOC: @@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) } } return false; -#else - kasan_poison_slab_free(cache, object); - return false; -#endif } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); @@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, alloc_info->alloc_size = size; set_track(&alloc_info->track, flags); } -#endif } EXPORT_SYMBOL(kasan_kmalloc); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb87923552ef..31972cdba433 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) @@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 4973505a9bdd..65793f150d1f 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -238,30 +238,23 @@ static void qlist_move_cache(struct qlist_head *from, struct qlist_head *to, struct kmem_cache *cache) { - struct qlist_node *prev = NULL, *curr; + struct qlist_node *curr; if (unlikely(qlist_empty(from))) return; curr = from->head; + qlist_init(from); while (curr) { - struct qlist_node *qlink = curr; - struct kmem_cache *obj_cache = qlink_to_cache(qlink); - - if (obj_cache == cache) { - if (unlikely(from->head == qlink)) { - from->head = curr->next; - prev = curr; - } else - prev->next = curr->next; - if (unlikely(from->tail == qlink)) - from->tail = curr->next; - from->bytes -= cache->size; - qlist_put(to, qlink, cache->size); - } else { - prev = curr; - } - curr = curr->next; + struct qlist_node *next = curr->next; + struct kmem_cache *obj_cache = qlink_to_cache(curr); + + if (obj_cache == cache) + qlist_put(to, curr, obj_cache->size); + else + qlist_put(from, curr, obj_cache->size); + + curr = next; } } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b3c122ddd454..861b9776841a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -#ifdef CONFIG_SLAB static void print_track(struct kasan_track *track) { pr_err("PID = %u\n", track->pid); @@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track) } } -static void object_err(struct kmem_cache *cache, struct page *page, - void *object, char *unused_reason) +static void kasan_object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); struct kasan_free_meta *free_info; @@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page, break; } } -#endif static void print_address_description(struct kasan_access_info *info) { @@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info) struct kmem_cache *cache = page->slab_cache; object = nearest_obj(cache, page, (void *)info->access_addr); - object_err(cache, page, object, + kasan_object_err(cache, page, object, "kasan: bad access detected"); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c new file mode 100644 index 000000000000..79c52d0061af --- /dev/null +++ b/mm/khugepaged.c @@ -0,0 +1,1922 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/mmu_notifier.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/mm_inline.h> +#include <linux/kthread.h> +#include <linux/khugepaged.h> +#include <linux/freezer.h> +#include <linux/mman.h> +#include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> +#include <linux/page_idle.h> +#include <linux/swapops.h> +#include <linux/shmem_fs.h> + +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include "internal.h" + +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_EXCEED_NONE_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PAGE_RO, + SCAN_LACK_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_SWAP_CACHE_PAGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL, + SCAN_EXCEED_SWAP_PTE, + SCAN_TRUNCATED, +}; + +#define CREATE_TRACE_POINTS +#include <trace/events/huge_memory.h> + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static unsigned long khugepaged_sleep_expire; +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. + */ +static unsigned int khugepaged_max_ptes_none __read_mostly; +static unsigned int khugepaged_max_ptes_swap __read_mostly; + +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + +static struct kmem_cache *mm_slot_cache __read_mostly; + +/** + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: hash collision list + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. + */ +struct khugepaged_scan { + struct list_head mm_head; + struct mm_slot *mm_slot; + unsigned long address; +}; + +static struct khugepaged_scan khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = kstrtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_hugepage_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_hugepage_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = kstrtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); +} + +static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_swap; + + err = kstrtoul(buf, 10, &max_ptes_swap); + if (err || max_ptes_swap > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_swap = max_ptes_swap; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_swap_attr = + __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, + khugepaged_max_ptes_swap_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + NULL, +}; + +struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; + +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (!(*vm_flags & VM_NO_KHUGEPAGED) && + khugepaged_enter_vma_merge(vma, *vm_flags)) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ + break; + } + + return 0; +} + +int __init khugepaged_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; + khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + + return 0; +} + +void __init khugepaged_destroy(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) + if (mm == mm_slot->mm) + return mm_slot; + + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + mm_slot->mm = mm; + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) + /* khugepaged not yet working on file or special mappings */ + return 0; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma, vm_flags); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + spin_unlock(&khugepaged_mm_lock); + + if (free) { + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_node_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) + release_pte_page(pte_page(pteval)); + } +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page = NULL; + pte_t *_pte; + int none_or_zero = 0, result = 0, referenced = 0; + bool writable = false; + + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval) || (pte_present(pteval) && + is_zero_pfn(pte_pfn(pteval)))) { + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + goto out; + } + } + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out; + } + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto out; + } + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + unlock_page(page); + result = SCAN_PAGE_COUNT; + goto out; + } + if (pte_write(pteval)) { + writable = true; + } else { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; + } + /* + * Page is not in the swap cache. It can be collapsed + * into a THP. + */ + } + + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + result = SCAN_DEL_PAGE_LRU; + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_node_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + + /* There should be enough young pte to collapse the page */ + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced++; + } + if (likely(writable)) { + if (likely(referenced)) { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; + } + } else { + result = SCAN_PAGE_RO; + } + +out: + release_pte_pages(pte, _pte); + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 0; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page, false); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + freezable_schedule_timeout_interruptible( + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +static int khugepaged_node_load[MAX_NUMNODES]; + +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If node_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!node_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) +{ + return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; +} + +#ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +{ + VM_BUG_ON_PAGE(*hpage, *hpage); + + *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + *hpage = ERR_PTR(-ENOMEM); + return NULL; + } + + prep_transhuge_page(*hpage); + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_khugepaged_hugepage(void) +{ + struct page *page; + + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); + if (page) + prep_transhuge_page(page); + return page; +} + +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_khugepaged_hugepage(); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +{ + VM_BUG_ON(!*hpage); + + return *hpage; +} +#endif + +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + if (shmem_file(vma->vm_file)) { + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return false; + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + return !(vma->vm_flags & VM_NO_KHUGEPAGED); +} + +/* + * If mmap_sem temporarily dropped, revalidate vma + * before taking mmap_sem. + * Return 0 if succeeds, otherwise return none-zero + * value (scan code). + */ + +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + unsigned long hstart, hend; + + if (unlikely(khugepaged_test_exit(mm))) + return SCAN_ANY_PROCESS; + + vma = find_vma(mm, address); + if (!vma) + return SCAN_VMA_NULL; + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + return SCAN_ADDRESS_RANGE; + if (!hugepage_vma_check(vma)) + return SCAN_VMA_CHECK; + return 0; +} + +/* + * Bring missing pages in from swap, to complete THP collapse. + * Only done if khugepaged_scan_pmd believes it is worthwhile. + * + * Called and returns without pte mapped or spinlocks held, + * but with mmap_sem held to protect against vma changes. + */ + +static bool __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + int referenced) +{ + pte_t pteval; + int swapped_in = 0, ret = 0; + struct fault_env fe = { + .vma = vma, + .address = address, + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + fe.pte = pte_offset_map(pmd, address); + for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; + fe.pte++, fe.address += PAGE_SIZE) { + pteval = *fe.pte; + if (!is_swap_pte(pteval)) + continue; + swapped_in++; + /* we only decide to swapin, if there is enough young ptes */ + if (referenced < HPAGE_PMD_NR/2) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } + ret = do_swap_page(&fe, pteval); + + /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (hugepage_vma_revalidate(mm, address)) { + /* vma is no longer available, don't continue to swapin */ + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } + /* check if the pmd is still valid */ + if (mm_find_pmd(mm, address) != pmd) + return false; + } + if (ret & VM_FAULT_ERROR) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } + /* pte is unmapped now, we need to map it */ + fe.pte = pte_offset_map(pmd, fe.address); + } + fe.pte--; + pte_unmap(fe.pte); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); + return true; +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node, int referenced) +{ + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *pmd_ptl, *pte_ptl; + int isolated = 0, result = 0; + struct mem_cgroup *memcg; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + + /* + * Before allocating the hugepage, release the mmap_sem read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_sem during + * that. We will recheck the vma after taking it again in write mode. + */ + up_read(&mm->mmap_sem); + new_page = khugepaged_alloc_page(hpage, gfp, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } + + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out_nolock; + } + + down_read(&mm->mmap_sem); + result = hugepage_vma_revalidate(mm, address); + if (result) { + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + pmd = mm_find_pmd(mm, address); + if (!pmd) { + result = SCAN_PMD_NULL; + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + /* + * __collapse_huge_page_swapin always returns with mmap_sem locked. + * If it fails, we release mmap_sem and jump out_nolock. + * Continuing to collapse causes inconsistency. + */ + if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { + mem_cgroup_cancel_charge(new_page, memcg, true); + up_read(&mm->mmap_sem); + goto out_nolock; + } + + up_read(&mm->mmap_sem); + /* + * Prevent all access to pagetables with the exception of + * gup_fast later handled by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + result = hugepage_vma_revalidate(mm, address); + if (result) + goto out; + /* check if the pmd is still valid */ + if (mm_find_pmd(mm, address) != pmd) + goto out; + + anon_vma_lock_write(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + pte_ptl = pte_lockptr(mm, pmd); + + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_collapse_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + spin_lock(pte_ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(pte_ptl); + + if (unlikely(!isolated)) { + pte_unmap(pte); + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock_write(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + pte_unmap(pte); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_active_or_unevictable(new_page, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); + + *hpage = NULL; + + khugepaged_pages_collapsed++; + result = SCAN_SUCCEED; +out_up_write: + up_write(&mm->mmap_sem); +out_nolock: + trace_mm_collapse_huge_page(mm, isolated, result); + return; +out: + mem_cgroup_cancel_charge(new_page, memcg, true); + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + struct page *page = NULL; + unsigned long _address; + spinlock_t *ptl; + int node = NUMA_NO_NODE, unmapped = 0; + bool writable = false; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pmd = mm_find_pmd(mm, address); + if (!pmd) { + result = SCAN_PMD_NULL; + goto out; + } + + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (is_swap_pte(pteval)) { + if (++unmapped <= khugepaged_max_ptes_swap) { + continue; + } else { + result = SCAN_EXCEED_SWAP_PTE; + goto out_unmap; + } + } + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + goto out_unmap; + } + } + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; + goto out_unmap; + } + if (pte_write(pteval)) + writable = true; + + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out_unmap; + } + + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unmap; + } + + /* + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. + */ + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; + goto out_unmap; + } + khugepaged_node_load[node]++; + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + result = SCAN_PAGE_COUNT; + goto out_unmap; + } + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced++; + } + if (writable) { + if (referenced) { + result = SCAN_SUCCEED; + ret = 1; + } else { + result = SCAN_LACK_REFERENCED_PAGE; + } + } else { + result = SCAN_PAGE_RO; + } +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + node = khugepaged_find_target_node(); + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma, node, referenced); + } +out: + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + none_or_zero, result, unmapped); + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +{ + struct vm_area_struct *vma; + unsigned long addr; + pmd_t *pmd, _pmd; + + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + /* probably overkill */ + if (vma->anon_vma) + continue; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr & ~HPAGE_PMD_MASK) + continue; + if (vma->vm_end < addr + HPAGE_PMD_SIZE) + continue; + pmd = mm_find_pmd(vma->vm_mm, addr); + if (!pmd) + continue; + /* + * We need exclusive mmap_sem to retract page table. + * If trylock fails we would end up with pte-mapped THP after + * re-fault. Not ideal, but it's more important to not disturb + * the system too much. + */ + if (down_write_trylock(&vma->vm_mm->mmap_sem)) { + spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); + /* assume page table is clear */ + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + up_write(&vma->vm_mm->mmap_sem); + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } + } + i_mmap_unlock_write(mapping); +} + +/** + * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * + * Basic scheme is simple, details are more complex: + * - allocate and freeze a new huge page; + * - scan over radix tree replacing old pages the new one + * + swap in pages if necessary; + * + fill in gaps; + * + keep old pages around in case if rollback is required; + * - if replacing succeed: + * + copy data over; + * + free old pages; + * + unfreeze huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the radix-tree; + * + free huge page; + */ +static void collapse_shmem(struct mm_struct *mm, + struct address_space *mapping, pgoff_t start, + struct page **hpage, int node) +{ + gfp_t gfp; + struct page *page, *new_page, *tmp; + struct mem_cgroup *memcg; + pgoff_t index, end = start + HPAGE_PMD_NR; + LIST_HEAD(pagelist); + struct radix_tree_iter iter; + void **slot; + int nr_none = 0, result = SCAN_SUCCEED; + + VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_khugepaged_gfpmask() | + __GFP_OTHER_NODE | __GFP_THISNODE; + + new_page = khugepaged_alloc_page(hpage, gfp, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out; + } + + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out; + } + + new_page->index = start; + new_page->mapping = mapping; + __SetPageSwapBacked(new_page); + __SetPageLocked(new_page); + BUG_ON(!page_ref_freeze(new_page, 1)); + + + /* + * At this point the new_page is 'frozen' (page_count() is zero), locked + * and not up-to-date. It's safe to insert it into radix tree, because + * nobody would be able to map it or use it in other way until we + * unfreeze it. + */ + + index = start; + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + int n = min(iter.index, end) - index; + + /* + * Handle holes in the radix tree: charge it from shmem and + * insert relevant subpage of new_page into the radix-tree. + */ + if (n && !shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + break; + } + nr_none += n; + for (; index < min(iter.index, end); index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + + /* We are done. */ + if (index >= end) + break; + + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { + spin_unlock_irq(&mapping->tree_lock); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto tree_unlocked; + } + spin_lock_irq(&mapping->tree_lock); + } else if (trylock_page(page)) { + get_page(page); + } else { + result = SCAN_PAGE_LOCK; + break; + } + + /* + * The page must be locked, so we can drop the tree_lock + * without racing with truncate. + */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (page_mapping(page) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } + spin_unlock_irq(&mapping->tree_lock); + + if (isolate_lru_page(page)) { + result = SCAN_DEL_PAGE_LRU; + goto out_isolate_failed; + } + + if (page_mapped(page)) + unmap_mapping_range(mapping, index << PAGE_SHIFT, + PAGE_SIZE, 0); + + spin_lock_irq(&mapping->tree_lock); + + VM_BUG_ON_PAGE(page_mapped(page), page); + + /* + * The page is expected to have page_count() == 3: + * - we hold a pin on it; + * - one reference from radix tree; + * - one from isolate_lru_page; + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; + goto out_lru; + } + + /* + * Add the page to the list to be able to undo the collapse if + * something go wrong. + */ + list_add_tail(&page->lru, &pagelist); + + /* Finally, replace with the new page. */ + radix_tree_replace_slot(slot, + new_page + (index % HPAGE_PMD_NR)); + + index++; + continue; +out_lru: + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); +out_isolate_failed: + unlock_page(page); + put_page(page); + goto tree_unlocked; +out_unlock: + unlock_page(page); + put_page(page); + break; + } + + /* + * Handle hole in radix tree at the end of the range. + * This code only triggers if there's nothing in radix tree + * beyond 'end'. + */ + if (result == SCAN_SUCCEED && index < end) { + int n = end - index; + + if (!shmem_charge(mapping->host, n)) { + result = SCAN_FAIL; + goto tree_locked; + } + + for (; index < end; index++) { + radix_tree_insert(&mapping->page_tree, index, + new_page + (index % HPAGE_PMD_NR)); + } + nr_none += n; + } + +tree_locked: + spin_unlock_irq(&mapping->tree_lock); +tree_unlocked: + + if (result == SCAN_SUCCEED) { + unsigned long flags; + struct zone *zone = page_zone(new_page); + + /* + * Replacing old pages with new one has succeed, now we need to + * copy the content and free old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + copy_highpage(new_page + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); + unlock_page(page); + page_ref_unfreeze(page, 1); + page->mapping = NULL; + ClearPageActive(page); + ClearPageUnevictable(page); + put_page(page); + } + + local_irq_save(flags); + __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + } + local_irq_restore(flags); + + /* + * Remove pte page tables, so we can re-faulti + * the page as huge. + */ + retract_page_tables(mapping, start); + + /* Everything is ready, let's unfreeze the new_page */ + set_page_dirty(new_page); + SetPageUptodate(new_page); + page_ref_unfreeze(new_page, HPAGE_PMD_NR); + mem_cgroup_commit_charge(new_page, memcg, false, true); + lru_cache_add_anon(new_page); + unlock_page(new_page); + + *hpage = NULL; + } else { + /* Something went wrong: rollback changes to the radix-tree */ + shmem_uncharge(mapping->host, nr_none); + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, + start) { + if (iter.index >= end) + break; + page = list_first_entry_or_null(&pagelist, + struct page, lru); + if (!page || iter.index < page->index) { + if (!nr_none) + break; + /* Put holes back where they were */ + radix_tree_replace_slot(slot, NULL); + nr_none--; + continue; + } + + VM_BUG_ON_PAGE(page->index != iter.index, page); + + /* Unfreeze the page. */ + list_del(&page->lru); + page_ref_unfreeze(page, 2); + radix_tree_replace_slot(slot, page); + spin_unlock_irq(&mapping->tree_lock); + putback_lru_page(page); + unlock_page(page); + spin_lock_irq(&mapping->tree_lock); + } + VM_BUG_ON(nr_none); + spin_unlock_irq(&mapping->tree_lock); + + /* Unfreeze new_page, caller would take care about freeing it */ + page_ref_unfreeze(new_page, 1); + mem_cgroup_cancel_charge(new_page, memcg, true); + unlock_page(new_page); + new_page->mapping = NULL; + } +out: + VM_BUG_ON(!list_empty(&pagelist)); + /* TODO: tracepoints */ +} + +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + struct page *page = NULL; + struct radix_tree_iter iter; + void **slot; + int present, swap; + int node = NUMA_NO_NODE; + int result = SCAN_SUCCEED; + + present = 0; + swap = 0; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= start + HPAGE_PMD_NR) + break; + + page = radix_tree_deref_slot(slot); + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (radix_tree_exception(page)) { + if (++swap > khugepaged_max_ptes_swap) { + result = SCAN_EXCEED_SWAP_PTE; + break; + } + continue; + } + + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + break; + } + + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; + break; + } + khugepaged_node_load[node]++; + + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + break; + } + + if (page_count(page) != 1 + page_mapcount(page)) { + result = SCAN_PAGE_COUNT; + break; + } + + /* + * We probably should check if the page is referenced here, but + * nobody would transfer pte_young() to PageReferenced() for us. + * And rmap walk here is just too costly... + */ + + present++; + + if (need_resched()) { + cond_resched_rcu(); + slot = radix_tree_iter_next(&iter); + } + } + rcu_read_unlock(); + + if (result == SCAN_SUCCEED) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + result = SCAN_EXCEED_NONE_PTE; + } else { + node = khugepaged_find_target_node(); + collapse_shmem(mm, mapping, start, hpage, node); + } + } + + /* TODO: tracepoints */ +} +#else +static void khugepaged_scan_shmem(struct mm_struct *mm, + struct address_space *mapping, + pgoff_t start, struct page **hpage) +{ + BUILD_BUG(); +} +#endif + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + if (!hugepage_vma_check(vma)) { +skip: + progress++; + continue; + } + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + if (shmem_file(vma->vm_file)) { + struct file *file; + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + if (!shmem_huge_enabled(vma)) + goto skip; + file = get_file(vma->vm_file); + up_read(&mm->mmap_sem); + ret = 1; + khugepaged_scan_shmem(mm, file->f_mapping, + pgoff, hpage); + fput(file); + } else { + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + } + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + kthread_should_stop(); +} + +static void khugepaged_do_scan(void) +{ + struct page *hpage = NULL; + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + if (!khugepaged_prealloc_page(&hpage, &wait)) + break; + + cond_resched(); + + if (unlikely(kthread_should_stop() || try_to_freeze())) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + &hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } + + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); +} + +static bool khugepaged_should_wakeup(void) +{ + return kthread_should_stop() || + time_after_eq(jiffies, khugepaged_sleep_expire); +} + +static void khugepaged_wait_work(void) +{ + if (khugepaged_has_work()) { + const unsigned long scan_sleep_jiffies = + msecs_to_jiffies(khugepaged_scan_sleep_millisecs); + + if (!scan_sleep_jiffies) + return; + + khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; + wait_event_freezable_timeout(khugepaged_wait, + khugepaged_should_wakeup(), + scan_sleep_jiffies); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, MAX_NICE); + + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +static void set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + + for_each_populated_zone(zone) + nr_zones++; + + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. + */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + + min_free_kbytes = recommended_min; + } + setup_per_zone_wmarks(); +} + +int start_stop_khugepaged(void) +{ + static struct task_struct *khugepaged_thread __read_mostly; + static DEFINE_MUTEX(khugepaged_mutex); + int err = 0; + + mutex_lock(&khugepaged_mutex); + if (khugepaged_enabled()) { + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (IS_ERR(khugepaged_thread)) { + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + goto fail; + } + + if (!list_empty(&khugepaged_scan.mm_head)) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } +fail: + mutex_unlock(&khugepaged_mutex); + return err; +} diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 04320d3adbef..086292f7c59d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg) * Wait before the first scan to allow the system to fully initialize. */ if (first_run) { + signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); first_run = 0; - ssleep(SECS_FIRST_SCAN); + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); } while (!kthread_should_stop()) { @@ -376,9 +376,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) - ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE | - FAULT_FLAG_REMOTE); + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); @@ -532,8 +531,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) void *expected_mapping; unsigned long kpfn; - expected_mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + expected_mapping = (void *)((unsigned long)stable_node | + PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); page = pfn_to_page(kpfn); diff --git a/mm/memblock.c b/mm/memblock.c index ac1248933b31..ff5ff3b5f1ea 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,7 @@ #include <linux/seq_file.h> #include <linux/memblock.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #include <linux/io.h> #include "internal.h" @@ -584,6 +584,9 @@ repeat: nid, flags); } + if (!nr_new) + return 0; + /* * If this was the first round, resize array and repeat for actual * insertions; otherwise, merge and return. @@ -1024,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, *out_end = m_end; if (out_nid) *out_nid = m_nid; - idx_a++; + idx_a--; *idx = (u32)idx_a | (u64)idx_b << 32; return; } @@ -1462,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } -void __init memblock_enforce_memory_limit(phys_addr_t limit) +static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; struct memblock_region *r; - if (!limit) - return; - - /* find out max address */ + /* + * translate the memory @limit size into the max address within one of + * the memory memblock regions, if the @limit exceeds the total size + * of those regions, max_addr will keep original value ULLONG_MAX + */ for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; @@ -1479,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) limit -= r->size; } + return max_addr; +} + +void __init memblock_enforce_memory_limit(phys_addr_t limit) +{ + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == (phys_addr_t)ULLONG_MAX) + return; + /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); @@ -1486,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) (phys_addr_t)ULLONG_MAX); } +void __init memblock_mem_limit_remove_map(phys_addr_t limit) +{ + struct memblock_type *type = &memblock.memory; + phys_addr_t max_addr; + int i, ret, start_rgn, end_rgn; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == (phys_addr_t)ULLONG_MAX) + return; + + ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, + &start_rgn, &end_rgn); + if (ret) + return; + + /* remove all the MAP regions above the limit */ + for (i = end_rgn - 1; i >= start_rgn; i--) { + if (!memblock_is_nomap(&type->regions[i])) + memblock_remove_region(type, i); + } + /* truncate the reserved regions */ + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); +} + static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) { unsigned int left = 0, right = type->cnt; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac8664db3823..c265212bec8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = { * their hierarchy representation */ -struct mem_cgroup_tree_per_zone { +struct mem_cgroup_tree_per_node { struct rb_root rb_root; spinlock_t lock; }; -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - struct mem_cgroup_tree { struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; }; @@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* !CONFIG_SLOB */ -static struct mem_cgroup_per_zone * -mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - - return &memcg->nodeinfo[nid]->zoneinfo[zid]; -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_zone * -mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) +static struct mem_cgroup_per_node * +mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &memcg->nodeinfo[nid]->zoneinfo[zid]; + return memcg->nodeinfo[nid]; } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) +static struct mem_cgroup_tree_per_node * +soft_limit_tree_node(int nid) { - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static struct mem_cgroup_tree_per_zone * +static struct mem_cgroup_tree_per_node * soft_limit_tree_from_page(struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; + struct mem_cgroup_per_node *mz_node; if (mz->on_tree) return; @@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, return; while (*p) { parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); if (mz->usage_in_excess < mz_node->usage_in_excess) p = &(*p)->rb_left; @@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = true; } -static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { if (!mz->on_tree) return; @@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = false; } -static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { unsigned long flags; @@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); /* @@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - struct mem_cgroup_tree_per_zone *mctz; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_tree_per_node *mctz; + struct mem_cgroup_per_node *mz; + int nid; for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - mctz = soft_limit_tree_node_zone(nid, zid); - mem_cgroup_remove_exceeded(mz, mctz); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + mctz = soft_limit_tree_node(nid); + mem_cgroup_remove_exceeded(mz, mctz); } } -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; retry: mz = NULL; @@ -532,7 +515,7 @@ retry: if (!rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -546,10 +529,10 @@ done: return mz; } -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); @@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { unsigned long nr = 0; - int zid; + struct mem_cgroup_per_node *mz; + enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - nr += mz->lru_size[lru]; - } + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = mem_cgroup_nodeinfo(memcg, nid); + nr += mz->lru_size[lru]; } return nr; } @@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); if (reclaim) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); iter = &mz->iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) @@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) { struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_per_node *mz; + int nid; int i; while ((memcg = parent_mem_cgroup(memcg))) { for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } @@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg - * @zone: zone of the wanted lruvec - * @memcg: memcg of the wanted lruvec - * - * Returns the lru list vector holding pages for the given @zone and - * @mem. This can be the global zone lruvec, if the memory controller - * is disabled. - */ -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_zone *mz; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; - goto out; - } - - mz = mem_cgroup_zone_zoneinfo(memcg, zone); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. - */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; - return lruvec; -} - -/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -985,14 +929,14 @@ out: * and putback protocol: the LRU lock must be held, and the page must * either be PageLRU() or the caller must have isolated/allocated it. */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; struct mem_cgroup *memcg; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; + lruvec = &pgdat->lruvec; goto out; } @@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1012,8 +956,8 @@ out: * we have to be prepared to initialize lruvec->zone here; * and if offlined then reonlined, we need to reinitialize it. */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; + if (unlikely(lruvec->pgdat != pgdat)) + lruvec->pgdat = pgdat; return lruvec; } @@ -1030,17 +974,15 @@ out: void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; bool empty; - __update_lru_size(lruvec, lru, nr_pages); - if (mem_cgroup_disabled()) return; - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = mz->lru_size + lru; empty = list_empty(lruvec->lists + lru); @@ -1259,6 +1201,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = memcg, .gfp_mask = gfp_mask, .order = order, }; @@ -1275,13 +1218,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || task_will_free_mem(current)) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); goto unlock; } - check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1289,7 +1232,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task, totalpages)) { + switch (oom_scan_process_thread(&oc, task)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1329,7 +1272,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, memcg, + oom_kill_process(&oc, chosen, points, totalpages, "Memory cgroup out of memory"); } unlock: @@ -1432,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, + pg_data_t *pgdat, gfp_t gfp_mask, unsigned long *total_scanned) { @@ -1442,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, unsigned long excess; unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = 0, }; @@ -1472,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); + total += mem_cgroup_shrink_node(victim, gfp_mask, false, + pgdat, &nr_scanned); *total_scanned += nr_scanned; if (!soft_limit_excess(root_memcg)) break; @@ -2106,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated) { struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2125,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated) if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2272,20 +2215,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -/* +static inline bool memcg_kmem_bypass(void) +{ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + return false; +} + +/** + * memcg_kmem_get_cache: select the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. * - * If the cache does not exist yet, if we are the first user of it, - * we either create it immediately, if possible, or create it asynchronously - * in a workqueue. - * In the latter case, we will let the current allocation go through with - * the original cache. + * If the cache does not exist yet, if we are the first user of it, we + * create it asynchronously in a workqueue and let the current allocation + * go through with the original cache. * - * Can't be called in interrupt context or from kernel threads. - * This function needs to be called with rcu_read_lock() held. + * This function takes a reference to the cache it returns to assure it + * won't get destroyed while we are working with it. Once the caller is + * done with it, memcg_kmem_put_cache() must be called to release the + * reference. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2293,10 +2246,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) VM_BUG_ON(!is_root_cache(cachep)); - if (cachep->flags & SLAB_ACCOUNT) - gfp |= __GFP_ACCOUNT; - - if (!(gfp & __GFP_ACCOUNT)) + if (memcg_kmem_bypass()) return cachep; if (current->memcg_kmem_skip_account) @@ -2329,14 +2279,27 @@ out: return cachep; } -void __memcg_kmem_put_cache(struct kmem_cache *cachep) +/** + * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache + * @cachep: the cache returned by memcg_kmem_get_cache + */ +void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg) +/** + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * @memcg: memory cgroup to charge + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; struct page_counter *counter; @@ -2357,19 +2320,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, return 0; } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +/** + * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; + if (memcg_kmem_bypass()) + return 0; + memcg = get_mem_cgroup_from_mm(current->mm); if (!mem_cgroup_is_root(memcg)) - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } - -void __memcg_kmem_uncharge(struct page *page, int order) +/** + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order + */ +void memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2395,7 +2373,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock and migration entries setup in all page mappings. + * zone_lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2565,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; + struct mem_cgroup_per_node *mz, *next_mz = NULL; unsigned long reclaimed; int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_tree_per_node *mctz; unsigned long excess; unsigned long nr_scanned; if (order > 0) return 0; - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + mctz = soft_limit_tree_node(pgdat->node_id); /* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under @@ -2595,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; @@ -3216,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_VM { - int nid, zid; - struct mem_cgroup_per_zone *mz; + pg_data_t *pgdat; + struct mem_cgroup_per_node *mz; struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; - for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - rstat = &mz->lruvec.reclaim_stat; + for_each_online_pgdat(pgdat) { + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; - } + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; + } seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); @@ -4057,11 +4034,64 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. + */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg->id.ref); +} + +static void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + if (atomic_dec_and_test(&memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + +static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - struct mem_cgroup_per_zone *mz; - int zone, tmp = node; + int tmp = node; /* * This routine is called against possible nodes. * But it's BUG to call kmalloc() against offline node. @@ -4076,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = &pn->zoneinfo[zone]; - lruvec_init(&mz->lruvec); - mz->usage_in_excess = 0; - mz->on_tree = false; - mz->memcg = memcg; - } + lruvec_init(&pn->lruvec); + pn->usage_in_excess = 0; + pn->on_tree = false; + pn->memcg = memcg; + memcg->nodeinfo[node] = pn; return 0; } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { kfree(memcg->nodeinfo[node]); } @@ -4098,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) memcg_wb_domain_exit(memcg); for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); + free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); kfree(memcg); } @@ -4116,12 +4144,18 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg) return NULL; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto fail; + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto fail; for_each_node(node) - if (alloc_mem_cgroup_per_zone_info(memcg, node)) + if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; if (memcg_wb_domain_init(memcg, GFP_KERNEL)) @@ -4142,8 +4176,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); mem_cgroup_free(memcg); return NULL; } @@ -4206,12 +4243,11 @@ fail: return ERR_PTR(-ENOMEM); } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); return 0; } @@ -4234,6 +4270,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4345,7 +4383,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, #ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); @@ -4364,7 +4402,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, } #else static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { return NULL; } @@ -4407,7 +4445,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /** * mem_cgroup_move_account - move account of the page * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) + * @compound: charge the page as compound or small page * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -4529,7 +4567,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) - page = mc_handle_swap_pte(vma, addr, ptent, &ent); + page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); @@ -5133,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); @@ -5269,6 +5307,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * @mm: mm context of the victim * @gfp_mask: reclaim mode * @memcgp: charged memcg return + * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. @@ -5331,6 +5370,7 @@ out: * @page: page to charge * @memcg: memcg to charge the page to * @lrucare: page might be on LRU already + * @compound: charge the page as compound or small page * * Finalize a charge transaction started by mem_cgroup_try_charge(), * after page->mapping has been set up. This must happen atomically @@ -5382,6 +5422,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * mem_cgroup_cancel_charge - cancel a page charge * @page: page to charge * @memcg: memcg to charge the page to + * @compound: charge the page as compound or small page * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ @@ -5405,15 +5446,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, struct page *dummy_page) + unsigned long nr_huge, unsigned long nr_kmem, + struct page *dummy_page) { - unsigned long nr_pages = nr_anon + nr_file; + unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) + page_counter_uncharge(&memcg->kmem, nr_kmem); memcg_oom_recover(memcg); } @@ -5436,6 +5480,7 @@ static void uncharge_list(struct list_head *page_list) unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; + unsigned long nr_kmem = 0; unsigned long pgpgout = 0; struct list_head *next; struct page *page; @@ -5446,8 +5491,6 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { - unsigned int nr_pages = 1; - page = list_entry(next, struct page, lru); next = page->lru.next; @@ -5466,31 +5509,34 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); - pgpgout = nr_anon = nr_file = nr_huge = 0; + nr_huge, nr_kmem, page); + pgpgout = nr_anon = nr_file = + nr_huge = nr_kmem = 0; } memcg = page->mem_cgroup; } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - nr_huge += nr_pages; - } + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; - if (PageAnon(page)) - nr_anon += nr_pages; - else - nr_file += nr_pages; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + nr_huge += nr_pages; + } + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + pgpgout++; + } else + nr_kmem += 1 << compound_order(page); page->mem_cgroup = NULL; - - pgpgout++; } while (next != page_list); if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); + nr_huge, nr_kmem, page); } /** @@ -5712,18 +5758,12 @@ static int __init mem_cgroup_init(void) for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; - int zone; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct mem_cgroup_tree_per_zone *rtpz; - - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } + rtpn->rb_root = RB_ROOT; + spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } @@ -5756,6 +5796,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); @@ -5774,6 +5815,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); } /* @@ -5804,11 +5848,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) !page_counter_try_charge(&memcg->swap, 1, &counter)) return -ENOMEM; + mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); return 0; } @@ -5837,7 +5881,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) page_counter_uncharge(&memcg->memsw, 1); } mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2fcca6b0e005..de88f33519c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) * page->lru because it can be used in other hugepage operations, * such as __unmap_hugepage_range() and gather_surplus_pages(). * So instead we use page_mapping() and PageAnon(). - * We assume that this function is called with page lock held, - * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); @@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, @@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags) if (ret) { if (!list_empty(&pagelist)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } diff --git a/mm/memory.c b/mm/memory.c index cd1f29e4897e..4425b6059339 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -233,6 +233,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif + tlb->page_size = 0; __tlb_reset_range(tlb); } @@ -292,23 +293,31 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e * handling the additional races in SMP caused by other CPUs caching valid * mappings in their TLBs. Returns the number of free page slots left. * When out of page slots we must call tlb_flush_mmu(). + *returns true if the caller should flush. */ -int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); + if (!tlb->page_size) + tlb->page_size = page_size; + else { + if (page_size != tlb->page_size) + return true; + } + batch = tlb->active; - batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) - return 0; + return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - return batch->max - batch->nr; + batch->pages[batch->nr++] = page; + return false; } #endif /* HAVE_GENERIC_MMU_GATHER */ @@ -1109,6 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + struct page *pending_page = NULL; again: init_rss_vec(rss); @@ -1132,7 +1142,7 @@ again: * unmap shared but keep private pages. */ if (details->check_mapping && - details->check_mapping != page->mapping) + details->check_mapping != page_rmapping(page)) continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -1160,8 +1170,9 @@ again: page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(!__tlb_remove_page(tlb, page))) { + if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; + pending_page = page; addr += PAGE_SIZE; break; } @@ -1202,7 +1213,11 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - + if (pending_page) { + /* remove the page with new size */ + __tlb_remove_pte_page(tlb, pending_page); + pending_page = NULL; + } if (addr != end) goto again; } @@ -1479,7 +1494,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page); + page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; @@ -2055,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - struct page *page, int page_mkwrite, - int dirty_shared) - __releases(ptl) +static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, + struct page *page, int page_mkwrite, int dirty_shared) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2071,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, address, pte_pfn(orig_pte)); + flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry, 1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); + if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) + update_mmu_cache(vma, fe->address, fe->pte); + pte_unmap_unlock(fe->pte, fe->ptl); if (dirty_shared) { struct address_space *mapping; @@ -2122,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - pte_t orig_pte, struct page *old_page) +static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, + struct page *old_page) { + struct vm_area_struct *vma = fe->vma; + struct mm_struct *mm = vma->vm_mm; struct page *new_page = NULL; - spinlock_t *ptl = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ - const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, address); + new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); if (!new_page) goto oom; } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + fe->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, address, vma); + cow_user_page(new_page, old_page, fe->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2158,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, /* * Re-check the pte - we dropped the lock */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) { + fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); + if (likely(pte_same(*fe->pte, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2169,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, address, pte_pfn(orig_pte)); + flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2178,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address, false); + ptep_clear_flush_notify(vma, fe->address, fe->pte); + page_add_new_anon_rmap(new_page, vma, fe->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2187,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, page_table); + set_pte_at_notify(mm, fe->address, fe->pte, entry); + update_mmu_cache(vma, fe->address, fe->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2225,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (new_page) put_page(new_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2253,44 +2267,43 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, - pmd_t *pmd) +static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) { + struct vm_area_struct *vma = fe->vma; + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { struct vm_fault vmf = { .page = NULL, - .pgoff = linear_page_index(vma, address), - .virtual_address = (void __user *)(address & PAGE_MASK), + .pgoff = linear_page_index(vma, fe->address), + .virtual_address = + (void __user *)(fe->address & PAGE_MASK), .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); if (ret & VM_FAULT_ERROR) return ret; - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); /* * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*page_table, orig_pte)) { - pte_unmap_unlock(page_table, ptl); + if (!pte_same(*fe->pte, orig_pte)) { + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } } - return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, - NULL, 0, 0); + return wp_page_reuse(fe, orig_pte, NULL, 0, 0); } -static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, - pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, - struct page *old_page) - __releases(ptl) +static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, + struct page *old_page) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; int page_mkwrite = 0; get_page(old_page); @@ -2298,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(page_table, ptl); - tmp = do_page_mkwrite(vma, old_page, address); + pte_unmap_unlock(fe->pte, fe->ptl); + tmp = do_page_mkwrite(vma, old_page, fe->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(old_page); @@ -2311,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); put_page(old_page); return 0; } page_mkwrite = 1; } - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); } /* @@ -2344,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) - __releases(ptl) +static int do_wp_page(struct fault_env *fe, pte_t orig_pte) + __releases(fe->ptl) { + struct vm_area_struct *vma = fe->vma; struct page *old_page; - old_page = vm_normal_page(vma, address, orig_pte); + old_page = vm_normal_page(vma, fe->address, orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2362,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(mm, vma, address, page_table, ptl, - orig_pte, pmd); + return wp_pfn_shared(fe, orig_pte); - pte_unmap_unlock(page_table, ptl); - return wp_page_copy(mm, vma, address, page_table, pmd, - orig_pte, old_page); + pte_unmap_unlock(fe->pte, fe->ptl); + return wp_page_copy(fe, orig_pte, old_page); } /* @@ -2378,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + if (!pte_same(*fe->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); put_page(old_page); return 0; } @@ -2399,18 +2408,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(compound_head(old_page), - vma, address); + page_move_anon_rmap(old_page, vma); } unlock_page(old_page); - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + return wp_page_reuse(fe, orig_pte, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(mm, vma, address, page_table, pmd, - ptl, orig_pte, old_page); + return wp_page_shared(fe, orig_pte, old_page); } /* @@ -2418,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ get_page(old_page); - pte_unmap_unlock(page_table, ptl); - return wp_page_copy(mm, vma, address, page_table, pmd, - orig_pte, old_page); + pte_unmap_unlock(fe->pte, fe->ptl); + return wp_page_copy(fe, orig_pte, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2508,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) +int do_swap_page(struct fault_env *fe, pte_t orig_pte) { - spinlock_t *ptl; + struct vm_area_struct *vma = fe->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2521,17 +2524,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, int exclusive = 0; int ret = 0; - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, address, orig_pte, NULL); + print_bad_pte(vma, fe->address, orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2540,14 +2543,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page = lookup_swap_cache(entry); if (!page) { page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, address); + GFP_HIGHUSER_MOVABLE, vma, fe->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (likely(pte_same(*page_table, orig_pte))) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, + fe->address, &fe->ptl); + if (likely(pte_same(*fe->pte, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2556,7 +2560,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -2569,7 +2573,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swapcache = page; - locked = lock_page_or_retry(page, mm, flags); + locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2586,14 +2590,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, address); + page = ksm_might_need_to_copy(page, vma, fe->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2601,8 +2606,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Back out if somebody else already faulted in this pte. */ - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*page_table, orig_pte))) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (unlikely(!pte_same(*fe->pte, orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2620,24 +2626,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * must be called after the swap_free(), or it will never succeed. */ - inc_mm_counter_fast(mm, MM_ANONPAGES); - dec_mm_counter_fast(mm, MM_SWAPENTS); + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - flags &= ~FAULT_FLAG_WRITE; + fe->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(mm, address, page_table, pte); + set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); if (page == swapcache) { - do_page_add_anon_rmap(page, vma, address, exclusive); + do_page_add_anon_rmap(page, vma, fe->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address, false); + page_add_new_anon_rmap(page, vma, fe->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2660,22 +2666,22 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(swapcache); } - if (flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + if (fe->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(fe, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, page_table); + update_mmu_cache(vma, fe->address, fe->pte); unlock: - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); out_page: unlock_page(page); out_release: @@ -2726,37 +2732,51 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags) +static int do_anonymous_page(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; struct mem_cgroup *memcg; struct page *page; - spinlock_t *ptl; pte_t entry; - pte_unmap(page_table); - /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, address) < 0) + if (check_stack_guard_page(vma, fe->address) < 0) return VM_FAULT_SIGSEGV; + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + return VM_FAULT_OOM; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(fe->pmd))) + return 0; + /* Use the zero-page for reads */ - if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + if (!(fe->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), vma->vm_page_prot)); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_none(*fe->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(page_table, ptl); - return handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + pte_unmap_unlock(fe->pte, fe->ptl); + return handle_userfault(fe, VM_UFFD_MISSING); } goto setpte; } @@ -2764,11 +2784,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, address); + page = alloc_zeroed_user_highpage_movable(vma, fe->address); if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2782,30 +2802,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_none(*page_table)) + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + if (!pte_none(*fe->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(vma, address, flags, - VM_UFFD_MISSING); + return handle_userfault(fe, VM_UFFD_MISSING); } - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address, false); + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, fe->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(mm, address, page_table, entry); + set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, page_table); + update_mmu_cache(vma, fe->address, fe->pte); unlock: - pte_unmap_unlock(page_table, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2822,17 +2842,16 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct vm_area_struct *vma, unsigned long address, - pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page, - void **entry) +static int __do_fault(struct fault_env *fe, pgoff_t pgoff, + struct page *cow_page, struct page **page, void **entry) { + struct vm_area_struct *vma = fe->vma; struct vm_fault vmf; int ret; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); vmf.pgoff = pgoff; - vmf.flags = flags; + vmf.flags = fe->flags; vmf.page = NULL; vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; @@ -2861,41 +2880,168 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } +static int pte_alloc_one_map(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + if (!pmd_none(*fe->pmd)) + goto map_pte; + if (fe->prealloc_pte) { + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) { + spin_unlock(fe->ptl); + goto map_pte; + } + + atomic_long_inc(&vma->vm_mm->nr_ptes); + pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); + spin_unlock(fe->ptl); + fe->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + return VM_FAULT_OOM; + } +map_pte: + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. + */ + if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + return VM_FAULT_NOPAGE; + + fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, + &fe->ptl); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + +#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) +static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, + unsigned long haddr) +{ + if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != + (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) + return false; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return false; + return true; +} + +static int do_set_pmd(struct fault_env *fe, struct page *page) +{ + struct vm_area_struct *vma = fe->vma; + bool write = fe->flags & FAULT_FLAG_WRITE; + unsigned long haddr = fe->address & HPAGE_PMD_MASK; + pmd_t entry; + int i, ret; + + if (!transhuge_vma_suitable(vma, haddr)) + return VM_FAULT_FALLBACK; + + ret = VM_FAULT_FALLBACK; + page = compound_head(page); + + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + if (unlikely(!pmd_none(*fe->pmd))) + goto out; + + for (i = 0; i < HPAGE_PMD_NR; i++) + flush_icache_page(vma, page + i); + + entry = mk_huge_pmd(page, vma->vm_page_prot); + if (write) + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + + add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); + page_add_file_rmap(page, true); + + set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + + update_mmu_cache_pmd(vma, haddr, fe->pmd); + + /* fault is handled */ + ret = 0; + count_vm_event(THP_FILE_MAPPED); +out: + spin_unlock(fe->ptl); + return ret; +} +#else +static int do_set_pmd(struct fault_env *fe, struct page *page) +{ + BUILD_BUG(); + return 0; +} +#endif + /** - * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * alloc_set_pte - setup new PTE entry for given page and add reverse page + * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @vma: virtual memory area - * @address: user virtual address + * @fe: fault environment + * @memcg: memcg to charge page (only for private mappings) * @page: page to map - * @pte: pointer to target page table entry - * @write: true, if new entry is writable - * @anon: true, if it's anonymous page * - * Caller must hold page table lock relevant for @pte. + * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, + struct page *page) { + struct vm_area_struct *vma = fe->vma; + bool write = fe->flags & FAULT_FLAG_WRITE; pte_t entry; + int ret; + + if (pmd_none(*fe->pmd) && PageTransCompound(page) && + IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + /* THP on COW? */ + VM_BUG_ON_PAGE(memcg, page); + + ret = do_set_pmd(fe, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (!fe->pte) { + ret = pte_alloc_one_map(fe); + if (ret) + return ret; + } + + /* Re-check under ptl */ + if (unlikely(!pte_none(*fe->pte))) + return VM_FAULT_NOPAGE; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { + /* copy-on-write page */ + if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address, false); + page_add_new_anon_rmap(page, vma, fe->address, false); + mem_cgroup_commit_charge(page, memcg, false, false); + lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page); + page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, fe->address, fe->pte); + + return 0; } static unsigned long fault_around_bytes __read_mostly = @@ -2962,57 +3108,66 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static void do_fault_around(struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pgoff_t pgoff, unsigned int flags) +static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) { - unsigned long start_addr, nr_pages, mask; - pgoff_t max_pgoff; - struct vm_fault vmf; - int off; + unsigned long address = fe->address, nr_pages, mask; + pgoff_t end_pgoff; + int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - start_addr = max(address & mask, vma->vm_start); - off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); - pte -= off; - pgoff -= off; + fe->address = max(address & mask, fe->vma->vm_start); + off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + start_pgoff -= off; /* - * max_pgoff is either end of page table or end of vma - * or fault_around_pages() from pgoff, depending what is nearest. + * end_pgoff is either end of page table or end of vma + * or fault_around_pages() from start_pgoff, depending what is nearest. */ - max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + end_pgoff = start_pgoff - + ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + nr_pages - 1); + end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + start_pgoff + nr_pages - 1); - /* Check if it makes any sense to call ->map_pages */ - while (!pte_none(*pte)) { - if (++pgoff > max_pgoff) - return; - start_addr += PAGE_SIZE; - if (start_addr >= vma->vm_end) - return; - pte++; + if (pmd_none(*fe->pmd)) { + fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); + smp_wmb(); /* See comment in __pte_alloc() */ } - vmf.virtual_address = (void __user *) start_addr; - vmf.pte = pte; - vmf.pgoff = pgoff; - vmf.max_pgoff = max_pgoff; - vmf.flags = flags; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vma->vm_ops->map_pages(vma, &vmf); + fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + /* Huge page is mapped? Page fault is solved */ + if (pmd_trans_huge(*fe->pmd)) { + ret = VM_FAULT_NOPAGE; + goto out; + } + + /* ->map_pages() haven't done anything useful. Cold page cache? */ + if (!fe->pte) + goto out; + + /* check if the page fault is solved */ + fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*fe->pte)) + ret = VM_FAULT_NOPAGE; + pte_unmap_unlock(fe->pte, fe->ptl); +out: + fe->address = address; + fe->pte = NULL; + return ret; } -static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) { + struct vm_area_struct *vma = fe->vma; struct page *fault_page; - spinlock_t *ptl; - pte_t *pte; int ret = 0; /* @@ -3021,85 +3176,64 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); - if (!pte_same(*pte, orig_pte)) - goto unlock_out; - pte_unmap_unlock(pte, ptl); + ret = do_fault_around(fe, pgoff); + if (ret) + return ret; } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); + ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); - unlock_page(fault_page); - put_page(fault_page); - return ret; - } - do_set_pte(vma, address, fault_page, pte, false, false); + ret |= alloc_set_pte(fe, NULL, fault_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); unlock_page(fault_page); -unlock_out: - pte_unmap_unlock(pte, ptl); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + put_page(fault_page); return ret; } -static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) { + struct vm_area_struct *vma = fe->vma; struct page *fault_page, *new_page; void *fault_entry; struct mem_cgroup *memcg; - spinlock_t *ptl; - pte_t *pte; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { + if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { put_page(new_page); return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, - &fault_entry); + ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, address, vma); + copy_user_highpage(new_page, fault_page, fe->address, vma); __SetPageUptodate(new_page); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, - pgoff); - } - goto uncharge_out; - } - do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); - pte_unmap_unlock(pte, ptl); + ret |= alloc_set_pte(fe, memcg, new_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg, false); @@ -3107,18 +3241,15 @@ uncharge_out: return ret; } -static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) { + struct vm_area_struct *vma = fe->vma; struct page *fault_page; struct address_space *mapping; - spinlock_t *ptl; - pte_t *pte; int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); + ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3128,7 +3259,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->page_mkwrite) { unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, address); + tmp = do_page_mkwrite(vma, fault_page, fe->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(fault_page); @@ -3136,15 +3267,15 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (unlikely(!pte_same(*pte, orig_pte))) { - pte_unmap_unlock(pte, ptl); + ret |= alloc_set_pte(fe, NULL, fault_page); + if (fe->pte) + pte_unmap_unlock(fe->pte, fe->ptl); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) { unlock_page(fault_page); put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); - pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) dirtied = 1; @@ -3176,23 +3307,19 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) +static int do_fault(struct fault_env *fe) { - pgoff_t pgoff = linear_page_index(vma, address); + struct vm_area_struct *vma = fe->vma; + pgoff_t pgoff = linear_page_index(vma, fe->address); - pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); + if (!(fe->flags & FAULT_FLAG_WRITE)) + return do_read_fault(fe, pgoff); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_cow_fault(fe, pgoff); + return do_shared_fault(fe, pgoff); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3210,11 +3337,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +static int do_numa_page(struct fault_env *fe, pte_t pte) { + struct vm_area_struct *vma = fe->vma; struct page *page = NULL; - spinlock_t *ptl; int page_nid = -1; int last_cpupid; int target_nid; @@ -3234,10 +3360,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. */ - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) { - pte_unmap_unlock(ptep, ptl); + fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + if (unlikely(!pte_same(*fe->pte, pte))) { + pte_unmap_unlock(fe->pte, fe->ptl); goto out; } @@ -3246,18 +3372,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(mm, addr, ptep, pte); - update_mmu_cache(vma, addr, ptep); + set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + update_mmu_cache(vma, fe->address, fe->pte); - page = vm_normal_page(vma, addr, pte); + page = vm_normal_page(vma, fe->address, pte); if (!page) { - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } @@ -3281,8 +3407,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); - pte_unmap_unlock(ptep, ptl); + target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + &flags); + pte_unmap_unlock(fe->pte, fe->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3302,24 +3429,29 @@ out: return 0; } -static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) +static int create_huge_pmd(struct fault_env *fe) { + struct vm_area_struct *vma = fe->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + return do_huge_pmd_anonymous_page(fe); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, + fe->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, pmd_t orig_pmd, - unsigned int flags) +static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) { - if (vma_is_anonymous(vma)) - return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + if (vma_is_anonymous(fe->vma)) + return do_huge_pmd_wp_page(fe, orig_pmd); + if (fe->vma->vm_ops->pmd_fault) + return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, + fe->flags); + + /* COW handled on pte level: split pmd */ + VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); + split_huge_pmd(fe->vma, fe->pmd, fe->address); + return VM_FAULT_FALLBACK; } @@ -3332,59 +3464,79 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. + * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow + * concurrent faults). * - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). + * The mmap_sem may have been released depending on flags and our return value. + * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +static int handle_pte_fault(struct fault_env *fe) { pte_t entry; - spinlock_t *ptl; - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y, - * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses. - * The code below just needs a consistent view for the ifs and - * we later double check anyway with the ptl lock held. So here - * a barrier will do. - */ - entry = *pte; - barrier(); - if (!pte_present(entry)) { + if (unlikely(pmd_none(*fe->pmd))) { + /* + * Leave __pte_alloc() until later: because vm_ops->fault may + * want to allocate huge page, and if we expose page table + * for an instant, it will be difficult to retract from + * concurrent faults and from rmap lookups. + */ + fe->pte = NULL; + } else { + /* See comment in pte_alloc_one_map() */ + if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + return 0; + /* + * A regular pmd is established and it can't morph into a huge + * pmd from under us anymore at this point because we hold the + * mmap_sem read mode and khugepaged takes it in write mode. + * So now it's safe to run pte_offset_map(). + */ + fe->pte = pte_offset_map(fe->pmd, fe->address); + + entry = *fe->pte; + + /* + * some architectures can have larger ptes than wordsize, + * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and + * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee + * atomic accesses. The code below just needs a consistent + * view for the ifs and we later double check anyway with the + * ptl lock held. So here a barrier will do. + */ + barrier(); if (pte_none(entry)) { - if (vma_is_anonymous(vma)) - return do_anonymous_page(mm, vma, address, - pte, pmd, flags); - else - return do_fault(mm, vma, address, pte, pmd, - flags, entry); + pte_unmap(fe->pte); + fe->pte = NULL; } - return do_swap_page(mm, vma, address, - pte, pmd, flags, entry); } + if (!fe->pte) { + if (vma_is_anonymous(fe->vma)) + return do_anonymous_page(fe); + else + return do_fault(fe); + } + + if (!pte_present(entry)) + return do_swap_page(fe, entry); + if (pte_protnone(entry)) - return do_numa_page(mm, vma, address, entry, pte, pmd); + return do_numa_page(fe, entry); - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); - if (unlikely(!pte_same(*pte, entry))) + fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); + spin_lock(fe->ptl); + if (unlikely(!pte_same(*fe->pte, entry))) goto unlock; - if (flags & FAULT_FLAG_WRITE) { + if (fe->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, - pte, pmd, ptl, entry); + return do_wp_page(fe, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, pte); + if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, + fe->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(fe->vma, fe->address, fe->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3392,11 +3544,11 @@ static int handle_pte_fault(struct mm_struct *mm, * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vma, address); + if (fe->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(fe->vma, fe->address); } unlock: - pte_unmap_unlock(pte, ptl); + pte_unmap_unlock(fe->pte, fe->ptl); return 0; } @@ -3406,87 +3558,51 @@ unlock: * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { + struct fault_env fe = { + .vma = vma, + .address = address, + .flags = flags, + }; + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, - flags & FAULT_FLAG_INSTRUCTION, - flags & FAULT_FLAG_REMOTE)) - return VM_FAULT_SIGSEGV; - - if (unlikely(is_vm_hugetlb_page(vma))) - return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) + fe.pmd = pmd_alloc(mm, pud, address); + if (!fe.pmd) return VM_FAULT_OOM; - if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(mm, vma, address, pmd, flags); + if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&fe); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *pmd; + pmd_t orig_pmd = *fe.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (pmd_protnone(orig_pmd)) - return do_huge_pmd_numa_page(mm, vma, address, - orig_pmd, pmd); + return do_huge_pmd_numa_page(&fe, orig_pmd); - if (dirty && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(mm, vma, address, pmd, - orig_pmd, flags); + if ((fe.flags & FAULT_FLAG_WRITE) && + !pmd_write(orig_pmd)) { + ret = wp_huge_pmd(&fe, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(mm, vma, address, pmd, - orig_pmd, dirty); + huge_pmd_set_accessed(&fe, orig_pmd); return 0; } } } - /* - * Use pte_alloc() instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pte_alloc(mm, pmd, address))) - return VM_FAULT_OOM; - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd - * didn't become pmd_trans_huge under us and then back to pmd_none, as - * a result of MADV_DONTNEED running immediately after a huge pmd fault - * in a different thread of this mm, in turn leading to a misleading - * pmd_trans_huge() retval. All we have to ensure is that it is a - * regular pmd that we can walk with pte_offset_map() and we can do that - * through an atomic read in C, which is what pmd_trans_unstable() - * provides. - */ - if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte = pte_offset_map(pmd, address); - - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + return handle_pte_fault(&fe); } /* @@ -3495,15 +3611,15 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { int ret; __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -3515,7 +3631,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_USER) mem_cgroup_oom_enable(); - ret = __handle_mm_fault(mm, vma, address, flags); + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { mem_cgroup_oom_disable(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3cbdcaff2a5..3894b65b1555 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -449,6 +449,25 @@ out_fail: return -1; } +static struct zone * __meminit move_pfn_range(int zone_shift, + unsigned long start_pfn, unsigned long end_pfn) +{ + struct zone *zone = page_zone(pfn_to_page(start_pfn)); + int ret = 0; + + if (zone_shift < 0) + ret = move_pfn_range_left(zone + zone_shift, zone, + start_pfn, end_pfn); + else if (zone_shift) + ret = move_pfn_range_right(zone, zone + zone_shift, + start_pfn, end_pfn); + + if (ret) + return NULL; + + return zone + zone_shift; +} + static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -1028,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } +int zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target) +{ + struct zone *zone = page_zone(pfn_to_page(pfn)); + enum zone_type idx = zone_idx(zone); + int i; + + if (idx < target) { + /* pages must be at end of current zone */ + if (pfn + nr_pages != zone_end_pfn(zone)) + return 0; + + /* no zones in use between current zone and target */ + for (i = idx + 1; i < target; i++) + if (zone_is_initialized(zone - idx + i)) + return 0; + } + + if (target < idx) { + /* pages must be at beginning of current zone */ + if (pfn != zone->zone_start_pfn) + return 0; + + /* no zones in use between current zone and target */ + for (i = target + 1; i < idx; i++) + if (zone_is_initialized(zone - idx + i)) + return 0; + } + + return target - idx; +} /* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) @@ -1039,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; + int zone_shift = 0; /* * This doesn't need a lock to do pfn_to_page(). @@ -1052,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ !can_online_high_movable(zone)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL && - zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - return -EINVAL; - } - if (online_type == MMOP_ONLINE_MOVABLE && - zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - return -EINVAL; - } + if (online_type == MMOP_ONLINE_KERNEL) + zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL); + else if (online_type == MMOP_ONLINE_MOVABLE) + zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE); - /* Previous code may changed the zone of the pfn range */ - zone = page_zone(pfn_to_page(pfn)); + zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); + if (!zone) + return -EINVAL; arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -1163,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) arch_refresh_nodedata(nid, pgdat); } else { - /* Reset the nr_zones and classzone_idx to 0 before reuse */ + /* Reset the nr_zones, order and classzone_idx before reuse */ pgdat->nr_zones = 0; - pgdat->classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1501,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } +static struct page *new_node_page(struct page *page, unsigned long private, + int **result) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + int nid = page_to_nid(page); + nodemask_t nmask = node_online_map; + struct page *new_page; + + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination. + */ + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node_in(nid, nmask)); + + node_clear(nid, nmask); + if (PageHighMem(page) + || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) + gfp_mask |= __GFP_HIGHMEM; + + new_page = __alloc_pages_nodemask(gfp_mask, 0, + node_zonelist(nid, gfp_mask), &nmask); + if (!new_page) + new_page = __alloc_pages(gfp_mask, 0, + node_zonelist(nid, gfp_mask)); + + return new_page; +} + #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -1540,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } else { @@ -1564,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } - /* - * alloc_migrate_target should be improooooved!! - * migrate_pages returns # of failed pages. - */ - ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, + /* Allocate a new page from the nearest neighbor node */ + ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 297d6854f849..d8c4e38fb5f4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -512,6 +512,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, } } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -529,7 +531,7 @@ retry: nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; - if (PageTransCompound(page) && PageAnon(page)) { + if (PageTransCompound(page)) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); @@ -960,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } } diff --git a/mm/mempool.c b/mm/mempool.c index 8f65464da5de..47a659dedd44 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. + * Note: using __GFP_ZERO is not supported. */ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; - /* If oom killed, memory reserves are essential to prevent livelock */ - VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); - /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: - if (likely(pool->curr_nr)) { - /* - * Don't allocate from emergency reserves if there are - * elements available. This check is racy, but it will - * be rechecked each loop. - */ - gfp_temp |= __GFP_NOMEMALLOC; - } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -359,12 +348,11 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { + if (gfp_temp != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } - gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { diff --git a/mm/migrate.c b/mm/migrate.c index bd3fdc202e8b..f7ee04a5ae27 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -31,6 +31,7 @@ #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/backing-dev.h> +#include <linux/compaction.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> @@ -73,6 +74,81 @@ int migrate_prep_local(void) return 0; } +bool isolate_movable_page(struct page *page, isolate_mode_t mode) +{ + struct address_space *mapping; + + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a movable page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (unlikely(!get_page_unless_zero(page))) + goto out; + + /* + * Check PageMovable before holding a PG_lock because page's owner + * assumes anybody doesn't touch PG_lock of newly allocated page + * so unconditionally grapping the lock ruins page's owner side. + */ + if (unlikely(!__PageMovable(page))) + goto out_putpage; + /* + * As movable pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the releasing a page. + * + * In order to avoid having an already isolated movable page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released, + * lets be sure we have the page lock + * before proceeding with the movable page isolation steps. + */ + if (unlikely(!trylock_page(page))) + goto out_putpage; + + if (!PageMovable(page) || PageIsolated(page)) + goto out_no_isolated; + + mapping = page_mapping(page); + VM_BUG_ON_PAGE(!mapping, page); + + if (!mapping->a_ops->isolate_page(page, mode)) + goto out_no_isolated; + + /* Driver shouldn't use PG_isolated bit of page->flags */ + WARN_ON_ONCE(PageIsolated(page)); + __SetPageIsolated(page); + unlock_page(page); + + return true; + +out_no_isolated: + unlock_page(page); +out_putpage: + put_page(page); +out: + return false; +} + +/* It should be called on page which is PG_movable */ +void putback_movable_page(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + mapping = page_mapping(page); + mapping->a_ops->putback_page(page); + __ClearPageIsolated(page); +} + /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. @@ -92,12 +168,25 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(isolated_balloon_page(page))) - balloon_page_putback(page); - else + /* + * We isolated non-lru movable page so here we can use + * __PageMovable because LRU page's mapping cannot have + * PAGE_MAPPING_MOVABLE. + */ + if (unlikely(__PageMovable(page))) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } else { putback_lru_page(page); + } } } @@ -170,7 +259,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr, false); else - page_add_file_rmap(new); + page_add_file_rmap(new, false); if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -412,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping, * new page and drop references to the old page. * * Note that anonymous pages are accounted for - * via NR_FILE_PAGES and NR_ANON_PAGES if they + * via NR_FILE_PAGES and NR_ANON_MAPPED if they * are mapped to swap space. */ if (newzone != oldzone) { - __dec_zone_state(oldzone, NR_FILE_PAGES); - __inc_zone_state(newzone, NR_FILE_PAGES); + __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); + __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_zone_state(oldzone, NR_SHMEM); - __inc_zone_state(newzone, NR_SHMEM); + __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); + __inc_node_state(newzone->zone_pgdat, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { - __dec_zone_state(oldzone, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_FILE_DIRTY); + __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); + __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); + __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); } } local_irq_enable(); @@ -594,7 +685,7 @@ EXPORT_SYMBOL(migrate_page_copy); ***********************************************************/ /* - * Common logic to directly migrate a single page suitable for + * Common logic to directly migrate a single LRU page suitable for * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. @@ -757,33 +848,72 @@ static int move_to_new_page(struct page *newpage, struct page *page, enum migrate_mode mode) { struct address_space *mapping; - int rc; + int rc = -EAGAIN; + bool is_lru = !__PageMovable(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); mapping = page_mapping(page); - if (!mapping) - rc = migrate_page(mapping, newpage, page, mode); - else if (mapping->a_ops->migratepage) + + if (likely(is_lru)) { + if (!mapping) + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * provide a migratepage callback. Anonymous pages + * are part of swap space which also has its own + * migratepage callback. This is the most common path + * for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + else + rc = fallback_migrate_page(mapping, newpage, + page, mode); + } else { /* - * Most pages have a mapping and most filesystems provide a - * migratepage callback. Anonymous pages are part of swap - * space which also has its own migratepage callback. This - * is the most common path for page migration. + * In case of non-lru page, it could be released after + * isolation step. In that case, we shouldn't try migration. */ - rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); - else - rc = fallback_migrate_page(mapping, newpage, page, mode); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + if (!PageMovable(page)) { + rc = MIGRATEPAGE_SUCCESS; + __ClearPageIsolated(page); + goto out; + } + + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && + !PageIsolated(page)); + } /* * When successful, old pagecache page->mapping must be cleared before * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (!PageAnon(page)) + if (__PageMovable(page)) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* + * We clear PG_movable under page_lock so any compactor + * cannot try to migrate this page. + */ + __ClearPageIsolated(page); + } + + /* + * Anonymous and movable page->mapping will be cleard by + * free_pages_prepare so don't reset it here for keeping + * the type to work PageAnon, for example. + */ + if (!PageMappingFlags(page)) page->mapping = NULL; } +out: return rc; } @@ -793,6 +923,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(page); if (!trylock_page(page)) { if (!force || mode == MIGRATE_ASYNC) @@ -861,15 +992,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (unlikely(!trylock_page(newpage))) goto out_unlock; - if (unlikely(isolated_balloon_page(page))) { - /* - * A ballooned page does not need any special attention from - * physical to virtual reverse mapping procedures. - * Skip any attempt to unmap PTEs or to remap swap cache, - * in order to avoid burning cycles at rmap level, and perform - * the page migration right away (proteced by page lock). - */ - rc = balloon_page_migrate(newpage, page, mode); + if (unlikely(!is_lru)) { + rc = move_to_new_page(newpage, page, mode); goto out_unlock_both; } @@ -915,6 +1039,19 @@ out_unlock: put_anon_vma(anon_vma); unlock_page(page); out: + /* + * If migration is successful, decrease refcount of the newpage + * which will not free the page because new page owner increased + * refcounter. As well, if it is LRU page, add the page to LRU + * list in here. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + if (unlikely(__PageMovable(newpage))) + put_page(newpage); + else + putback_lru_page(newpage); + } + return rc; } @@ -948,6 +1085,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ + ClearPageActive(page); + ClearPageUnevictable(page); + if (unlikely(__PageMovable(page))) { + lock_page(page); + if (!PageMovable(page)) + __ClearPageIsolated(page); + unlock_page(page); + } + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); goto out; } @@ -960,10 +1109,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) { - put_new_page = NULL; + if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); - } out: if (rc != -EAGAIN) { @@ -974,35 +1121,47 @@ out: * restored. */ list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + } + + /* + * If migration is successful, releases reference grabbed during + * isolation. Otherwise, restore the page to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + put_page(page); + if (reason == MR_MEMORY_FAILURE) { /* - * With this release, we free successfully migrated - * page and set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, it's how - * HWPoison flag works at the moment. + * Set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, + * it's how HWPoison flag works at the moment. */ - put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); - } else - putback_lru_page(page); - } + } + } else { + if (rc != -EAGAIN) { + if (likely(!__PageMovable(page))) { + putback_lru_page(page); + goto put_new; + } - /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, putback_lru_page() will drop the reference grabbed - * during isolation. - */ - if (put_new_page) - put_new_page(newpage, private); - else if (unlikely(__is_movable_balloon_page(newpage))) { - /* drop our reference, page already in the balloon */ - put_page(newpage); - } else - putback_lru_page(newpage); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } +put_new: + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); + } if (result) { if (rc) @@ -1303,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } put_and_set: @@ -1569,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) { int z; + + if (!pgdat_reclaimable(pgdat)) + return false; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; - if (!zone_reclaimable(zone)) - continue; - /* Avoid waking kswapd by allocating pages_to_migrate pages. */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + @@ -1671,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) } page_lru = page_is_file_cache(page); - mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); /* @@ -1729,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } @@ -1774,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, + (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); if (!new_page) goto out_fail; @@ -1822,15 +1982,14 @@ fail_putback: /* Retake the callers reference and putback on LRU */ get_page(page); putback_lru_page(page); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); goto out_unlock; } orig_entry = *pmd; - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* @@ -1874,7 +2033,7 @@ fail_putback: count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); return isolated; diff --git a/mm/mlock.c b/mm/mlock.c index ef8dc9f395c4..14645be06e30 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (getpage) get_page(page); ClearPageLRU(page); @@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page) * might otherwise copy PageMlocked to part of the tail pages before * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); nr_pages = hpage_nr_pages(page); if (!TestClearPageMlocked(page)) @@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page) __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); __munlock_isolated_page(page); goto out; } __munlock_isolation_failed(page); unlock_out: - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); out: return nr_pages - 1; @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_init(&pvec_putback, 0); /* Phase 1: page isolation */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) } delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmap.c b/mm/mmap.c index de2c1769cc68..d44bee96a5fe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -25,6 +25,7 @@ #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> @@ -620,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next; - struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; @@ -630,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; if (next && !insert) { - struct vm_area_struct *exporter = NULL; + struct vm_area_struct *exporter = NULL, *importer = NULL; if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). */ -again: remove_next = 1 + (end > next->vm_end); + remove_next = 1 + (end > next->vm_end); end = next->vm_end; exporter = next; importer = vma; + + /* + * If next doesn't have anon_vma, import from vma after + * next, if the vma overlaps with it. + */ + if (remove_next == 2 && next && !next->anon_vma) + exporter = next->vm_next; + } else if (end > next->vm_start) { /* * vma expands, overlapping part of the next: @@ -674,6 +682,8 @@ again: remove_next = 1 + (end > next->vm_end); return error; } } +again: + vma_adjust_trans_huge(vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -695,8 +705,6 @@ again: remove_next = 1 + (end > next->vm_end); } } - vma_adjust_trans_huge(vma, start, end, adjust_next); - anon_vma = vma->anon_vma; if (!anon_vma && adjust_next) anon_vma = next->anon_vma; @@ -795,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end); * up the code too much to do both in one go. */ next = vma->vm_next; - if (remove_next == 2) + if (remove_next == 2) { + remove_next = 1; + end = next->vm_end; goto again; + } else if (next) vma_gap_update(next); else @@ -1897,8 +1908,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return -ENOMEM; get_area = current->mm->get_unmapped_area; - if (file && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; + if (file) { + if (file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + } else if (flags & MAP_SHARED) { + /* + * mmap_region() will call shmem_zero_setup() to create a file, + * so use shmem's get_unmapped_area in case it can be huge. + * do_mmap_pgoff() will clear pgoff, so match alignment. + */ + pgoff = 0; + get_area = shmem_get_unmapped_area; + } + addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; @@ -2591,6 +2613,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* drop PG_Mlocked flag for over-mapped range */ for (tmp = vma; tmp->vm_start >= start + size; tmp = tmp->vm_next) { + /* + * Split pmd and munlock page on the border + * of the range. + */ + vma_adjust_trans_huge(tmp, start, start + size, 0); + munlock_vma_pages_range(tmp, max(tmp->vm_start, start), min(tmp->vm_end, start + size)); @@ -2943,9 +2971,19 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } +static int special_mapping_mremap(struct vm_area_struct *new_vma) +{ + struct vm_special_mapping *sm = new_vma->vm_private_data; + + if (sm->mremap) + return sm->mremap(sm, new_vma); + return 0; +} + static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, + .mremap = special_mapping_mremap, .name = special_mapping_name, }; diff --git a/mm/mprotect.c b/mm/mprotect.c index 5019a1ef2848..a4830f0325fe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -163,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { split_huge_pmd(vma, pmd, addr); - if (pmd_none(*pmd)) + if (pmd_trans_unstable(pmd)) continue; } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, diff --git a/mm/mremap.c b/mm/mremap.c index 1f157adfdaf9..da22ad2a5678 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -210,9 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, } } split_huge_pmd(vma, old_pmd, old_addr); - if (pmd_none(*old_pmd)) + if (pmd_trans_unstable(old_pmd)) continue; - VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; diff --git a/mm/nommu.c b/mm/nommu.c index c2e58880207f..95daf81a4855 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct fault_env *fe, + pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ddf74487f848..7d0a275df822 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, /* * Do not even consider tasks which are explicitly marked oom - * unkillable or have been already oom reaped. + * unkillable or have been already oom reaped or the are in + * the middle of vfork */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags)) { + test_bit(MMF_OOM_REAPED, &p->mm->flags) || + in_vfork(p)) { task_unlock(p); return 0; } @@ -274,17 +276,29 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, #endif enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages) + struct task_struct *task) { if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; /* * This task already has access to memory reserves and is being killed. - * Don't allow any other task to have access to the reserves. + * Don't allow any other task to have access to the reserves unless + * the task has MMF_OOM_REAPED because chances that it would release + * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) - return OOM_SCAN_ABORT; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { + struct task_struct *p = find_lock_task_mm(task); + enum oom_scan_t ret = OOM_SCAN_ABORT; + + if (p) { + if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) + ret = OOM_SCAN_CONTINUE; + task_unlock(p); + } + + return ret; + } /* * If task is allocating a lot of memory and has been marked to be @@ -311,7 +325,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc, for_each_process(p) { unsigned int points; - switch (oom_scan_process_thread(oc, p, totalpages)) { + switch (oom_scan_process_thread(oc, p)) { case OOM_SCAN_SELECT: chosen = p; chosen_points = ULONG_MAX; @@ -383,8 +397,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } -static void dump_header(struct oom_control *oc, struct task_struct *p, - struct mem_cgroup *memcg) +static void dump_header(struct oom_control *oc, struct task_struct *p) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, @@ -392,12 +405,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p, cpuset_print_current_mems_allowed(); dump_stack(); - if (memcg) - mem_cgroup_print_oom_info(memcg, p); + if (oc->memcg) + mem_cgroup_print_oom_info(oc->memcg, p); else show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(memcg, oc->nodemask); + dump_tasks(oc->memcg, oc->nodemask); } /* @@ -416,7 +429,7 @@ bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) { struct task_struct *t; @@ -453,7 +466,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: * __oom_reap_task exit_mm - * atomic_inc_not_zero + * mmget_not_zero * mmput * atomic_dec_and_test * exit_oom_victim @@ -475,12 +488,22 @@ static bool __oom_reap_task(struct task_struct *tsk) if (!p) goto unlock_oom; mm = p->mm; - atomic_inc(&mm->mm_users); + atomic_inc(&mm->mm_count); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto unlock_oom; + goto mm_drop; + } + + /* + * increase mm_users only after we know we will reap something so + * that the mmput_async is called only when we have reaped something + * and delayed __mmput doesn't matter that much + */ + if (!mmget_not_zero(mm)) { + up_read(&mm->mmap_sem); + goto mm_drop; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -522,15 +545,16 @@ static bool __oom_reap_task(struct task_struct *tsk) * to release its memory. */ set_bit(MMF_OOM_REAPED, &mm->flags); -unlock_oom: - mutex_unlock(&oom_lock); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. */ - if (mm) - mmput_async(mm); + mmput_async(mm); +mm_drop: + mmdrop(mm); +unlock_oom: + mutex_unlock(&oom_lock); return ret; } @@ -544,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts > MAX_OOM_REAP_RETRIES) { + struct task_struct *p; + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); + + /* + * If we've already tried to reap this task in the past and + * failed it probably doesn't make much sense to try yet again + * so hide the mm from the oom killer so that it can move on + * to another task with a different mm struct. + */ + p = find_lock_task_mm(tsk); + if (p) { + if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { + pr_info("oom_reaper: giving up pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + set_bit(MMF_OOM_REAPED, &p->mm->flags); + } + task_unlock(p); + } + debug_show_all_locks(); } @@ -584,7 +627,7 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -602,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } -/* Check if we can reap the given task. This has to be called with stable - * tsk->mm - */ -void try_oom_reaper(struct task_struct *tsk) -{ - struct mm_struct *mm = tsk->mm; - struct task_struct *p; - - if (!mm) - return; - - /* - * There might be other threads/processes which are either not - * dying or even not killable. - */ - if (atomic_read(&mm->mm_users) > 1) { - rcu_read_lock(); - for_each_process(p) { - if (!process_shares_mm(p, mm)) - continue; - if (fatal_signal_pending(p)) - continue; - - /* - * If the task is exiting make sure the whole thread group - * is exiting and cannot acces mm anymore. - */ - if (signal_group_exit(p->signal)) - continue; - - /* Give up */ - rcu_read_unlock(); - return; - } - rcu_read_unlock(); - } - - wake_oom_reaper(tsk); -} - static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -653,10 +656,6 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#else -static void wake_oom_reaper(struct task_struct *tsk) -{ -} #endif /** @@ -733,13 +732,87 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +static inline bool __task_will_free_mem(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + + /* + * A coredumping process may sleep for an extended period in exit_mm(), + * so the oom killer cannot assume that the process will promptly exit + * and release memory. + */ + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (sig->flags & SIGNAL_GROUP_EXIT) + return true; + + if (thread_group_empty(task) && (task->flags & PF_EXITING)) + return true; + + return false; +} + +/* + * Checks whether the given task is dying or exiting and likely to + * release its address space. This means that all threads and processes + * sharing the same mm have to be killed or exiting. + * Caller has to make sure that task->mm is stable (hold task_lock or + * it operates on the current). + */ +bool task_will_free_mem(struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct task_struct *p; + bool ret; + + /* + * Skip tasks without mm because it might have passed its exit_mm and + * exit_oom_victim. oom_reaper could have rescued that but do not rely + * on that for now. We can consider find_lock_task_mm in future. + */ + if (!mm) + return false; + + if (!__task_will_free_mem(task)) + return false; + + /* + * This task has already been drained by the oom reaper so there are + * only small chances it will free some more + */ + if (test_bit(MMF_OOM_REAPED, &mm->flags)) + return false; + + if (atomic_read(&mm->mm_users) <= 1) + return true; + + /* + * This is really pessimistic but we do not have any reliable way + * to check that external processes share with our mm + */ + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(task, p)) + continue; + ret = __task_will_free_mem(p); + if (!ret) + break; + } + rcu_read_unlock(); + + return ret; +} + /* * Must be called while holding a reference to p, which will be released upon * returning. */ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message) + const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -755,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * its children or threads, just set TIF_MEMDIE so it can die quickly */ task_lock(p); - if (p->mm && task_will_free_mem(p)) { + if (task_will_free_mem(p)) { mark_oom_victim(p); - try_oom_reaper(p); + wake_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -765,7 +838,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_unlock(p); if (__ratelimit(&oom_rs)) - dump_header(oc, p, memcg); + dump_header(oc, p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); @@ -786,8 +859,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, memcg, oc->nodemask, - totalpages); + child_points = oom_badness(child, + oc->memcg, oc->nodemask, totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -840,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || - p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { /* * We cannot use oom_reaper for the mm shared by this * process because it wouldn't get killed and so the - * memory might be still used. + * memory might be still used. Hide the mm from the oom + * killer to guarantee OOM forward progress. */ can_oom_reap = false; + set_bit(MMF_OOM_REAPED, &mm->flags); + pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", + task_pid_nr(victim), victim->comm, + task_pid_nr(p), p->comm); continue; } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); @@ -865,8 +942,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, - struct mem_cgroup *memcg) +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -882,7 +958,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) return; - dump_header(oc, NULL, memcg); + dump_header(oc, NULL); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -930,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc) * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. - * - * But don't select if current has already released its mm and cleared - * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. */ - if (current->mm && - (fatal_signal_pending(current) || task_will_free_mem(current))) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); return true; } @@ -957,13 +1029,13 @@ bool out_of_memory(struct oom_control *oc) constraint = constrained_alloc(oc, &totalpages); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; - check_panic_on_oom(oc, constraint, NULL); + check_panic_on_oom(oc, constraint); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, NULL, + oom_kill_process(oc, current, 0, totalpages, "Out of memory (oom_kill_allocating_task)"); return true; } @@ -971,12 +1043,11 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p && !is_sysrq_oom(oc)) { - dump_header(oc, NULL, NULL); + dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, NULL, - "Out of memory"); + oom_kill_process(oc, p, points, totalpages, "Out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. @@ -988,14 +1059,15 @@ bool out_of_memory(struct oom_control *oc) /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a - * parallel oom killing is already in progress so do nothing. + * memory-hogging task. If oom_lock is held by somebody else, a parallel oom + * killing is already in progress so do nothing. */ void pagefault_out_of_memory(void) { struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = NULL, .gfp_mask = 0, .order = 0, }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e2481949494c..f4cd7d8005c9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, */ /** - * zone_dirtyable_memory - number of dirtyable pages in a zone - * @zone: the zone + * node_dirtyable_memory - number of dirtyable pages in a node + * @pgdat: the node * - * Returns the zone's number of pages potentially available for dirty - * page cache. This is the base value for the per-zone dirty limits. + * Returns the node's number of pages potentially available for dirty + * page cache. This is the base value for the per-node dirty limits. */ -static unsigned long zone_dirtyable_memory(struct zone *zone) +static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { - unsigned long nr_pages; + unsigned long nr_pages = 0; + int z; + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + nr_pages += zone_page_state(zone, NR_FREE_PAGES); + } - nr_pages = zone_page_state(zone, NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ - nr_pages -= min(nr_pages, zone->totalreserve_pages); + nr_pages -= min(nr_pages, pgdat->totalreserve_pages); - nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); - nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } @@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) int i; for_each_node_state(node, N_HIGH_MEMORY) { - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *z = &NODE_DATA(node)->node_zones[i]; + for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { + struct zone *z; + unsigned long nr_pages; - if (is_highmem(z)) - x += zone_dirtyable_memory(z); + if (!is_highmem_idx(i)) + continue; + + z = &NODE_DATA(node)->node_zones[i]; + if (!populated_zone(z)) + continue; + + nr_pages = zone_page_state(z, NR_FREE_PAGES); + /* watch for underflows */ + nr_pages -= min(nr_pages, high_wmark_pages(z)); + nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); + nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); + x += nr_pages; } } + /* * Unreclaimable memory (kernel memory or anonymous memory * without swap) can bring down the dirtyable pages below @@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void) */ x -= min(x, totalreserve_pages); - x += global_page_state(NR_INACTIVE_FILE); - x += global_page_state(NR_ACTIVE_FILE); + x += global_node_page_state(NR_INACTIVE_FILE); + x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) } /** - * zone_dirty_limit - maximum number of dirty pages allowed in a zone - * @zone: the zone + * node_dirty_limit - maximum number of dirty pages allowed in a node + * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a zone, based - * on the zone's dirtyable memory. + * Returns the maximum number of dirty pages allowed in a node, based + * on the node's dirtyable memory. */ -static unsigned long zone_dirty_limit(struct zone *zone) +static unsigned long node_dirty_limit(struct pglist_data *pgdat) { - unsigned long zone_memory = zone_dirtyable_memory(zone); + unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * - zone_memory / global_dirtyable_memory(); + node_memory / global_dirtyable_memory(); else - dirty = vm_dirty_ratio * zone_memory / 100; + dirty = vm_dirty_ratio * node_memory / 100; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) dirty += dirty / 4; @@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone) } /** - * zone_dirty_ok - tells whether a zone is within its dirty limits - * @zone: the zone to check + * node_dirty_ok - tells whether a node is within its dirty limits + * @pgdat: the node to check * - * Returns %true when the dirty pages in @zone are within the zone's + * Returns %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ -bool zone_dirty_ok(struct zone *zone) +bool node_dirty_ok(struct pglist_data *pgdat) { - unsigned long limit = zone_dirty_limit(zone); + unsigned long limit = node_dirty_limit(pgdat); + unsigned long nr_pages = 0; - return zone_page_state(zone, NR_FILE_DIRTY) + - zone_page_state(zone, NR_UNSTABLE_NFS) + - zone_page_state(zone, NR_WRITEBACK) <= limit; + nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); + nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); + nr_pages += node_page_state(pgdat, NR_WRITEBACK); + + return nr_pages <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, @@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping, * written to the server's write cache, but has not yet * been flushed to permanent storage. */ - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); + gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); @@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) @@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ - if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) <= dirty_thresh) + if (global_node_page_state(NR_UNSTABLE_NFS) + + global_node_page_state(NR_WRITEBACK) <= dirty_thresh) break; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + int nr_pages = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); struct bdi_writeback *wb; /* @@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) wb = inode_to_wb(inode); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_DIRTIED); + __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); @@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, { if (mapping_cap_account_dirty(mapping)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); - dec_zone_page_state(page, NR_FILE_DIRTY); + dec_node_page_state(page, NR_FILE_DIRTY); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); } @@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page) wb = unlocked_inode_to_wb_begin(inode, &locked); current->nr_dirtied--; - dec_zone_page_state(page, NR_DIRTIED); + dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); unlocked_inode_to_wb_end(inode, locked); } @@ -2563,6 +2590,7 @@ int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); + page = compound_head(page); if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* @@ -2712,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page) wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); - dec_zone_page_state(page, NR_FILE_DIRTY); + dec_node_page_state(page, NR_FILE_DIRTY); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } @@ -2747,14 +2776,20 @@ int test_clear_page_writeback(struct page *page) __wb_writeout_inc(wb); } } + + if (mapping->host && !mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + sb_clear_inode_writeback(mapping->host); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } if (ret) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - dec_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); + dec_node_page_state(page, NR_WRITEBACK); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); + inc_node_page_state(page, NR_WRITTEN); } unlock_page_memcg(page); return ret; @@ -2774,11 +2809,24 @@ int __test_set_page_writeback(struct page *page, bool keep_write) spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { + bool on_wblist; + + on_wblist = mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK); + radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + + /* + * We can come through here when swapping anonymous + * pages, so we don't necessarily have an inode to track + * for sync. + */ + if (mapping->host && !on_wblist) + sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, @@ -2794,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } if (!ret) { mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); + inc_node_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6903b695ebae..ea759b935360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include <linux/sched/rt.h> #include <linux/page_owner.h> #include <linux/kthread.h> +#include <linux/memcontrol.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -286,15 +287,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) - return true; - - return false; -} + int nid = early_pfn_to_nid(pfn); -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; @@ -339,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - return false; -} - static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -1004,6 +994,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + if (compound) + ClearPageDoubleMap(page); for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -1014,8 +1006,12 @@ static __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageAnonHead(page)) + if (PageMappingFlags(page)) page->mapping = NULL; + if (memcg_kmem_enabled() && PageKmemcg(page)) { + memcg_kmem_uncharge(page, order); + __ClearPageKmemcg(page); + } if (check_free) bad += free_pages_check(page); if (bad) @@ -1082,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1139,9 +1135,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -1273,7 +1269,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; @@ -1722,6 +1718,19 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +inline void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags) +{ + set_page_private(page, 0); + set_page_refcounted(page); + + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); + set_page_owner(page, order, gfp_flags); +} + static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { @@ -1734,13 +1743,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags poisoned &= page_is_poisoned(p); } - set_page_private(page, 0); - set_page_refcounted(page); - - arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); - kasan_alloc_pages(page, order); + post_alloc_hook(page, order, gfp_flags); if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) @@ -1749,8 +1752,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - set_page_owner(page, order, gfp_flags); - /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking @@ -2459,7 +2460,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; - gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -2473,12 +2473,9 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - gfp_mask = get_page_owner_gfp(page); - set_page_owner(page, 0, gfp_mask); - for (i = 1; i < (1 << order); i++) { + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - set_page_owner(page + i, 0, gfp_mask); - } + split_page_owner(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2507,9 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - set_page_owner(page, order, __GFP_MOVABLE); - - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2525,33 +2523,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. - */ -int split_free_page(struct page *page) -{ - unsigned int order; - int nr_pages; - - order = page_order(page); - - nr_pages = __isolate_free_page(page, order); - if (!nr_pages) - return 0; - - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); - return nr_pages; -} - -/* * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. @@ -2616,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, else page = list_first_entry(list, struct page, lru); - __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; @@ -2642,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) - set_bit(ZONE_FAIR_DEPLETED, &zone->flags); - - __count_zone_vm_events(PGALLOC, zone, 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -2861,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return local_zone->node == zone->node; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return true; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ -static void reset_alloc_batches(struct zone *preferred_zone) -{ - struct zone *zone = preferred_zone->zone_pgdat->node_zones; - - do { - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); - } while (zone++ != preferred_zone); -} - /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -2905,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - bool fair_skipped = false; - bool apply_fair = (alloc_flags & ALLOC_FAIR); + struct pglist_data *last_pgdat_dirty_limit = NULL; -zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2923,50 +2864,33 @@ zonelist_scan: !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* - * Distribute pages in proportion to the individual - * zone size to ensure fair page aging. The zone a - * page was allocated in should have no effect on the - * time the page has in memory before being reclaimed. - */ - if (apply_fair) { - if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - fair_skipped = true; - continue; - } - if (!zone_local(ac->preferred_zoneref->zone, zone)) { - if (fair_skipped) - goto reset_fair; - apply_fair = false; - } - } - /* * When allocating a page cache page for writing, we - * want to get it from a zone that is within its dirty - * limit, such that no single zone holds more than its + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its * proportional share of globally allowed dirty pages. - * The dirty limits take into account the zone's + * The dirty limits take into account the node's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * - * This may look like it could increase pressure on - * lower zones by failing allocations in higher zones - * before they are full. But the pages that do spill - * over are limited as the lower zones are protected - * by this very same mechanism. It should not become - * a practical burden to them. - * * XXX: For now, allow allocations to potentially - * exceed the per-zone dirty limit in the slowpath + * exceed the per-node dirty limit in the slowpath * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed - * zones are together not big enough to reach the + * nodes are together not big enough to reach the * global limit. The proper fix for these situations - * will require awareness of zones in the + * will require awareness of nodes in the * dirty-throttling and the flusher threads. */ - if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) - continue; + if (ac->spread_dirty_pages) { + if (last_pgdat_dirty_limit == zone->zone_pgdat) + continue; + + if (!node_dirty_ok(zone->zone_pgdat)) { + last_pgdat_dirty_limit = zone->zone_pgdat; + continue; + } + } mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, @@ -2978,16 +2902,16 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || + if (node_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: @@ -3017,23 +2941,6 @@ try_this_zone: } } - /* - * The first pass makes sure allocations are spread fairly within the - * local node. However, the local node might have free pages left - * after the fairness batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without fairness, and - * include remote zones now, before entering the slowpath and waking - * kswapd: prefer spilling to a remote zone over swapping locally. - */ - if (fair_skipped) { -reset_fair: - apply_fair = false; - fair_skipped = false; - reset_alloc_batches(ac->preferred_zoneref->zone); - z = ac->preferred_zoneref; - goto zonelist_scan; - } - return NULL; } @@ -3103,6 +3010,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct oom_control oc = { .zonelist = ac->zonelist, .nodemask = ac->nodemask, + .memcg = NULL, .gfp_mask = gfp_mask, .order = order, }; @@ -3177,7 +3085,6 @@ out: return page; } - /* * Maximum number of compaction retries wit a progress before OOM * killer is consider as the only way to move forward. @@ -3189,17 +3096,16 @@ out: static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, &contended_compaction); + prio); current->flags &= ~PF_MEMALLOC; if (*compact_result <= COMPACT_INACTIVE) @@ -3211,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3229,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); - /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. - */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) - *compact_result = COMPACT_CONTENDED; - - /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. - */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) - *compact_result = COMPACT_CONTENDED; - cond_resched(); return NULL; @@ -3254,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, enum migrate_mode *migrate_mode, + enum compact_result compact_result, + enum compact_priority *compact_priority, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3265,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the - * failure could be caused by weak migration mode. + * failure could be caused by insufficient priority */ if (compaction_failed(compact_result)) { - if (*migrate_mode == MIGRATE_ASYNC) { - *migrate_mode = MIGRATE_SYNC_LIGHT; + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; return true; } return false; @@ -3303,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { *compact_result = COMPACT_SKIPPED; return NULL; @@ -3312,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, - enum migrate_mode *migrate_mode, + enum compact_priority *compact_priority, int compaction_retries) { struct zone *zone; @@ -3380,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; retry: - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -3402,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; + pg_data_t *last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, ac_classzone_idx(ac)); + ac->high_zoneidx, ac->nodemask) { + if (last_pgdat != zone->zone_pgdat) + wakeup_kswapd(zone, order, ac->high_zoneidx); + last_pgdat = zone->zone_pgdat; + } } static inline unsigned int @@ -3439,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (gfp_mask & __GFP_MEMALLOC) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - alloc_flags |= ALLOC_NO_WATERMARKS; - } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3458,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); -} + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return false; -static inline bool is_thp_gfp_mask(gfp_t gfp_mask) -{ - return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + if (gfp_mask & __GFP_MEMALLOC) + return true; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return true; + if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + return true; + + return false; } /* @@ -3499,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; /* - * Keep reclaiming pages while there is a chance this will lead somewhere. - * If none of the target zones can satisfy our allocation request even - * if all reclaimable pages are considered then we are screwed and have - * to go OOM. + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { @@ -3527,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * prevent from pre mature OOM */ if (!did_some_progress) { - unsigned long writeback; - unsigned long dirty; + unsigned long write_pending; - writeback = zone_page_state_snapshot(zone, - NR_WRITEBACK); - dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + write_pending = zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); - if (2*(writeback + dirty) > reclaimable) { + if (2 * write_pending > reclaimable) { congestion_wait(BLK_RW_ASYNC, HZ/10); return true; } @@ -3569,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum migrate_mode migration_mode = MIGRATE_ASYNC; + enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; @@ -3593,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry: + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* - * OK, we're below the kswapd watermark and have kicked background - * reclaim. Now things get more complex, so set up alloc_flags according - * to how we want to proceed. + * The adjusted alloc_flags might result in immediate success, so try + * that first */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. Don't try + * that for allocations that are allowed to ignore watermarks, as the + * ALLOC_NO_WATERMARKS attempt didn't yet happen. + */ + if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && + !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes THP page fault allocations + */ + if (gfp_mask & __GFP_NORETRY) { + /* + * If compaction is deferred for high-order allocations, + * it is because sync compaction recently failed. If + * this is the case and the caller requested a THP + * allocation, we do not want to heavily disrupt the + * system, so we fail the allocation instead of entering + * direct reclaim. + */ + if (compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + + if (gfp_pfmemalloc_allowed(gfp_mask)) + alloc_flags = ALLOC_NO_WATERMARKS; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { + if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } - /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; - /* Allocate without watermarks if the context allows */ - if (alloc_flags & ALLOC_NO_WATERMARKS) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; - } - /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* @@ -3658,38 +3590,6 @@ retry: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* - * Try direct compaction. The first pass is asynchronous. Subsequent - * attempts after direct reclaim are synchronous - */ - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, - migration_mode, - &compact_result); - if (page) - goto got_pg; - - /* Checks for THP-specific high-order allocations */ - if (is_thp_gfp_mask(gfp_mask)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; - - /* - * Compaction is contended so rather back off than cause - * excessive stalls. - */ - if(compact_result == COMPACT_CONTENDED) - goto nopage; - } - - if (order && compaction_made_progress(compact_result)) - compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3697,16 +3597,25 @@ retry: if (page) goto got_pg; + /* Try direct compaction and then allocating */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + compact_priority, &compact_result); + if (page) + goto got_pg; + + if (order && compaction_made_progress(compact_result)) + compaction_retries++; + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) - goto noretry; + goto nopage; /* * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) - goto noretry; + goto nopage; /* * Costly allocations might have made a progress but this doesn't mean @@ -3730,7 +3639,7 @@ retry: */ if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, - compact_result, &migration_mode, + compact_result, &compact_priority, compaction_retries)) goto retry; @@ -3745,25 +3654,6 @@ retry: goto retry; } -noretry: - /* - * High-order allocations do not necessarily loop after direct reclaim - * and reclaim/compaction depends on compaction being called after - * reclaim so call directly if necessary. - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. All other requests should tolerate - * at least light sync migration. - */ - if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_ASYNC; - else - migration_mode = MIGRATE_SYNC_LIGHT; - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, - ac, migration_mode, - &compact_result); - if (page) - goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -3779,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { struct page *page; unsigned int cpuset_mems_cookie; - unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), @@ -3866,6 +3756,14 @@ no_zone: } out: + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { + if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { + __free_pages(page, order); + page = NULL; + } else + __SetPageKmemcg(page); + } + if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); @@ -4021,56 +3919,6 @@ void __free_page_frag(void *addr) } EXPORT_SYMBOL(__free_page_frag); -/* - * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is - * equivalent to alloc_pages. - * - * It should be used when the caller would like to use kmalloc, but since the - * allocation is large, it has to fall back to the page allocator. - */ -struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages(gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -{ - struct page *page; - - page = alloc_pages_node(nid, gfp_mask, order); - if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { - __free_pages(page, order); - page = NULL; - } - return page; -} - -/* - * __free_kmem_pages and free_kmem_pages will free pages allocated with - * alloc_kmem_pages. - */ -void __free_kmem_pages(struct page *page, unsigned int order) -{ - memcg_kmem_uncharge(page, order); - __free_pages(page, order); -} - -void free_kmem_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_kmem_pages(virt_to_page((void *)addr), order); - } -} - static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { @@ -4252,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = global_page_state(NR_SHMEM); + val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -4274,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; - val->sharedram = node_page_state(nid, NR_SHMEM); - val->freeram = node_page_state(nid, NR_FREE_PAGES); + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; @@ -4358,6 +4206,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4373,26 +4222,73 @@ void show_free_areas(unsigned int filter) " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), - global_page_state(NR_FILE_MAPPED), - global_page_state(NR_SHMEM), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " unstable:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif + K(node_page_state(pgdat, NR_SHMEM)), + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_PAGES_SCANNED), + !pgdat_reclaimable(pgdat) ? "yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -4414,61 +4310,41 @@ void show_free_areas(unsigned int filter) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" + " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " mapped:%lukB" - " shmem:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" - " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" - " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_FILE_DIRTY)), - K(zone_page_state(zone, NR_WRITEBACK)), - K(zone_page_state(zone, NR_FILE_MAPPED)), - K(zone_page_state(zone, NR_SHMEM)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), - K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES)), - K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? "yes" : "no") - ); + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -4510,7 +4386,7 @@ void show_free_areas(unsigned int filter) hugetlb_show_meminfo(); - printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } @@ -5381,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone) zone->pageset = alloc_percpu(struct per_cpu_pageset); for_each_possible_cpu(cpu) zone_pageset_init(zone, cpu); + + if (!zone->zone_pgdat->per_cpu_nodestats) { + zone->zone_pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); + } } /* @@ -5950,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); + spin_lock_init(&pgdat->lru_lock); + lruvec_init(node_lruvec(pgdat)); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5999,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) + pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; + pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; + zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); - zone->zone_pgdat = pgdat; zone_pcp_init(zone); - /* For bootup, initialized properly in watermark setup */ - mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - - lruvec_init(&zone->lruvec); if (!size) continue; @@ -6079,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, @@ -6465,15 +6344,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; @@ -6737,6 +6619,9 @@ static void calculate_totalreserve_pages(void) enum zone_type i, j; for_each_online_pgdat(pgdat) { + + pgdat->totalreserve_pages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; @@ -6753,7 +6638,7 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; - zone->totalreserve_pages = max; + pgdat->totalreserve_pages += max; reserve_pages += max; } @@ -6854,10 +6739,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - spin_unlock_irqrestore(&zone->lock, flags); } @@ -6968,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6975,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -6984,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6991,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } diff --git a/mm/page_idle.c b/mm/page_idle.c index 4ea9c4ef5146..ae11aa914e55 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn) return NULL; zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); return page; } diff --git a/mm/page_io.c b/mm/page_io.c index 242dba07545b..fb1fa269d3a0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; + cond_resched(); + first_block = bmap(inode, probe_block); if (first_block == 0) goto bad_bmap; @@ -259,7 +261,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { struct bio *bio; - int ret, rw = WRITE; + int ret; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { @@ -317,12 +319,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC; + bio->bi_rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); - submit_bio(rw, bio); + submit_bio(bio); out: return ret; } @@ -369,8 +372,9 @@ int swap_readpage(struct page *page) ret = -ENOMEM; goto out; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); - submit_bio(READ, bio); + submit_bio(bio); out: return ret; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 612122bf6a42..064b7fb6e0b5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -7,6 +7,7 @@ #include <linux/pageblock-flags.h> #include <linux/memory.h> #include <linux/hugetlb.h> +#include <linux/page_owner.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -80,7 +81,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; - struct page *isolated_page = NULL; + bool isolated_page = false; unsigned int order; unsigned long page_idx, buddy_idx; struct page *buddy; @@ -108,9 +109,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (pfn_valid_within(page_to_pfn(buddy)) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); - kernel_map_pages(page, (1 << order), 1); - set_page_refcounted(page); - isolated_page = page; + isolated_page = true; } } } @@ -128,8 +127,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); - if (isolated_page) - __free_pages(isolated_page, order); + if (isolated_page) { + post_alloc_hook(page, order, __GFP_MOVABLE); + __free_pages(page, order); + } } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c index fedeba88c9cb..ec6dc1886f71 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -7,11 +7,22 @@ #include <linux/page_owner.h> #include <linux/jump_label.h> #include <linux/migrate.h> +#include <linux/stackdepot.h> + #include "internal.h" +/* + * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) + * to use off stack temporal storage + */ +#define PAGE_OWNER_STACK_DEPTH (16) + static bool page_owner_disabled = true; DEFINE_STATIC_KEY_FALSE(page_owner_inited); +static depot_stack_handle_t dummy_handle; +static depot_stack_handle_t failure_handle; + static void init_early_allocated_pages(void); static int early_page_owner_param(char *buf) @@ -34,11 +45,41 @@ static bool need_page_owner(void) return true; } +static noinline void register_dummy_stack(void) +{ + unsigned long entries[4]; + struct stack_trace dummy; + + dummy.nr_entries = 0; + dummy.max_entries = ARRAY_SIZE(entries); + dummy.entries = &entries[0]; + dummy.skip = 0; + + save_stack_trace(&dummy); + dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); +} + +static noinline void register_failure_stack(void) +{ + unsigned long entries[4]; + struct stack_trace failure; + + failure.nr_entries = 0; + failure.max_entries = ARRAY_SIZE(entries); + failure.entries = &entries[0]; + failure.skip = 0; + + save_stack_trace(&failure); + failure_handle = depot_save_stack(&failure, GFP_KERNEL); +} + static void init_page_owner(void) { if (page_owner_disabled) return; + register_dummy_stack(); + register_failure_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -61,25 +102,66 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +static inline bool check_recursive_alloc(struct stack_trace *trace, + unsigned long ip) { - struct page_ext *page_ext = lookup_page_ext(page); + int i, count; + + if (!trace->nr_entries) + return false; + + for (i = 0, count = 0; i < trace->nr_entries; i++) { + if (trace->entries[i] == ip && ++count == 2) + return true; + } + + return false; +} +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, - .max_entries = ARRAY_SIZE(page_ext->trace_entries), - .entries = &page_ext->trace_entries[0], - .skip = 3, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; + depot_stack_handle_t handle; + + save_stack_trace(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + /* + * We need to check recursion here because our request to stackdepot + * could trigger memory allocation to save new entry. New memory + * allocation would reach here and call depot_save_stack() again + * if we don't catch it. There is still not enough memory in stackdepot + * so it would try to allocate memory again and loop forever. + */ + if (check_recursive_alloc(&trace, _RET_IP_)) + return dummy_handle; + + handle = depot_save_stack(&trace, flags); + if (!handle) + handle = failure_handle; + + return handle; +} + +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; - save_stack_trace(&trace); - + page_ext->handle = save_stack(gfp_mask); page_ext->order = order; page_ext->gfp_mask = gfp_mask; - page_ext->nr_entries = trace.nr_entries; page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); @@ -94,34 +176,31 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext->last_migrate_reason = reason; } -gfp_t __get_page_owner_gfp(struct page *page) +void __split_page_owner(struct page *page, unsigned int order) { + int i; struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) - /* - * The caller just returns 0 if no valid gfp - * So return 0 here too. - */ - return 0; + return; - return page_ext->gfp_mask; + page_ext->order = 0; + for (i = 1; i < (1 << order); i++) + __copy_page_owner(page, page + i); } void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); - int i; if (unlikely(!old_ext || !new_ext)) return; new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; - new_ext->nr_entries = old_ext->nr_entries; - - for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) - new_ext->trace_entries[i] = old_ext->trace_entries[i]; + new_ext->last_migrate_reason = old_ext->last_migrate_reason; + new_ext->handle = old_ext->handle; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -137,14 +216,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext) + struct page *page, struct page_ext *page_ext, + depot_stack_handle_t handle) { int ret; int pageblock_mt, page_mt; char *kbuf; + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { - .nr_entries = page_ext->nr_entries, - .entries = &page_ext->trace_entries[0], + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; kbuf = kmalloc(count, GFP_KERNEL); @@ -173,6 +256,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + depot_fetch_stack(handle, &trace); ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; @@ -203,10 +287,14 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { - .nr_entries = page_ext->nr_entries, - .entries = &page_ext->trace_entries[0], + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; + depot_stack_handle_t handle; gfp_t gfp_mask; int mt; @@ -222,6 +310,13 @@ void __dump_page_owner(struct page *page) return; } + handle = READ_ONCE(page_ext->handle); + if (!handle) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + depot_fetch_stack(handle, &trace); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); @@ -237,6 +332,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long pfn; struct page *page; struct page_ext *page_ext; + depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; @@ -285,10 +381,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Access to page_ext->handle isn't synchronous so we should + * be careful to access it. + */ + handle = READ_ONCE(page_ext->handle); + if (!handle) + continue; + /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; - return print_page_owner(buf, count, pfn, page, page_ext); + return print_page_owner(buf, count, pfn, page, + page_ext, handle); } return 0; diff --git a/mm/readahead.c b/mm/readahead.c index 40be3ae0afe3..65ec288dc057 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -89,7 +89,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { + readahead_gfp_mask(mapping))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -108,7 +108,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned int nr_pages, gfp_t gfp) { struct blk_plug plug; unsigned page_idx; @@ -126,10 +126,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = lru_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) { + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) mapping->a_ops->readpage(filp, page); - } put_page(page); } ret = 0; @@ -159,6 +157,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, int page_idx; int ret = 0; loff_t isize = i_size_read(inode); + gfp_t gfp_mask = readahead_gfp_mask(mapping); if (isize == 0) goto out; @@ -180,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (page && !radix_tree_exceptional_entry(page)) continue; - page = page_cache_alloc_readahead(mapping); + page = __page_cache_alloc(gfp_mask); if (!page) break; page->index = page_offset; @@ -196,7 +195,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, * will then handle the error. */ if (ret) - read_pages(mapping, filp, &page_pool, ret); + read_pages(mapping, filp, &page_pool, ret, gfp_mask); BUG_ON(!list_empty(&page_pool)); out: return ret; diff --git a/mm/rmap.c b/mm/rmap.c index 0ea5d9071b32..709bc83703b1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -27,7 +27,7 @@ * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * zone_lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) @@ -1084,23 +1084,20 @@ EXPORT_SYMBOL_GPL(page_mkclean); * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma * @vma: the vma the page belongs to - * @address: the user virtual address mapped * * When a page belongs exclusively to one process after a COW event, * that page can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling * processes. */ -void page_move_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + page = compound_head(page); + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_VMA(!anon_vma, vma); - if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page)) - address &= HPAGE_PMD_MASK; - VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; /* @@ -1215,11 +1212,9 @@ void do_page_add_anon_rmap(struct page *page, * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (compound) { - __inc_zone_page_state(page, - NR_ANON_TRANSPARENT_HUGEPAGES); - } - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); + if (compound) + __inc_node_page_state(page, NR_ANON_THPS); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); } if (unlikely(PageKsm(page))) return; @@ -1256,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __inc_node_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1273,18 +1268,42 @@ void page_add_new_anon_rmap(struct page *page, * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page) +void page_add_file_rmap(struct page *page, bool compound) { + int i, nr = 1; + + VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (atomic_inc_and_test(&page->_mapcount)) { - __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + if (compound && PageTransHuge(page)) { + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_inc_and_test(&page[i]._mapcount)) + nr++; + } + if (!atomic_inc_and_test(compound_mapcount_ptr(page))) + goto out; + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + } else { + if (PageTransCompound(page)) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + SetPageDoubleMap(compound_head(page)); + if (PageMlocked(page)) + clear_page_mlock(compound_head(page)); + } + if (!atomic_inc_and_test(&page->_mapcount)) + goto out; } + __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); +out: unlock_page_memcg(page); } -static void page_remove_file_rmap(struct page *page) +static void page_remove_file_rmap(struct page *page, bool compound) { + int i, nr = 1; + + VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ @@ -1295,15 +1314,26 @@ static void page_remove_file_rmap(struct page *page) } /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + if (compound && PageTransHuge(page)) { + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + goto out; + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + } else { + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + } /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * We use the irq-unsafe __{inc|mod}_zone_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_zone_page_state(page, NR_FILE_MAPPED); + __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) @@ -1326,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __dec_node_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1345,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) { - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); deferred_split_huge_page(page); } } @@ -1359,11 +1389,8 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) { - VM_BUG_ON_PAGE(compound && !PageHuge(page), page); - page_remove_file_rmap(page); - return; - } + if (!PageAnon(page)) + return page_remove_file_rmap(page, compound); if (compound) return page_remove_anon_compound_rmap(page); @@ -1377,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound) * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_zone_page_state(page, NR_ANON_PAGES); + __dec_node_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1427,7 +1454,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, goto out; } - pte = page_check_address(page, mm, address, &ptl, 0); + pte = page_check_address(page, mm, address, &ptl, + PageTransCompound(page)); if (!pte) goto out; @@ -1438,8 +1466,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { - /* Holding pte lock, we do *not* need mmap_sem here */ - mlock_vma_page(page); + /* PTE-mapped THP are never mlocked */ + if (!PageTransCompound(page)) { + /* + * Holding pte lock, we do *not* need + * mmap_sem here + */ + mlock_vma_page(page); + } ret = SWAP_MLOCK; goto out_unmap; } diff --git a/mm/shmem.c b/mm/shmem.c index 24463b67b6ef..2ac19a61d565 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/khugepaged.h> static struct vfsmount *shm_mnt; @@ -97,14 +98,6 @@ struct shmem_falloc { pgoff_t nr_unswapped; /* how often writepage refused to swap out */ }; -/* Flag allocation requirements to shmem_getpage */ -enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ - SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ - SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ -}; - #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -124,7 +117,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); -static inline int shmem_getpage(struct inode *inode, pgoff_t index, +int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, @@ -173,10 +166,13 @@ static inline int shmem_reacct_size(unsigned long flags, * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags) +static inline int shmem_acct_block(unsigned long flags, long pages) { - return (flags & VM_NORESERVE) ? - security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; + if (!(flags & VM_NORESERVE)) + return 0; + + return security_vm_enough_memory_mm(current->mm, + pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) @@ -192,6 +188,7 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static struct file_system_type shmem_fs_type; static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -249,6 +246,53 @@ static void shmem_recalc_inode(struct inode *inode) } } +bool shmem_charge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + + if (shmem_acct_block(info->flags, pages)) + return false; + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + inode->i_mapping->nrpages += pages; + + if (!sbinfo->max_blocks) + return true; + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) { + inode->i_mapping->nrpages -= pages; + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + return false; + } + percpu_counter_add(&sbinfo->used_blocks, pages); + return true; +} + +void shmem_uncharge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + inode->i_blocks -= pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); +} + /* * Replace item expected in radix tree by a new item, while holding tree lock. */ @@ -289,36 +333,256 @@ static bool shmem_confirm_swap(struct address_space *mapping, } /* + * Definitions for "huge tmpfs": tmpfs mounted with the huge= option + * + * SHMEM_HUGE_NEVER: + * disables huge pages for the mount; + * SHMEM_HUGE_ALWAYS: + * enables huge pages for the mount; + * SHMEM_HUGE_WITHIN_SIZE: + * only allocate huge pages if the page will be fully within i_size, + * also respect fadvise()/madvise() hints; + * SHMEM_HUGE_ADVISE: + * only allocate huge pages if requested with fadvise()/madvise(); + */ + +#define SHMEM_HUGE_NEVER 0 +#define SHMEM_HUGE_ALWAYS 1 +#define SHMEM_HUGE_WITHIN_SIZE 2 +#define SHMEM_HUGE_ADVISE 3 + +/* + * Special values. + * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +/* ifdef here to avoid bloating shmem.o when not necessary */ + +int shmem_huge __read_mostly; + +static int shmem_parse_huge(const char *str) +{ + if (!strcmp(str, "never")) + return SHMEM_HUGE_NEVER; + if (!strcmp(str, "always")) + return SHMEM_HUGE_ALWAYS; + if (!strcmp(str, "within_size")) + return SHMEM_HUGE_WITHIN_SIZE; + if (!strcmp(str, "advise")) + return SHMEM_HUGE_ADVISE; + if (!strcmp(str, "deny")) + return SHMEM_HUGE_DENY; + if (!strcmp(str, "force")) + return SHMEM_HUGE_FORCE; + return -EINVAL; +} + +static const char *shmem_format_huge(int huge) +{ + switch (huge) { + case SHMEM_HUGE_NEVER: + return "never"; + case SHMEM_HUGE_ALWAYS: + return "always"; + case SHMEM_HUGE_WITHIN_SIZE: + return "within_size"; + case SHMEM_HUGE_ADVISE: + return "advise"; + case SHMEM_HUGE_DENY: + return "deny"; + case SHMEM_HUGE_FORCE: + return "force"; + default: + VM_BUG_ON(1); + return "bad_val"; + } +} + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + LIST_HEAD(list), *pos, *next; + struct inode *inode; + struct shmem_inode_info *info; + struct page *page; + unsigned long batch = sc ? sc->nr_to_scan : 128; + int removed = 0, split = 0; + + if (list_empty(&sbinfo->shrinklist)) + return SHRINK_STOP; + + spin_lock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &sbinfo->shrinklist) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + + /* pin the inode */ + inode = igrab(&info->vfs_inode); + + /* inode is about to be evicted */ + if (!inode) { + list_del_init(&info->shrinklist); + removed++; + goto next; + } + + /* Check if there's anything to gain */ + if (round_up(inode->i_size, PAGE_SIZE) == + round_up(inode->i_size, HPAGE_PMD_SIZE)) { + list_del_init(&info->shrinklist); + removed++; + iput(inode); + goto next; + } + + list_move(&info->shrinklist, &list); +next: + if (!--batch) + break; + } + spin_unlock(&sbinfo->shrinklist_lock); + + list_for_each_safe(pos, next, &list) { + int ret; + + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + + if (nr_to_split && split >= nr_to_split) { + iput(inode); + continue; + } + + page = find_lock_page(inode->i_mapping, + (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); + if (!page) + goto drop; + + if (!PageTransHuge(page)) { + unlock_page(page); + put_page(page); + goto drop; + } + + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + + if (ret) { + /* split failed: leave it on the list */ + iput(inode); + continue; + } + + split++; +drop: + list_del_init(&info->shrinklist); + removed++; + iput(inode); + } + + spin_lock(&sbinfo->shrinklist_lock); + list_splice_tail(&list, &sbinfo->shrinklist); + sbinfo->shrinklist_len -= removed; + spin_unlock(&sbinfo->shrinklist_lock); + + return split; +} + +static long shmem_unused_huge_scan(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (!READ_ONCE(sbinfo->shrinklist_len)) + return SHRINK_STOP; + + return shmem_unused_huge_shrink(sbinfo, sc, 0); +} + +static long shmem_unused_huge_count(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + return READ_ONCE(sbinfo->shrinklist_len); +} +#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ + +#define shmem_huge SHMEM_HUGE_DENY + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ + +/* * Like add_to_page_cache_locked, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, void *expected) { - int error; + int error, nr = hpage_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(index != round_down(index, nr), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON(expected && PageTransHuge(page)); - get_page(page); + page_ref_add(page, nr); page->mapping = mapping; page->index = index; spin_lock_irq(&mapping->tree_lock); - if (!expected) + if (PageTransHuge(page)) { + void __rcu **results; + pgoff_t idx; + int i; + + error = 0; + if (radix_tree_gang_lookup_slot(&mapping->page_tree, + &results, &idx, index, 1) && + idx < index + HPAGE_PMD_NR) { + error = -EEXIST; + } + + if (!error) { + for (i = 0; i < HPAGE_PMD_NR; i++) { + error = radix_tree_insert(&mapping->page_tree, + index + i, page + i); + VM_BUG_ON(error); + } + count_vm_event(THP_FILE_ALLOC); + } + } else if (!expected) { error = radix_tree_insert(&mapping->page_tree, index, page); - else + } else { error = shmem_radix_tree_replace(mapping, index, expected, page); + } + if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); + mapping->nrpages += nr; + if (PageTransHuge(page)) + __inc_node_page_state(page, NR_SHMEM_THPS); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; spin_unlock_irq(&mapping->tree_lock); - put_page(page); + page_ref_sub(page, nr); } return error; } @@ -331,12 +595,14 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) struct address_space *mapping = page->mapping; int error; + VM_BUG_ON_PAGE(PageCompound(page), page); + spin_lock_irq(&mapping->tree_lock); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); - __dec_zone_page_state(page, NR_SHMEM); + __dec_node_page_state(page, NR_FILE_PAGES); + __dec_node_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); put_page(page); BUG_ON(error); @@ -510,10 +776,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } + VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); + if (!trylock_page(page)) continue; + + if (PageTransTail(page)) { + /* Middle of THP: zero out the page */ + clear_highpage(page); + unlock_page(page); + continue; + } else if (PageTransHuge(page)) { + if (index == round_down(end, HPAGE_PMD_NR)) { + /* + * Range ends in the middle of THP: + * zero out the page + */ + clear_highpage(page); + unlock_page(page); + continue; + } + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + } + if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (page_mapping(page) == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } @@ -589,8 +878,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } lock_page(page); + + if (PageTransTail(page)) { + /* Middle of THP: zero out the page */ + clear_highpage(page); + unlock_page(page); + /* + * Partial thp truncate due 'start' in middle + * of THP: don't need to look on these pages + * again on !pvec.nr restart. + */ + if (index != round_down(end, HPAGE_PMD_NR)) + start++; + continue; + } else if (PageTransHuge(page)) { + if (index == round_down(end, HPAGE_PMD_NR)) { + /* + * Range ends in the middle of THP: + * zero out the page + */ + clear_highpage(page); + unlock_page(page); + continue; + } + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + } + if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageTail(page), page); + if (page_mapping(page) == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); } else { @@ -607,10 +924,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index++; } - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) @@ -627,9 +944,9 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); return 0; @@ -639,6 +956,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = inode_change_ok(inode, attr); @@ -674,6 +992,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + + /* + * Part of the huge page can be beyond i_size: subject + * to shrink under memory pressure. + */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + spin_lock(&sbinfo->shrinklist_lock); + if (list_empty(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } } } @@ -686,11 +1018,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); + if (!list_empty(&info->shrinklist)) { + spin_lock(&sbinfo->shrinklist_lock); + if (!list_empty(&info->shrinklist)) { + list_del_init(&info->shrinklist); + sbinfo->shrinklist_len--; + } + spin_unlock(&sbinfo->shrinklist_lock); + } if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -773,9 +1114,9 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, delete_from_swap_cache(*pagep); set_page_dirty(*pagep); if (!error) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped--; - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); swap_free(swap); } } @@ -848,6 +1189,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; + VM_BUG_ON_PAGE(PageCompound(page), page); BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; @@ -922,10 +1264,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); @@ -984,24 +1326,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #define vm_policy vm_private_data #endif +static void shmem_pseudo_vma_init(struct vm_area_struct *vma, + struct shmem_inode_info *info, pgoff_t index) +{ + /* Create a pseudo vma that just contains the policy */ + vma->vm_start = 0; + /* Bias interleave by inode number to distribute better across nodes */ + vma->vm_pgoff = index + info->vfs_inode.i_ino; + vma->vm_ops = NULL; + vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); +} + +static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) +{ + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(vma->vm_policy); +} + static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct page *page; - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - + shmem_pseudo_vma_init(&pvma, info, index); page = swapin_readahead(swap, gfp, &pvma, 0); + shmem_pseudo_vma_destroy(&pvma); - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); + return page; +} +static struct page *shmem_alloc_hugepage(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct vm_area_struct pvma; + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR); + void __rcu **results; + struct page *page; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return NULL; + + rcu_read_lock(); + if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + hindex, 1) && idx < hindex + HPAGE_PMD_NR) { + rcu_read_unlock(); + return NULL; + } + rcu_read_unlock(); + + shmem_pseudo_vma_init(&pvma, info, hindex); + page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + shmem_pseudo_vma_destroy(&pvma); + if (page) + prep_transhuge_page(page); return page; } @@ -1011,23 +1392,51 @@ static struct page *shmem_alloc_page(gfp_t gfp, struct vm_area_struct pvma; struct page *page; - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + shmem_pseudo_vma_init(&pvma, info, index); + page = alloc_page_vma(gfp, &pvma, 0); + shmem_pseudo_vma_destroy(&pvma); - page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + return page; +} + +static struct page *shmem_alloc_and_acct_page(gfp_t gfp, + struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, + pgoff_t index, bool huge) +{ + struct page *page; + int nr; + int err = -ENOSPC; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + huge = false; + nr = huge ? HPAGE_PMD_NR : 1; + + if (shmem_acct_block(info->flags, nr)) + goto failed; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - nr) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, nr); + } + + if (huge) + page = shmem_alloc_hugepage(gfp, info, index); + else + page = shmem_alloc_page(gfp, info, index); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); + return page; } - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); - - return page; + err = -ENOMEM; + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -nr); +unacct: + shmem_unacct_blocks(info->flags, nr); +failed: + return ERR_PTR(err); } /* @@ -1084,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + __inc_node_page_state(newpage, NR_FILE_PAGES); + __dec_node_page_state(oldpage, NR_FILE_PAGES); } spin_unlock_irq(&swap_mapping->tree_lock); @@ -1132,12 +1541,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; + enum sgp_type sgp_huge = sgp; + pgoff_t hindex = index; int error; int once = 0; int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; + if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) + sgp = SGP_CACHE; repeat: swap.val = 0; page = find_lock_entry(mapping, index); @@ -1240,10 +1653,10 @@ repeat: mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); if (sgp == SGP_WRITE) mark_page_accessed(page); @@ -1253,51 +1666,111 @@ repeat: swap_free(swap); } else { - if (shmem_acct_block(info->flags)) { - error = -ENOSPC; - goto failed; + /* shmem_symlink() */ + if (mapping->a_ops != &shmem_aops) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + goto alloc_nohuge; + if (shmem_huge == SHMEM_HUGE_FORCE) + goto alloc_huge; + switch (sbinfo->huge) { + loff_t i_size; + pgoff_t off; + case SHMEM_HUGE_NEVER: + goto alloc_nohuge; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(index, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + goto alloc_huge; + /* fallthrough */ + case SHMEM_HUGE_ADVISE: + if (sgp_huge == SGP_HUGE) + goto alloc_huge; + /* TODO: implement fadvise() hints */ + goto alloc_nohuge; + } + +alloc_huge: + page = shmem_alloc_and_acct_page(gfp, info, sbinfo, + index, true); + if (IS_ERR(page)) { +alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, + index, false); } - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) { - error = -ENOSPC; - goto unacct; + if (IS_ERR(page)) { + int retry = 5; + error = PTR_ERR(page); + page = NULL; + if (error != -ENOSPC) + goto failed; + /* + * Try to reclaim some spece by splitting a huge page + * beyond i_size on the filesystem. + */ + while (retry--) { + int ret; + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) + goto alloc_nohuge; } - percpu_counter_inc(&sbinfo->used_blocks); + goto failed; } - page = shmem_alloc_page(gfp, info, index); - if (!page) { - error = -ENOMEM; - goto decused; - } + if (PageTransHuge(page)) + hindex = round_down(index, HPAGE_PMD_NR); + else + hindex = index; + if (sgp == SGP_WRITE) __SetPageReferenced(page); error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, - false); + PageTransHuge(page)); if (error) - goto decused; - error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + goto unacct; + error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, + compound_order(page)); if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, + error = shmem_add_to_page_cache(page, mapping, hindex, NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - goto decused; + mem_cgroup_cancel_charge(page, memcg, + PageTransHuge(page)); + goto unacct; } - mem_cgroup_commit_charge(page, memcg, false, false); + mem_cgroup_commit_charge(page, memcg, false, + PageTransHuge(page)); lru_cache_add_anon(page); - spin_lock(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; + spin_lock_irq(&info->lock); + info->alloced += 1 << compound_order(page); + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); alloced = true; + if (PageTransHuge(page) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + hindex + HPAGE_PMD_NR - 1) { + /* + * Part of the huge page is beyond i_size: subject + * to shrink under memory pressure. + */ + spin_lock(&sbinfo->shrinklist_lock); + if (list_empty(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. */ @@ -1309,10 +1782,15 @@ clear: * but SGP_FALLOC on a page fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ - if (sgp != SGP_WRITE) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + if (sgp != SGP_WRITE && !PageUptodate(page)) { + struct page *head = compound_head(page); + int i; + + for (i = 0; i < (1 << compound_order(head)); i++) { + clear_highpage(head + i); + flush_dcache_page(head + i); + } + SetPageUptodate(head); } } @@ -1322,24 +1800,30 @@ clear: if (alloced) { ClearPageDirty(page); delete_from_page_cache(page); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); } error = -EINVAL; goto unlock; } - *pagep = page; + *pagep = page + index - hindex; return 0; /* * Error recovery. */ -decused: - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); unacct: - shmem_unacct_blocks(info->flags, 1); + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, + 1 << compound_order(page)); + shmem_unacct_blocks(info->flags, 1 << compound_order(page)); + + if (PageTransHuge(page)) { + unlock_page(page); + put_page(page); + goto alloc_nohuge; + } failed: if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; @@ -1350,9 +1834,9 @@ unlock: } if (error == -ENOSPC && !once++) { info = SHMEM_I(inode); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); goto repeat; } if (error == -EEXIST) /* from above or from radix_tree_insert */ @@ -1364,6 +1848,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + enum sgp_type sgp; int error; int ret = VM_FAULT_LOCKED; @@ -1425,13 +1910,107 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + sgp = SGP_CACHE; + if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; + else if (vma->vm_flags & VM_NOHUGEPAGE) + sgp = SGP_NOHUGE; + + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret; } +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long uaddr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, + unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long addr; + unsigned long offset; + unsigned long inflated_len; + unsigned long inflated_addr; + unsigned long inflated_offset; + + if (len > TASK_SIZE) + return -ENOMEM; + + get_area = current->mm->get_unmapped_area; + addr = get_area(file, uaddr, len, pgoff, flags); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return addr; + if (IS_ERR_VALUE(addr)) + return addr; + if (addr & ~PAGE_MASK) + return addr; + if (addr > TASK_SIZE - len) + return addr; + + if (shmem_huge == SHMEM_HUGE_DENY) + return addr; + if (len < HPAGE_PMD_SIZE) + return addr; + if (flags & MAP_FIXED) + return addr; + /* + * Our priority is to support MAP_SHARED mapped hugely; + * and support MAP_PRIVATE mapped hugely too, until it is COWed. + * But if caller specified an address hint, respect that as before. + */ + if (uaddr) + return addr; + + if (shmem_huge != SHMEM_HUGE_FORCE) { + struct super_block *sb; + + if (file) { + VM_BUG_ON(file->f_op != &shmem_file_operations); + sb = file_inode(file)->i_sb; + } else { + /* + * Called directly from mm/mmap.c, or drivers/char/mem.c + * for "/dev/zero", to create a shared anonymous object. + */ + if (IS_ERR(shm_mnt)) + return addr; + sb = shm_mnt->mnt_sb; + } + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + return addr; + } + + offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); + if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + return addr; + if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + return addr; + + inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + if (inflated_len > TASK_SIZE) + return addr; + if (inflated_len < len) + return addr; + + inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); + if (IS_ERR_VALUE(inflated_addr)) + return addr; + if (inflated_addr & ~PAGE_MASK) + return addr; + + inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_addr += offset - inflated_offset; + if (inflated_offset > offset) + inflated_addr += HPAGE_PMD_SIZE; + + if (inflated_addr > TASK_SIZE - len) + return addr; + return inflated_addr; +} + #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { @@ -1456,7 +2035,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; - spin_lock(&info->lock); + spin_lock_irq(&info->lock); if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, user)) goto out_nomem; @@ -1471,7 +2050,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) retval = 0; out_nomem: - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); return retval; } @@ -1479,6 +2058,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } return 0; } @@ -1504,6 +2088,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); @@ -1589,12 +2174,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, i_size_write(inode, pos + copied); if (!PageUptodate(page)) { + struct page *head = compound_head(page); + if (PageTransCompound(page)) { + int i; + + for (i = 0; i < HPAGE_PMD_NR; i++) { + if (head + i == page) + continue; + clear_highpage(head + i); + flush_dcache_page(head + i); + } + } if (copied < PAGE_SIZE) { unsigned from = pos & (PAGE_SIZE - 1); zero_user_segments(page, 0, from, from + copied, PAGE_SIZE); } - SetPageUptodate(page); + SetPageUptodate(head); } set_page_dirty(page); unlock_page(page); @@ -2225,9 +2821,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_SHIFT, - ((loff_t)index << PAGE_SHIFT) - 1, true); + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } goto undone; } @@ -2858,11 +3456,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, sbinfo->gid = make_kgid(current_user_ns(), gid); if (!gid_valid(sbinfo->gid)) goto bad_val; +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + } else if (!strcmp(this_char, "huge")) { + int huge; + huge = shmem_parse_huge(value); + if (huge < 0) + goto bad_val; + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER) + goto bad_val; + sbinfo->huge = huge; +#endif +#ifdef CONFIG_NUMA } else if (!strcmp(this_char,"mpol")) { mpol_put(mpol); mpol = NULL; if (mpol_parse_str(value, &mpol)) goto bad_val; +#endif } else { pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; @@ -2908,6 +3519,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) goto out; error = 0; + sbinfo->huge = config.huge; sbinfo->max_blocks = config.max_blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2941,6 +3553,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ + if (sbinfo->huge) + seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); +#endif shmem_show_mpol(seq, sbinfo->mpol); return 0; } @@ -3070,6 +3687,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; + spin_lock_init(&sbinfo->shrinklist_lock); + INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; @@ -3159,6 +3778,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, + .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, @@ -3231,6 +3851,10 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + .nr_cached_objects = shmem_unused_huge_count, + .free_cached_objects = shmem_unused_huge_scan, +#endif }; static const struct vm_operations_struct shmem_vm_ops = { @@ -3280,6 +3904,13 @@ int __init shmem_init(void) pr_err("Could not kern_mount tmpfs\n"); goto out1; } + +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + else + shmem_huge = 0; /* just in case it was patched */ +#endif return 0; out1: @@ -3291,6 +3922,91 @@ out3: return error; } +#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +static ssize_t shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int values[] = { + SHMEM_HUGE_ALWAYS, + SHMEM_HUGE_WITHIN_SIZE, + SHMEM_HUGE_ADVISE, + SHMEM_HUGE_NEVER, + SHMEM_HUGE_DENY, + SHMEM_HUGE_FORCE, + }; + int i, count; + + for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { + const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; + + count += sprintf(buf + count, fmt, + shmem_format_huge(values[i])); + } + buf[count - 1] = '\n'; + return count; +} + +static ssize_t shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + char tmp[16]; + int huge; + + if (count + 1 > sizeof(tmp)) + return -EINVAL; + memcpy(tmp, buf, count); + tmp[count] = '\0'; + if (count && tmp[count - 1] == '\n') + tmp[count - 1] = '\0'; + + huge = shmem_parse_huge(tmp); + if (huge == -EINVAL) + return -EINVAL; + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + shmem_huge = huge; + if (shmem_huge < SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + return count; +} + +struct kobj_attribute shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); + +bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + loff_t i_size; + pgoff_t off; + + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + return false; + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; + case SHMEM_HUGE_ADVISE: + /* TODO: implement fadvise() hints */ + return (vma->vm_flags & VM_HUGEPAGE); + default: + VM_BUG_ON(1); + return false; + } +} +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ + #else /* !CONFIG_SHMEM */ /* @@ -3333,6 +4049,15 @@ void shmem_unlock_mapping(struct address_space *mapping) { } +#ifdef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif + void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); @@ -3459,6 +4184,13 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < + (vma->vm_end & HPAGE_PMD_MASK)) { + khugepaged_enter(vma, vma->vm_flags); + } + return 0; } diff --git a/mm/slab.c b/mm/slab.c index cc8bbc1e6bc9..09771ed3e693 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1236,61 +1236,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } -#ifdef CONFIG_SLAB_FREELIST_RANDOM -static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, - size_t count) -{ - size_t i; - unsigned int rand; - - for (i = 0; i < count; i++) - list[i] = i; - - /* Fisher-Yates shuffle */ - for (i = count - 1; i > 0; i--) { - rand = prandom_u32_state(state); - rand %= (i + 1); - swap(list[i], list[rand]); - } -} - -/* Create a random sequence per cache */ -static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) -{ - unsigned int seed, count = cachep->num; - struct rnd_state state; - - if (count < 2) - return 0; - - /* If it fails, we will just use the global lists */ - cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); - if (!cachep->random_seq) - return -ENOMEM; - - /* Get best entropy at this stage */ - get_random_bytes_arch(&seed, sizeof(seed)); - prandom_seed_state(&state, seed); - - freelist_randomize(&state, cachep->random_seq, count); - return 0; -} - -/* Destroy the per-cache random freelist sequence */ -static void cache_random_seq_destroy(struct kmem_cache *cachep) -{ - kfree(cachep->random_seq); - cachep->random_seq = NULL; -} -#else -static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) -{ - return 0; -} -static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } -#endif /* CONFIG_SLAB_FREELIST_RANDOM */ - - /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). @@ -2535,7 +2480,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) union freelist_init_state { struct { unsigned int pos; - freelist_idx_t *list; + unsigned int *list; unsigned int count; unsigned int rand; }; @@ -2554,7 +2499,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, unsigned int rand; /* Use best entropy available to define a random shift */ - get_random_bytes_arch(&rand, sizeof(rand)); + rand = get_random_int(); /* Use a random state if the pre-computed list is not available */ if (!cachep->random_seq) { @@ -2576,13 +2521,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state) return (state->list[state->pos++] + state->rand) % state->count; } +/* Swap two freelist entries */ +static void swap_free_obj(struct page *page, unsigned int a, unsigned int b) +{ + swap(((freelist_idx_t *)page->freelist)[a], + ((freelist_idx_t *)page->freelist)[b]); +} + /* * Shuffle the freelist initialization state based on pre-computed lists. * return true if the list was successfully shuffled, false otherwise. */ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) { - unsigned int objfreelist = 0, i, count = cachep->num; + unsigned int objfreelist = 0, i, rand, count = cachep->num; union freelist_init_state state; bool precomputed; @@ -2607,7 +2559,15 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) * Later use a pre-computed list for speed. */ if (!precomputed) { - freelist_randomize(&state.rnd_state, page->freelist, count); + for (i = 0; i < count; i++) + set_free_obj(page, i, i); + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(&state.rnd_state); + rand %= (i + 1); + swap_free_obj(page, i, rand); + } } else { for (i = 0; i < count; i++) set_free_obj(page, i, next_random_slot(&state)); @@ -2726,8 +2686,11 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * critical path in kmem_cache_alloc(). */ if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); + gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; + flags &= ~GFP_SLAB_BUG_MASK; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3489,8 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, n->free_objects -= cachep->num; page = list_last_entry(&n->slabs_free, struct page, lru); - list_del(&page->lru); - list_add(&page->lru, list); + list_move(&page->lru, list); } } @@ -3979,7 +3941,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; - err = cache_random_seq_create(cachep, gfp); + err = cache_random_seq_create(cachep, cachep->num, gfp); if (err) goto end; diff --git a/mm/slab.h b/mm/slab.h index dedb1a920fb8..9653f2e2591a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -42,6 +42,7 @@ struct kmem_cache { #include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/kmemleak.h> +#include <linux/random.h> /* * State of the slab allocator. @@ -253,8 +254,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (is_root_cache(s)) return 0; - ret = __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); if (ret) return ret; @@ -268,6 +268,9 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { + if (!memcg_kmem_enabled()) + return; + memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, @@ -366,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; # endif + if (s->flags & SLAB_KASAN) + return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can @@ -390,7 +395,11 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (should_failslab(s, flags)) return NULL; - return memcg_kmem_get_cache(s, flags); + if (memcg_kmem_enabled() && + ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) + return memcg_kmem_get_cache(s); + + return s; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, @@ -407,7 +416,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, s->flags, flags); kasan_slab_alloc(s, object, flags); } - memcg_kmem_put_cache(s); + + if (memcg_kmem_enabled()) + memcg_kmem_put_cache(s); } #ifndef CONFIG_SLOB @@ -464,4 +475,17 @@ int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index a65dad7fdcd1..71f0b28a1bec 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -526,8 +526,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) struct page *page; flags |= __GFP_COMP; - page = alloc_kmem_pages(flags, order); + page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); kasan_kmalloc_large(ret, size, flags); @@ -1030,6 +1030,53 @@ void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) EXPORT_SYMBOL(kmalloc_order_trace); #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Randomize a generic freelist */ +static void freelist_randomize(struct rnd_state *state, unsigned int *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp) +{ + struct rnd_state state; + + if (count < 2 || cachep->random_seq) + return 0; + + cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage of boot */ + prandom_seed_state(&state, get_random_long()); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + #ifdef CONFIG_SLABINFO #ifdef CONFIG_SLAB diff --git a/mm/slub.c b/mm/slub.c index 825ff4505336..74e7c8c30db8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } -static inline void *fixup_red_left(struct kmem_cache *s, void *p) +inline void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) p += s->red_left_pad; @@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) */ #if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; -#elif defined(CONFIG_KASAN) -static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif @@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); + off += kasan_metadata_size(s); + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ print_section("Padding ", p + off, size_from_object(s) - off); @@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); + off += kasan_metadata_size(s); + if (size_from_object(s) == off) return 1; @@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline void slab_free_hook(struct kmem_cache *s, void *x) +static inline void *slab_free_hook(struct kmem_cache *s, void *x) { + void *freeptr; + kmemleak_free_recursive(x, s->flags); /* @@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + freeptr = get_freepointer(s, x); + /* + * kasan_slab_free() may put x into memory quarantine, delaying its + * reuse. In this case the object's freelist pointer is changed. + */ kasan_slab_free(s, x); + return freeptr; } static inline void slab_free_freelist_hook(struct kmem_cache *s, @@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, void *object = head; void *tail_obj = tail ? : head; + void *freeptr; do { - slab_free_hook(s, object); - } while ((object != tail_obj) && - (object = get_freepointer(s, object))); + freeptr = slab_free_hook(s, object); + } while ((object != tail_obj) && (object = freeptr)); #endif } @@ -1405,6 +1415,109 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, return page; } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Pre-initialize the random sequence cache */ +static int init_cache_random_seq(struct kmem_cache *s) +{ + int err; + unsigned long i, count = oo_objects(s->oo); + + err = cache_random_seq_create(s, count, GFP_KERNEL); + if (err) { + pr_err("SLUB: Unable to initialize free list for %s\n", + s->name); + return err; + } + + /* Transform to an offset on the set of pages */ + if (s->random_seq) { + for (i = 0; i < count; i++) + s->random_seq[i] *= s->size; + } + return 0; +} + +/* Initialize each random sequence freelist per cache */ +static void __init init_freelist_randomization(void) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) + init_cache_random_seq(s); + + mutex_unlock(&slab_mutex); +} + +/* Get the next entry on the pre-computed freelist randomized */ +static void *next_freelist_entry(struct kmem_cache *s, struct page *page, + unsigned long *pos, void *start, + unsigned long page_limit, + unsigned long freelist_count) +{ + unsigned int idx; + + /* + * If the target page allocation failed, the number of objects on the + * page might be smaller than the usual size defined by the cache. + */ + do { + idx = s->random_seq[*pos]; + *pos += 1; + if (*pos >= freelist_count) + *pos = 0; + } while (unlikely(idx >= page_limit)); + + return (char *)start + idx; +} + +/* Shuffle the single linked freelist based on a random pre-computed sequence */ +static bool shuffle_freelist(struct kmem_cache *s, struct page *page) +{ + void *start; + void *cur; + void *next; + unsigned long idx, pos, page_limit, freelist_count; + + if (page->objects < 2 || !s->random_seq) + return false; + + freelist_count = oo_objects(s->oo); + pos = get_random_int() % freelist_count; + + page_limit = page->objects * s->size; + start = fixup_red_left(s, page_address(page)); + + /* First entry is used as the base of the freelist */ + cur = next_freelist_entry(s, page, &pos, start, page_limit, + freelist_count); + page->freelist = cur; + + for (idx = 1; idx < page->objects; idx++) { + setup_object(s, page, cur); + next = next_freelist_entry(s, page, &pos, start, page_limit, + freelist_count); + set_freepointer(s, cur, next); + cur = next; + } + setup_object(s, page, cur); + set_freepointer(s, cur, NULL); + + return true; +} +#else +static inline int init_cache_random_seq(struct kmem_cache *s) +{ + return 0; +} +static inline void init_freelist_randomization(void) { } +static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; @@ -1412,6 +1525,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) gfp_t alloc_gfp; void *start, *p; int idx, order; + bool shuffle; flags &= gfp_allowed_mask; @@ -1473,15 +1587,19 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kasan_poison_slab(page); - for_each_object_idx(p, idx, s, start, page->objects) { - setup_object(s, page, p); - if (likely(idx < page->objects)) - set_freepointer(s, p, p + s->size); - else - set_freepointer(s, p, NULL); + shuffle = shuffle_freelist(s, page); + + if (!shuffle) { + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); + } + page->freelist = fixup_red_left(s, start); } - page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -1504,8 +1622,10 @@ out: static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); + gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; + flags &= ~GFP_SLAB_BUG_MASK; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); } return allocate_slab(s, @@ -2768,16 +2888,13 @@ slab_empty: * same page) possible by specifying head and tail ptr, plus objects * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, struct page *page, - void *head, void *tail, int cnt, - unsigned long addr) +static __always_inline void do_slab_free(struct kmem_cache *s, + struct page *page, void *head, void *tail, + int cnt, unsigned long addr) { void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - - slab_free_freelist_hook(s, head, tail); - redo: /* * Determine the currently cpus per cpu slab. @@ -2811,6 +2928,27 @@ redo: } +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) +{ + slab_free_freelist_hook(s, head, tail); + /* + * slab_free_freelist_hook() could have put the items into quarantine. + * If so, no need to free them. + */ + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + return; + do_slab_free(s, page, head, tail, cnt, addr); +} + +#ifdef CONFIG_KASAN +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); +} +#endif + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); @@ -2867,7 +3005,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); p[size] = NULL; /* mark object processed */ return size; } @@ -3207,6 +3345,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { + cache_random_seq_destroy(s); free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } @@ -3252,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->object_size; + size_t size = s->object_size; int order; /* @@ -3311,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the object. */ size += 2 * sizeof(struct track); +#endif + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch @@ -3431,6 +3573,13 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto error; + } + if (!init_kmem_cache_nodes(s)) goto error; @@ -3575,7 +3724,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; - page = alloc_kmem_pages_node(node, flags, get_order(size)); + page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3656,7 +3805,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_kmem_pages(page, compound_order(page)); + __free_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); @@ -3947,6 +4096,9 @@ void __init kmem_cache_init(void) setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); + /* Setup random freelists for each cache */ + init_freelist_randomization(); + #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif diff --git a/mm/sparse.c b/mm/sparse.c index 5d0cf4540364..36d7bbb80e49 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -/* - * Although written for the SPARSEMEM_EXTREME case, this happens - * to also work for the flat array case because - * NR_SECTION_ROOTS==NR_MEM_SECTIONS. - */ +#ifdef CONFIG_SPARSEMEM_EXTREME int __section_nr(struct mem_section* ms) { unsigned long root_nr; @@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } +#else +int __section_nr(struct mem_section* ms) +{ + return (int)(ms - mem_section[0]); +} +#endif /* * During early boot, before section_mem_map is used for an actual diff --git a/mm/swap.c b/mm/swap.c index 90530ff8ed16..75c63bb2a1da 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page) struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&zone->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irqsave(zone_lru_lock(zone), flags); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); } mem_cgroup_uncharge(page); } @@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void *arg) { int i; - struct zone *zone = NULL; + struct pglist_data *pgdat = NULL; struct lruvec *lruvec; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); + struct pglist_data *pagepgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = pagezone; - spin_lock_irqsave(&zone->lru_lock, flags); + if (pagepgdat != pgdat) { + if (pgdat) + spin_unlock_irqrestore(&pgdat->lru_lock, flags); + pgdat = pagepgdat; + spin_lock_irqsave(&pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); (*move_fn)(page, lruvec, arg); } - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); + if (pgdat) + spin_unlock_irqrestore(&pgdat->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } @@ -292,6 +292,7 @@ static bool need_activate_page_drain(int cpu) void activate_page(struct page *page) { + page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); @@ -316,9 +317,10 @@ void activate_page(struct page *page) { struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); - spin_unlock_irq(&zone->lru_lock); + page = compound_head(page); + spin_lock_irq(zone_lru_lock(zone)); + __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); + spin_unlock_irq(zone_lru_lock(zone)); } #endif @@ -443,16 +445,16 @@ void lru_cache_add(struct page *page) */ void add_page_to_unevictable_list(struct page *page) { - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); } /** @@ -728,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); - struct zone *zone = NULL; + struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); unsigned int uninitialized_var(lock_batch); @@ -739,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same zone. The lock is held only if zone != NULL. + * same pgdat. The lock is held only if pgdat != NULL. */ - if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; + if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } if (is_huge_zero_page(page)) { @@ -756,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold) continue; if (PageCompound(page)) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct zone *pagezone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, + if (pgdat != locked_pgdat) { + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); lock_batch = 0; - zone = pagezone; - spin_lock_irqsave(&zone->lru_lock, flags); + locked_pgdat = pgdat; + spin_lock_irqsave(&locked_pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -787,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold) list_add(&page->lru, &pages_to_free); } - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); @@ -824,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); + !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); if (!list) SetPageLRU(page_tail); diff --git a/mm/swap_state.c b/mm/swap_state.c index c99463ac02fb..c8310a37be3a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) entry.val, page); if (likely(!error)) { address_space->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_node_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } spin_unlock_irq(&address_space->tree_lock); @@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page) set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_node_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 031713ab40ce..78cfa292a29a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2493,7 +2493,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } /* frontswap enabled? set up bit-per-page map for frontswap */ - if (frontswap_enabled) + if (IS_ENABLED(CONFIG_FRONTSWAP)) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { diff --git a/mm/truncate.c b/mm/truncate.c index 4064f8f53daa..a01cce450a26 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -155,10 +155,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { + loff_t holelen; + VM_BUG_ON_PAGE(PageTail(page), page); + + holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; if (page_mapped(page)) { unmap_mapping_range(mapping, (loff_t)page->index << PAGE_SHIFT, - PAGE_SIZE, 0); + holelen, 0); } return truncate_complete_page(mapping, page); } @@ -279,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -367,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -487,7 +491,21 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page->index != index); + + WARN_ON(page_to_pgoff(page) != index); + + /* Middle of THP: skip */ + if (PageTransTail(page)) { + unlock_page(page); + continue; + } else if (PageTransHuge(page)) { + index += HPAGE_PMD_NR - 1; + i += HPAGE_PMD_NR - 1; + /* 'end' is in the middle of THP */ + if (index == round_down(end, HPAGE_PMD_NR)) + continue; + } + ret = invalidate_inode_page(page); unlock_page(page); /* @@ -594,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page->index != index); + WARN_ON(page_to_pgoff(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; diff --git a/mm/util.c b/mm/util.c index 917e0e3d0f8e..662cddf914af 100644 --- a/mm/util.c +++ b/mm/util.c @@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page) } mapping = page->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; - return mapping; + + return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } +EXPORT_SYMBOL(page_mapping); /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) @@ -410,6 +412,12 @@ int __page_mapcount(struct page *page) int ret; ret = atomic_read(&page->_mapcount) + 1; + /* + * For file THP page->_mapcount contains total number of mapping + * of the page: no need to look into compound_mapcount. + */ + if (!PageAnon(page) && !PageHuge(page)) + return ret; page = compound_head(page); ret += atomic_read(compound_mapcount_ptr(page)) + 1; if (PageDoubleMap(page)) @@ -520,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); + free += global_node_page_state(NR_FILE_PAGES); /* * shmem pages shouldn't be counted as free in this @@ -528,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * that won't affect the overall amount of available * memory in the system. */ - free -= global_page_state(NR_SHMEM); + free -= global_node_page_state(NR_SHMEM); free += get_nr_swap_pages(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e11475cdeb7a..91f44e78c516 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_kmem_pages(page, 0); + __free_pages(page, 0); } kvfree(area->pages); @@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_kmem_pages(alloc_mask, order); + page = alloc_pages(alloc_mask, order); else - page = alloc_kmem_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ diff --git a/mm/vmscan.c b/mm/vmscan.c index c4a2f4512fca..650d26832569 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -84,6 +84,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* The highest zone to isolate pages for reclaim from */ + enum zone_type reclaim_idx; + unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc) } #endif +/* + * This misses isolated pages which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated pages will be a dominating factor. + */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); + if (get_nr_swap_pages() > 0) + nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); + + return nr; +} + +unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) +{ + unsigned long nr; + + nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_ISOLATED_ANON); + nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); return nr; } -bool zone_reclaimable(struct zone *zone) +bool pgdat_reclaimable(struct pglist_data *pgdat) { - return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < - zone_reclaimable_pages(zone) * 6; + return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < + pgdat_reclaimable_pages(pgdat) * 6; } unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); - return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } /* @@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page); - inc_zone_page_state(page, NR_VMSCAN_WRITE); + inc_node_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, @@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto keep_locked; @@ -1055,8 +1075,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Adding to swap updated mapping */ mapping = page_mapping(page); + } else if (unlikely(PageTransHuge(page))) { + /* Split file THP */ + if (split_huge_page_to_list(page, page_list)) + goto keep_locked; } + VM_BUG_ON_PAGE(PageTransHuge(page), page); + /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. @@ -1086,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() * except we already have the page isolated * and know it's dirty */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); goto keep_locked; @@ -1254,17 +1280,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, list_for_each_entry_safe(page, next, page_list, lru) { if (page_is_file_cache(page) && !PageDirty(page) && - !isolated_balloon_page(page)) { + !__PageMovable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } } - ret = shrink_page_list(&clean_pages, zone, &sc, + ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; } @@ -1342,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) return ret; } + /* - * zone->lru_lock is heavily contended. Some of the functions that + * Update LRU sizes after isolating pages. The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a santity check. + */ +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken, + unsigned long nr_taken) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + } + +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); +#endif +} + +/* + * zone_lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * @@ -1369,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long scan; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long scan, nr_pages; + LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src); scan++) { + !list_empty(src);) { struct page *page; page = lru_to_page(src); @@ -1380,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); + if (page_zonenum(page) > sc->reclaim_idx) { + list_move(&page->lru, &pages_skipped); + nr_skipped[page_zonenum(page)]++; + continue; + } + + /* + * Account for scanned and skipped separetly to avoid the pgdat + * being prematurely marked unreclaimable by pgdat_reclaimable. + */ + scan++; + switch (__isolate_lru_page(page, mode)) { case 0: - nr_taken += hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; @@ -1396,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } } + /* + * Splice any skipped pages to the start of the LRU list. Note that + * this disrupts the LRU order when reclaiming for lower zones but + * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX + * scanning would soon rescan the same pages to skip and put the + * system at risk of premature OOM. + */ + if (!list_empty(&pages_skipped)) { + int zid; + unsigned long total_skipped = 0; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_skipped[zid]) + continue; + + __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); + total_skipped += nr_skipped[zid]; + } + + /* + * Account skipped pages as a partial scan as the pgdat may be + * close to unreclaimable. If the LRU list is empty, account + * skipped pages as a full scan. + */ + scan += list_empty(src) ? total_skipped : total_skipped >> 2; + + list_splice(&pages_skipped, src); + } *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); + update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); return nr_taken; } @@ -1438,8 +1533,8 @@ int isolate_lru_page(struct page *page) struct zone *zone = page_zone(page); struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(zone_lru_lock(zone)); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1447,7 +1542,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } return ret; } @@ -1459,7 +1554,7 @@ int isolate_lru_page(struct page *page) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct zone *zone, int file, +static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; @@ -1471,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file, return 0; if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* @@ -1493,7 +1588,7 @@ static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); /* @@ -1506,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); @@ -1529,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); } @@ -1557,8 +1652,32 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } +static bool inactive_reclaimable_pages(struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) +{ + int zid; + struct zone *zone; + int file = is_file_lru(lru); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!global_reclaim(sc)) + return true; + + for (zid = sc->reclaim_idx; zid >= 0; zid--) { + zone = &pgdat->node_zones[zid]; + if (!populated_zone(zone)) + continue; + + if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + + LRU_FILE * file) >= SWAP_CLUSTER_MAX) + return true; + } + + return false; +} + /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static noinline_for_stack unsigned long @@ -1576,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - while (unlikely(too_many_isolated(zone, file, sc))) { + if (!inactive_reclaimable_pages(lruvec, sc, lru)) + return 0; + + while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ @@ -1594,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + __count_vm_events(PGSCAN_DIRECT, nr_scanned); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, &nr_dirty, &nr_unqueued_dirty, &nr_congested, &nr_writeback, &nr_immediate, false); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (global_reclaim(sc)) { if (current_is_kswapd()) - __count_zone_vm_events(PGSTEAL_KSWAPD, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); else - __count_zone_vm_events(PGSTEAL_DIRECT, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); @@ -1655,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - set_bit(ZONE_WRITEBACK, &zone->flags); + set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * Legacy memcg will stall in page writeback so avoid forcibly @@ -1667,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - set_bit(ZONE_CONGESTED, &zone->flags); + set_bit(PGDAT_CONGESTED, &pgdat->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_DIRTY and kswapd will start writing pages from + * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - set_bit(ZONE_DIRTY, &zone->flags); + set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked marked for immediate @@ -1695,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle()) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, sc->priority, file); return nr_reclaimed; } @@ -1709,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone->lru_lock across the whole operation. But if + * appropriate to hold zone_lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone->lru_lock around each page. It's impossible to balance + * should drop zone_lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. @@ -1725,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *pages_to_free, enum lru_list lru) { - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long pgmoved = 0; struct page *page; int nr_pages; while (!list_empty(list)) { page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1748,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); } @@ -1777,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -1786,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); - __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); + __count_vm_events(PGREFILL, nr_scanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -1844,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); /* * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This @@ -1855,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan, move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); @@ -1888,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file, + struct scan_control *sc) { unsigned long inactive_ratio; unsigned long inactive; unsigned long active; unsigned long gb; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int zid; /* * If we don't have swap space, anonymous page deactivation @@ -1905,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file) inactive = lruvec_lru_size(lruvec, file * LRU_FILE); active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + /* + * For zone-constrained allocations, it is necessary to check if + * deactivations are required for lowmem to be reclaimed. This + * calculates the inactive/active pages available in eligible zones. + */ + for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + unsigned long inactive_zone, active_zone; + + if (!populated_zone(zone)) + continue; + + inactive_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE)); + active_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + + inactive -= min(inactive, inactive_zone); + active -= min(active, active_zone); + } + gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); @@ -1918,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru))) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -1950,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; @@ -1971,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * well. */ if (current_is_kswapd()) { - if (!zone_reclaimable(zone)) + if (!pgdat_reclaimable(pgdat)) force_scan = true; if (!mem_cgroup_online(memcg)) force_scan = true; @@ -2017,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long zonefile; - unsigned long zonefree; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long total_high_wmark = 0; + + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); - zonefree = zone_page_state(zone, NR_FREE_PAGES); - zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; - if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { scan_balance = SCAN_ANON; goto out; } @@ -2039,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true) && + if (!inactive_list_is_low(lruvec, true, sc) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2071,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -2092,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -2168,12 +2321,12 @@ static inline void init_tlb_ubc(void) #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *lru_pages) { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2281,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2306,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_zone() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; + int z; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -2346,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) - inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order, 0, 0)) { - case COMPACT_PARTIAL: - case COMPACT_CONTINUE: - return false; - default: - return true; + for (z = 0; z <= sc->reclaim_idx; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; + + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + /* check next zone */ + ; + } } + return true; } -static bool shrink_zone(struct zone *zone, struct scan_control *sc, - bool is_classzone) +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; @@ -2373,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, do { struct mem_cgroup *root = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = sc->priority, }; - unsigned long zone_lru_pages = 0; + unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2397,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_zone_memcg(zone, memcg, sc, &lru_pages); - zone_lru_pages += lru_pages; + shrink_node_memcg(pgdat, memcg, sc, &lru_pages); + node_lru_pages += lru_pages; - if (memcg && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), + if (!global_reclaim(sc)) + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->nr_scanned - scanned, lru_pages); @@ -2413,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the - * zone. + * node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will @@ -2431,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. */ - if (global_reclaim(sc) && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + if (global_reclaim(sc)) + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, sc->nr_scanned - nr_scanned, - zone_lru_pages); + node_lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2449,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); return reclaimable; @@ -2459,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Returns true if compaction should go ahead for a high-order request, or * the high-order allocation would succeed without compaction. */ -static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { - unsigned long balance_gap, watermark; + unsigned long watermark; bool watermark_ok; /* @@ -2470,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); + watermark = high_wmark_pages(zone) + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. */ - if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) + if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2497,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over high_wmark_pages(zone). - * Because: - * a) The caller may be trying to free *extra* pages to satisfy a higher-order - * allocation or - * b) The target zone may be at high_wmark_pages(zone) but the lower zones - * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' - * zone defense algorithm. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ @@ -2515,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; - enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + pg_data_t *last_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2523,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; - if (buffer_heads_over_limit) + if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; + sc->reclaim_idx = gfp_zone(sc->gfp_mask); + } for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - enum zone_type classzone_idx; - - if (!populated_zone(zone)) - continue; - - classzone_idx = requested_highidx; - while (!populated_zone(zone->zone_pgdat->node_zones + - classzone_idx)) - classzone_idx--; - + sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. @@ -2548,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; if (sc->priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; /* Let kswapd poll it */ /* @@ -2562,20 +2705,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && - zonelist_zone_idx(z) <= requested_highidx && - compaction_ready(zone, sc->order, requested_highidx)) { + compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* + * Shrink each node in the zonelist once. If the + * zonelist is ordered by zone (not the default) then a + * node may be shrunk multiple times but in that case + * the user prefers lower zones being preserved. + */ + if (zone->zone_pgdat == last_pgdat) + continue; + + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; @@ -2583,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); + /* See comment about same check for global reclaim above */ + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + shrink_node(zone->zone_pgdat, sc); } /* @@ -2619,7 +2774,7 @@ retry: delayacct_freepages_start(); if (global_reclaim(sc)) - count_vm_event(ALLOCSTALL); + __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, @@ -2686,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!populated_zone(zone) || - zone_reclaimable_pages(zone) == 0) + pgdat_reclaimable_pages(pgdat) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2701,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->classzone_idx = min(pgdat->classzone_idx, + pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, (enum zone_type)ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -2809,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, @@ -2827,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask); + gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2838,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, - struct zone *zone, + pg_data_t *pgdat, unsigned long *nr_scanned) { struct scan_control sc = { @@ -2848,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; unsigned long lru_pages; @@ -2857,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. - * if we don't reclaim here, the shrink_zone from balance_pgdat + * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone_memcg(zone, memcg, &sc, &lru_pages); + shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2886,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, @@ -2904,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2914,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct zone *zone, struct scan_control *sc) +static void age_active_anon(struct pglist_data *pgdat, + struct scan_control *sc) { struct mem_cgroup *memcg; @@ -2923,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2933,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, bool highorder, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone) + balance_gap; + unsigned long mark = high_wmark_pages(zone); + + if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return false; /* - * When checking from pgdat_balanced(), kswapd should stop and sleep - * when it reaches the high order-0 watermark and let kcompactd take - * over. Other callers such as wakeup_kswapd() want to determine the - * true high-order watermark. + * If any eligible zone is balanced then the node is not considered + * to be congested or dirty */ - if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { - mark += (1UL << order); - order = 0; - } - - return zone_watermark_ok_safe(zone, order, mark, classzone_idx); -} - -/* - * pgdat_balanced() is used when checking if a node is balanced. - * - * For order-0, all zones must be balanced! - * - * For high-order allocations only zones that meet watermarks and are in a - * zone allowed by the callers classzone_idx are added to balanced_pages. The - * total of balanced pages must be at least 25% of the zones allowed by - * classzone_idx for the node to be considered balanced. Forcing all zones to - * be balanced for high orders can cause excessive reclaim when there are - * imbalanced zones. - * The choice of 25% is due to - * o a 16M DMA zone that is balanced will not balance a zone on any - * reasonable sized machine - * o On all other machines, the top zone must be at least a reasonable - * percentage of the middle zones. For example, on 32-bit x86, highmem - * would need to be at least 256M for it to be balance a whole node. - * Similarly, on x86-64 the Normal zone would need to be at least 1G - * to balance a node on its own. These seemed like reasonable ratios. - */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) -{ - unsigned long managed_pages = 0; - unsigned long balanced_pages = 0; - int i; + clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); + clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - managed_pages += zone->managed_pages; - - /* - * A special case here: - * - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well! - */ - if (!zone_reclaimable(zone)) { - balanced_pages += zone->managed_pages; - continue; - } - - if (zone_balanced(zone, order, false, 0, i)) - balanced_pages += zone->managed_pages; - else if (!order) - return false; - } - - if (order) - return balanced_pages >= (managed_pages >> 2); - else - return true; + return true; } /* @@ -3017,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, - int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ - if (remaining) - return false; + int i; /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3040,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - return pgdat_balanced(pgdat, order, classzone_idx); + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (!zone_balanced(zone, order, classzone_idx)) + return false; + } + + return true; } /* - * kswapd shrinks the zone by the number of pages required to reach - * the high watermark. + * kswapd shrinks a node of pages that are at or below the highest usable + * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ -static bool kswapd_shrink_zone(struct zone *zone, - int classzone_idx, +static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { - unsigned long balance_gap; - bool lowmem_pressure; + struct zone *zone; + int z; - /* Reclaim above the high watermark. */ - sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; + for (z = 0; z <= sc->reclaim_idx; z++) { + zone = pgdat->node_zones + z; + if (!populated_zone(zone)) + continue; - /* - * We put equal pressure on every zone, unless one zone has way too - * many pages free already. The "too many pages" is defined as the - * high wmark plus a "gap" where the gap is either the low - * watermark or 1% of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); + } /* - * If there is no low memory pressure or the zone is balanced then no - * reclaim is necessary + * Historically care was taken to put equal pressure on all zones but + * now pressure is applied based on node LRU order. */ - lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, sc->order, false, - balance_gap, classzone_idx)) - return true; - - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - - clear_bit(ZONE_WRITEBACK, &zone->flags); + shrink_node(pgdat, sc); /* - * If a zone reaches its high watermark, consider it to be no longer - * congested. It's possible there are dirty pages backed by congested - * BDIs but as pressure is relieved, speculatively avoid congestion - * waits. + * Fragmentation may mean that the system cannot be rebalanced for + * high-order allocations. If twice the allocation size has been + * reclaimed then recheck watermarks only at order-0 to prevent + * excessive reclaim. Assume that a process requested a high-order + * can direct reclaim/compact. */ - if (zone_reclaimable(zone) && - zone_balanced(zone, sc->order, false, 0, classzone_idx)) { - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; } /* - * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at high_wmark_pages(zone). + * For kswapd, balance_pgdat() will reclaim pages across a node from zones + * that are eligible for use by the caller until at least one zone is + * balanced. * - * Returns the highest zone idx kswapd was reclaiming at - * - * There is special handling here for zones which are full of pinned pages. - * This can happen if the pages are all mlocked, or if they are all used by - * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. - * What we do is to detect the case where all pages in the zone have been - * scanned twice and there has been zero successful reclaim. Mark the zone as - * dead and from now on, only perform a short scan. Basically we're polling - * the zone for when the problem goes away. + * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the - * lower zones regardless of the number of free pages in the lower zones. This - * interoperates with the page allocator fallback scheme to ensure that aging - * of pages is balanced across the zones. + * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * or lower is eligible for reclaim until at least one usable zone is + * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, @@ -3139,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) bool raise_priority = true; sc.nr_reclaimed = 0; + sc.reclaim_idx = classzone_idx; /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning + * If the number of buffer_heads exceeds the maximum allowed + * then consider reclaiming from all zones. This has a dual + * purpose -- on 64-bit systems it is expected that + * buffer_heads are stripped during active rotation. On 32-bit + * systems, highmem pages can pin lowmem memory and shrinking + * buffers can relieve lowmem pressure. Reclaim may still not + * go ahead if all eligible zones for the original allocation + * request are balanced to avoid excessive reclaim from kswapd. */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. - */ - age_active_anon(zone, &sc); + if (buffer_heads_over_limit) { + for (i = MAX_NR_ZONES - 1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; - /* - * If the number of buffer_heads in the machine - * exceeds the maximum allowed level and this node - * has a highmem zone, force kswapd to reclaim from - * it to relieve lowmem pressure. - */ - if (buffer_heads_over_limit && is_highmem_idx(i)) { - end_zone = i; + sc.reclaim_idx = i; break; } + } - if (!zone_balanced(zone, order, false, 0, 0)) { - end_zone = i; - break; - } else { - /* - * If balanced, clear the dirty and congested - * flags - */ - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + /* + * Only reclaim if there are no eligible zones. Check from + * high to low zone as allocations prefer higher zones. + * Scanning from low to high zone would allow congestion to be + * cleared during a very small window when a small low + * zone was balanced even under extreme pressure when the + * overall node may be congested. Note that sc.reclaim_idx + * is not used as buffer_heads_over_limit may have adjusted + * it. + */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + + if (zone_balanced(zone, sc.order, classzone_idx)) + goto out; } - if (i < 0) - goto out; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. All + * pages are rotated regardless of classzone as this is + * about consistent aging. + */ + age_active_anon(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2) + if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) sc.may_writepage = 1; + /* Call soft limit reclaim before calling shrink_node. */ + sc.nr_scanned = 0; + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. + * There should be no need to raise the scanning priority if + * enough pages are already being scanned that that high + * watermark would be met at 100% efficiency. */ - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - sc.nr_scanned = 0; - - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - - /* - * There should be no need to raise the scanning - * priority if enough pages are already being scanned - * that that high watermark would be met at 100% - * efficiency. - */ - if (kswapd_shrink_zone(zone, end_zone, &sc)) - raise_priority = false; - } + if (kswapd_shrink_node(pgdat, &sc)) + raise_priority = false; /* * If the low watermark is met there is no need for processes @@ -3253,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; - } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, classzone_idx)); + } while (sc.priority >= 1); out: /* - * Return the highest zone idx we were reclaiming at so - * prepare_kswapd_sleep() makes the same decisions as here. + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. If another caller + * entered the allocator slow path while kswapd was awake, order will + * remain at the higher level. */ - return end_zone; + return sc.order; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, - int classzone_idx, int balanced_classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, + unsigned int classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3276,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3290,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, classzone_idx); remaining = schedule_timeout(HZ/10); + + /* + * If woken prematurely then reset kswapd_classzone_idx and + * order. The values will either be from a wakeup request or + * the previous request that slept prematurely. + */ + if (remaining) { + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); + } + finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } @@ -3301,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (!remaining && + prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3343,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, */ static int kswapd(void *p) { - unsigned long order, new_order; - int classzone_idx, new_classzone_idx; - int balanced_classzone_idx; + unsigned int alloc_order, reclaim_order, classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3375,38 +3449,20 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = new_order = 0; - classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; - balanced_classzone_idx = classzone_idx; + pgdat->kswapd_order = alloc_order = reclaim_order = 0; + pgdat->kswapd_classzone_idx = classzone_idx = 0; for ( ; ; ) { bool ret; - /* - * While we were reclaiming, there might have been another - * wakeup, so check the values. - */ - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; +kswapd_try_sleep: + kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, + classzone_idx); - if (order < new_order || classzone_idx > new_classzone_idx) { - /* - * Don't sleep if someone wants a larger 'order' - * allocation or has tigher zone constraints - */ - order = new_order; - classzone_idx = new_classzone_idx; - } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx, - balanced_classzone_idx); - order = pgdat->kswapd_max_order; - classzone_idx = pgdat->classzone_idx; - new_order = order; - new_classzone_idx = classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + /* Read the new order and classzone_idx */ + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = 0; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3416,11 +3472,25 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) { - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = balance_pgdat(pgdat, order, - classzone_idx); - } + if (ret) + continue; + + /* + * Reclaim begins at the requested order but if a high-order + * reclaim fails then kswapd falls back to reclaiming for + * order-0. If that happens, kswapd will consider sleeping + * for the order it finished reclaiming at (reclaim_order) + * but kcompactd is woken to compact for the original + * request (alloc_order). + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + alloc_order); + reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; + + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3436,6 +3506,7 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + int z; if (!populated_zone(zone)) return; @@ -3443,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) { - pgdat->kswapd_max_order = order; - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); - } + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, true, 0, 0)) - return; + + /* Only wake kswapd if all zones are unbalanced */ + for (z = 0; z <= classzone_idx; z++) { + zone = pgdat->node_zones + z; + if (!populated_zone(zone)) + continue; + + if (zone_balanced(zone, order, classzone_idx)) + return; + } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); @@ -3471,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, @@ -3572,12 +3650,12 @@ module_init(kswapd_init) #ifdef CONFIG_NUMA /* - * Zone reclaim mode + * Node reclaim mode * - * If non-zero call zone_reclaim when the number of free pages falls below + * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ -int zone_reclaim_mode __read_mostly; +int node_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ @@ -3585,14 +3663,14 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* - * Priority for ZONE_RECLAIM. This determines the fraction of pages + * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ -#define ZONE_RECLAIM_PRIORITY 4 +#define NODE_RECLAIM_PRIORITY 4 /* - * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; @@ -3603,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; -static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { - unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); - unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_FILE); + unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); + unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than @@ -3618,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static unsigned long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; @@ -3626,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about - * pages like swapcache and zone_unmapped_file_pages() provides + * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_UNMAP) - nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else - nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ - if (!(zone_reclaim_mode & RECLAIM_WRITE)) - delta += zone_page_state(zone, NR_FILE_DIRTY); + if (!(node_reclaim_mode & RECLAIM_WRITE)) + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -3646,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) } /* - * Try to free up some pages from this zone through reclaim. + * Try to free up some pages from this node through reclaim. */ -static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; + int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, - .priority = ZONE_RECLAIM_PRIORITY, - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, + .reclaim_idx = classzone_idx, }; cond_resched(); @@ -3675,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc, true); + shrink_node(pgdat, &sc); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } @@ -3691,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return sc.nr_reclaimed >= nr_pages; } -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { - int node_id; int ret; /* - * Zone reclaim reclaims unmapped file backed pages and + * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately - * thrown out if the zone is overallocated. So we do not reclaim - * if less than a specified percentage of the zone is used by + * thrown out if the node is overallocated. So we do not reclaim + * if less than a specified percentage of the node is used by * unmapped file backed pages. */ - if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return ZONE_RECLAIM_FULL; + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; - if (!zone_reclaimable(zone)) - return ZONE_RECLAIM_FULL; + if (!pgdat_reclaimable(pgdat)) + return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) - return ZONE_RECLAIM_NOSCAN; + return NODE_RECLAIM_NOSCAN; /* - * Only run zone reclaim on the local zone or on zones that do not + * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone_to_nid(zone); - if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return ZONE_RECLAIM_NOSCAN; + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) - return ZONE_RECLAIM_NOSCAN; + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; - ret = __zone_reclaim(zone, gfp_mask, order); - clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); + ret = __node_reclaim(pgdat, gfp_mask, order); + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3772,24 +3850,23 @@ int page_evictable(struct page *page) void check_move_unevictable_pages(struct page **pages, int nr_pages) { struct lruvec *lruvec; - struct zone *zone = NULL; + struct pglist_data *pgdat = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; - struct zone *pagezone; + struct pglist_data *pagepgdat = page_pgdat(page); pgscanned++; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (pagepgdat != pgdat) { + if (pgdat) + spin_unlock_irq(&pgdat->lru_lock); + pgdat = pagepgdat; + spin_lock_irq(&pgdat->lru_lock); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (!PageLRU(page) || !PageUnevictable(page)) continue; @@ -3805,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } - if (zone) { + if (pgdat) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); } } #endif /* CONFIG_SHMEM */ diff --git a/mm/vmstat.c b/mm/vmstat.c index cb2a67bb4158..89cec42d19ff 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu) * * vm_stat contains the global counters */ -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; -EXPORT_SYMBOL(vm_stat); +atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone) */ void refresh_zone_stat_thresholds(void) { + struct pglist_data *pgdat; struct zone *zone; int cpu; int threshold; + /* Zero current pgdat thresholds */ + for_each_online_pgdat(pgdat) { + for_each_online_cpu(cpu) { + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0; + } + } + for_each_populated_zone(zone) { + struct pglist_data *pgdat = zone->zone_pgdat; unsigned long max_drift, tolerate_drift; threshold = calculate_normal_threshold(zone); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + int pgdat_threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; + /* Base nodestat threshold on the largest populated zone. */ + pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold + = max(threshold, pgdat_threshold); + } + /* * Only set percpu_drift_mark if there is a danger that * NR_FREE_PAGES reports the low watermark is ok when in fact @@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } EXPORT_SYMBOL(__mod_zone_page_state); +void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + long x; + long t; + + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(x > t || x < -t)) { + node_page_state_add(x, pgdat, item); + x = 0; + } + __this_cpu_write(*p, x); +} +EXPORT_SYMBOL(__mod_node_page_state); + /* * Optimized increment and decrement functions. * @@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) } } +void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + node_page_state_add(v + overstep, pgdat, item); + __this_cpu_write(*p, -overstep); + } +} + void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } EXPORT_SYMBOL(__inc_zone_page_state); +void __inc_node_page_state(struct page *page, enum node_stat_item item) +{ + __inc_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__inc_node_page_state); + void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; @@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) } } +void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; + + node_page_state_add(v - overstep, pgdat, item); + __this_cpu_write(*p, overstep); + } +} + void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } EXPORT_SYMBOL(__dec_zone_page_state); +void __dec_node_page_state(struct page *page, enum node_stat_item item) +{ + __dec_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__dec_node_page_state); + #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead @@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, enum zone_stat_item item, - long delta, int overstep_mode) +static inline void mod_zone_state(struct zone *zone, + enum zone_stat_item item, long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item, void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { - mod_state(zone, item, delta, 0); + mod_zone_state(zone, item, delta, 0); } EXPORT_SYMBOL(mod_zone_page_state); -void inc_zone_state(struct zone *zone, enum zone_stat_item item) -{ - mod_state(zone, item, 1, 1); -} - void inc_zone_page_state(struct page *page, enum zone_stat_item item) { - mod_state(page_zone(page), item, 1, 1); + mod_zone_state(page_zone(page), item, 1, 1); } EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { - mod_state(page_zone(page), item, -1, -1); + mod_zone_state(page_zone(page), item, -1, -1); } EXPORT_SYMBOL(dec_zone_page_state); + +static inline void mod_node_state(struct pglist_data *pgdat, + enum node_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to node counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong the cpu if we get + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a node. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (n > t || n < -t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to node counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + node_page_state_add(z, pgdat, item); +} + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + mod_node_state(pgdat, item, delta, 0); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + mod_node_state(pgdat, item, 1, 1); +} + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_node_page_state); #else /* * Use interrupt disable to serialize counter updates @@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } EXPORT_SYMBOL(mod_zone_page_state); -void inc_zone_state(struct zone *zone, enum zone_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __inc_zone_state(zone, item); - local_irq_restore(flags); -} - void inc_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; @@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); -#endif +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_state); + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_node_page_state(pgdat, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + struct pglist_data *pgdat; + + pgdat = page_pgdat(page); + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_node_page_state(page, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_node_page_state); +#endif /* * Fold a differential into the global counters. * Returns the number of counters updated. */ -static int fold_diff(int *diff) +static int fold_diff(int *zone_diff, int *node_diff) { int i; int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) { - atomic_long_add(diff[i], &vm_stat[i]); + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); changes++; } return changes; @@ -462,9 +641,11 @@ static int fold_diff(int *diff) */ static int refresh_cpu_vm_stats(bool do_pagesets) { + struct pglist_data *pgdat; struct zone *zone; int i; - int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; for_each_populated_zone(zone) { @@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (v) { atomic_long_add(v, &zone->vm_stat[i]); - global_diff[i] += v; + global_zone_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ __this_cpu_write(p->expire, 3); @@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } #endif } - changes += fold_diff(global_diff); + + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats; + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_node_stat_diff[i], 0); + if (v) { + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + } + + changes += fold_diff(global_zone_diff, global_node_diff); return changes; } @@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets) */ void cpu_vm_stats_fold(int cpu) { + struct pglist_data *pgdat; struct zone *zone; int i; - int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { struct per_cpu_pageset *p; @@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu) v = p->vm_stat_diff[i]; p->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); - global_diff[i] += v; + global_zone_diff[i] += v; } } - fold_diff(global_diff); + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (p->vm_node_stat_diff[i]) { + int v; + + v = p->vm_node_stat_diff[i]; + p->vm_node_stat_diff[i] = 0; + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + + fold_diff(global_zone_diff, global_node_diff); } /* @@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) int v = pset->vm_stat_diff[i]; pset->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); - atomic_long_add(v, &vm_stat[i]); + atomic_long_add(v, &vm_zone_stat[i]); } } #endif #ifdef CONFIG_NUMA /* - * Determine the per node value of a stat item. + * Determine the per node value of a stat item. This function + * is called frequently in a NUMA machine, so try to be as + * frugal as possible. */ -unsigned long node_page_state(int node, enum zone_stat_item item) +unsigned long sum_zone_node_page_state(int node, + enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; int i; @@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item) return count; } +/* + * Determine the per node value of a stat item. + */ +unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item) +{ + long x = atomic_long_read(&pgdat->vm_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} #endif #ifdef CONFIG_COMPACTION @@ -691,34 +921,21 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item countes */ "nr_free_pages", - "nr_alloc_batch", - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", + "nr_zone_inactive_anon", + "nr_zone_active_anon", + "nr_zone_inactive_file", + "nr_zone_active_file", + "nr_zone_unevictable", + "nr_zone_write_pending", "nr_mlock", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", - "nr_unstable", "nr_bounce", - "nr_vmscan_write", - "nr_vmscan_immediate_reclaim", - "nr_writeback_temp", - "nr_isolated_anon", - "nr_isolated_file", - "nr_shmem", - "nr_dirtied", - "nr_written", - "nr_pages_scanned", - +#if IS_ENABLED(CONFIG_ZSMALLOC) + "nr_zspages", +#endif #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -727,11 +944,35 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_free_cma", + + /* Node-based counters */ + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", + "nr_unevictable", + "nr_isolated_anon", + "nr_isolated_file", + "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_dirty", + "nr_writeback", + "nr_writeback_temp", + "nr_shmem", + "nr_shmem_hugepages", + "nr_shmem_pmdmapped", "nr_anon_transparent_hugepages", - "nr_free_cma", + "nr_unstable", + "nr_vmscan_write", + "nr_vmscan_immediate_reclaim", + "nr_dirtied", + "nr_written", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -745,6 +986,8 @@ const char * const vmstat_text[] = { "pswpout", TEXTS_FOR_ZONES("pgalloc") + TEXTS_FOR_ZONES("allocstall") + TEXTS_FOR_ZONES("pgskip") "pgfree", "pgactivate", @@ -754,11 +997,11 @@ const char * const vmstat_text[] = { "pgmajfault", "pglazyfreed", - TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal_kswapd") - TEXTS_FOR_ZONES("pgsteal_direct") - TEXTS_FOR_ZONES("pgscan_kswapd") - TEXTS_FOR_ZONES("pgscan_direct") + "pgrefill", + "pgsteal_kswapd", + "pgsteal_direct", + "pgscan_kswapd", + "pgscan_direct", "pgscan_direct_throttle", #ifdef CONFIG_NUMA @@ -770,7 +1013,6 @@ const char * const vmstat_text[] = { "kswapd_low_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly", "pageoutrun", - "allocstall", "pgrotated", @@ -815,6 +1057,8 @@ const char * const vmstat_text[] = { "thp_fault_fallback", "thp_collapse_alloc", "thp_collapse_alloc_failed", + "thp_file_alloc", + "thp_file_mapped", "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", @@ -1174,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; +static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *compare = &pgdat->node_zones[zid]; + + if (populated_zone(compare)) + return zone == compare; + } + + /* The zone must be somewhere! */ + WARN_ON_ONCE(1); + return false; +} + static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { int i; seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + if (is_zone_first_populated(pgdat, zone)) { + seq_printf(m, "\n per-node stats"); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + node_page_state(pgdat, i)); + } + } seq_printf(m, "\n pages free %lu" "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu" + "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1192,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone_page_state(zone, NR_PAGES_SCANNED), + node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], + seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); seq_printf(m, @@ -1228,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, - "\n all_unreclaimable: %u" - "\n start_pfn: %lu" - "\n inactive_ratio: %u", - !zone_reclaimable(zone), + "\n node_unreclaimable: %u" + "\n start_pfn: %lu" + "\n node_inactive_ratio: %u", + !pgdat_reclaimable(zone->zone_pgdat), zone->zone_start_pfn, - zone->inactive_ratio); + zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } @@ -1281,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS @@ -1295,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[i] = global_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + v[i] = global_node_page_state(i); + v += NR_VM_NODE_STAT_ITEMS; + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); v += NR_VM_WRITEBACK_STAT_ITEMS; @@ -1319,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); return 0; } @@ -1384,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_stat[i]); + val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { switch (i) { - case NR_ALLOC_BATCH: case NR_PAGES_SCANNED: /* - * These are often seen to go negative in + * This is often seen to go negative in * recent kernels, but not to go permanently * negative. Whilst it would be nicer not to * have exceptions, rooting them out would be diff --git a/mm/workingset.c b/mm/workingset.c index 8a75f8d2916a..69551cfae97b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,7 +16,7 @@ /* * Double CLOCK lists * - * Per zone, two clock lists are maintained for file pages: the + * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list @@ -141,11 +141,11 @@ * * Implementation * - * For each zone's file LRU lists, a counter for inactive evictions - * and activations is maintained (zone->inactive_age). + * For each node's file LRU lists, a counter for inactive evictions + * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the zone) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache radix tree * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible @@ -153,7 +153,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - ZONES_SHIFT + NODES_SHIFT + \ + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) @@ -167,33 +167,30 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); - eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, +static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - int memcgid, nid, zid; + int memcgid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; - zid = entry & ((1UL << ZONES_SHIFT) - 1); - entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; - *zonep = NODE_DATA(nid)->node_zones + zid; + *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; } @@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, void *workingset_eviction(struct address_space *mapping, struct page *page) { struct mem_cgroup *memcg = page_memcg(page); - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_zone_lruvec(zone, memcg); + lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, zone, eviction); + return pack_shadow(memcgid, pgdat, eviction); } /** @@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the zone it was allocated in. + * evicted page in the context of the node it was allocated in. * * Returns %true if the page should be activated, %false otherwise. */ @@ -240,10 +237,10 @@ bool workingset_refault(void *shadow) unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct zone *zone; + struct pglist_data *pgdat; int memcgid; - unpack_shadow(shadow, &memcgid, &zone, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction); rcu_read_lock(); /* @@ -267,7 +264,7 @@ bool workingset_refault(void *shadow) rcu_read_unlock(); return false; } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); + lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); rcu_read_unlock(); @@ -290,10 +287,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_zone_state(zone, WORKINGSET_REFAULT); + inc_node_state(pgdat, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_zone_state(zone, WORKINGSET_ACTIVATE); + inc_node_state(pgdat, WORKINGSET_ACTIVATE); return true; } return false; @@ -305,9 +302,10 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { + struct mem_cgroup *memcg; struct lruvec *lruvec; - lock_page_memcg(page); + rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. @@ -315,12 +313,13 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ - if (!mem_cgroup_disabled() && !page_memcg(page)) + memcg = page_memcg_rcu(page); + if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); atomic_long_inc(&lruvec->inactive_age); out: - unlock_page_memcg(page); + rcu_read_unlock(); } /* @@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - if (memcg_kmem_enabled()) + if (memcg_kmem_enabled()) { pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, LRU_ALL_FILE); - else - pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + - node_page_state(sc->nid, NR_INACTIVE_FILE); + } else { + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } /* * Active cache pages are limited to 50% of memory, and shadow @@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } } BUG_ON(node->count); - inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); if (!__radix_tree_delete_node(&mapping->page_tree, node)) BUG(); @@ -491,7 +491,7 @@ static int __init workingset_init(void) max_order = fls_long(totalram_pages - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; - printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b6d4f258cb53..b0bc023d25c5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,32 +16,16 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->private: points to the first component (0-order) page - * page->index (union with page->freelist): offset of the first object - * starting in this page. For the first page, this is - * always 0, so we use this field (aka freelist) to point - * to the first free object in zspage. - * page->lru: links together all component pages (except the first page) - * of a zspage - * - * For _first_ page only: - * - * page->private: refers to the component page after the first page - * If the page is first_page for huge object, it stores handle. - * Look at size_class->huge. - * page->freelist: points to the first free object in zspage. - * Free objects are linked together using in-place - * metadata. - * page->objects: maximum number of objects we can store in this - * zspage (class->zspage_order * PAGE_SIZE / class->size) - * page->lru: links together first pages of various zspages. - * Basically forming list of zspages in a fullness group. - * page->mapping: class index and fullness group of the zspage - * page->inuse: the number of objects that are used in this zspage + * page->private: points to zspage + * page->freelist(index): links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. + * page->units: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page + * PG_owner_priv_1: indentifies the huge component page * */ @@ -66,6 +50,11 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> +#include <linux/mount.h> +#include <linux/migrate.h> +#include <linux/pagemap.h> + +#define ZSPAGE_MAGIC 0x58 /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -88,9 +77,7 @@ * Object location (<PFN>, <obj_idx>) is encoded as * as single (unsigned long) handle value. * - * Note that object index <obj_idx> is relative to system - * page <PFN> it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. + * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE. */ @@ -149,33 +136,26 @@ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above) */ -#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) -/* - * We do not maintain any list for completely empty or full pages - */ enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - ZS_EMPTY, - ZS_FULL + ZS_ALMOST_EMPTY, + ZS_ALMOST_FULL, + ZS_FULL, + NR_ZS_FULLNESS, }; enum zs_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, + CLASS_FULL, OBJ_ALLOCATED, OBJ_USED, - CLASS_ALMOST_FULL, - CLASS_ALMOST_EMPTY, + NR_ZS_STAT_TYPE, }; -#ifdef CONFIG_ZSMALLOC_STAT -#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) -#else -#define NR_ZS_STAT_TYPE (OBJ_USED + 1) -#endif - struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -184,6 +164,10 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif +#ifdef CONFIG_COMPACTION +static struct vfsmount *zsmalloc_mnt; +#endif + /* * number of size_classes */ @@ -207,35 +191,49 @@ static const int fullness_threshold_frac = 4; struct size_class { spinlock_t lock; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. */ int size; - unsigned int index; - - struct zs_size_stat stats; - + int objs_per_zspage; /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ - bool huge; + + unsigned int index; + struct zs_size_stat stats; }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetPageHugeObject(struct page *page) +{ + SetPageOwnerPriv1(page); +} + +static void ClearPageHugeObject(struct page *page) +{ + ClearPageOwnerPriv1(page); +} + +static int PageHugeObject(struct page *page) +{ + return PageOwnerPriv1(page); +} + /* * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. + * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { union { /* - * Position of next free chunk (encodes <PFN, obj_idx>) + * Free object index; * It's valid for non-allocated object */ - void *next; + unsigned long next; /* * Handle of allocated object. */ @@ -248,6 +246,7 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; + struct kmem_cache *zspage_cachep; atomic_long_t pages_allocated; @@ -263,16 +262,36 @@ struct zs_pool { #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif +#ifdef CONFIG_COMPACTION + struct inode *inode; + struct work_struct free_work; +#endif }; /* * A zspage's class index and fullness group * are encoded in its (first)page->mapping */ -#define CLASS_IDX_BITS 28 -#define FULLNESS_BITS 4 -#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) -#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + +struct zspage { + struct { + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS; + unsigned int isolated:ISOLATED_BITS; + unsigned int magic:MAGIC_VAL_BITS; + }; + unsigned int inuse; + unsigned int freeobj; + struct page *first_page; + struct list_head list; /* fullness list */ +#ifdef CONFIG_COMPACTION + rwlock_t lock; +#endif +}; struct mapping_area { #ifdef CONFIG_PGTABLE_MAPPING @@ -284,29 +303,74 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; -static int create_handle_cache(struct zs_pool *pool) +#ifdef CONFIG_COMPACTION +static int zs_register_migration(struct zs_pool *pool); +static void zs_unregister_migration(struct zs_pool *pool); +static void migrate_lock_init(struct zspage *zspage); +static void migrate_read_lock(struct zspage *zspage); +static void migrate_read_unlock(struct zspage *zspage); +static void kick_deferred_free(struct zs_pool *pool); +static void init_deferred_free(struct zs_pool *pool); +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +#else +static int zsmalloc_mount(void) { return 0; } +static void zsmalloc_unmount(void) {} +static int zs_register_migration(struct zs_pool *pool) { return 0; } +static void zs_unregister_migration(struct zs_pool *pool) {} +static void migrate_lock_init(struct zspage *zspage) {} +static void migrate_read_lock(struct zspage *zspage) {} +static void migrate_read_unlock(struct zspage *zspage) {} +static void kick_deferred_free(struct zs_pool *pool) {} +static void init_deferred_free(struct zs_pool *pool) {} +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +#endif + +static int create_cache(struct zs_pool *pool) { pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0, NULL); - return pool->handle_cachep ? 0 : 1; + if (!pool->handle_cachep) + return 1; + + pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), + 0, 0, NULL); + if (!pool->zspage_cachep) { + kmem_cache_destroy(pool->handle_cachep); + pool->handle_cachep = NULL; + return 1; + } + + return 0; } -static void destroy_handle_cache(struct zs_pool *pool) +static void destroy_cache(struct zs_pool *pool) { kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->zspage_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) +static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~__GFP_HIGHMEM); + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } -static void free_handle(struct zs_pool *pool, unsigned long handle) +static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); } +static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) +{ + return kmem_cache_alloc(pool->zspage_cachep, + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +}; + +static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +{ + kmem_cache_free(pool->zspage_cachep, zspage); +} + static void record_obj(unsigned long handle, unsigned long obj) { /* @@ -401,46 +465,79 @@ static struct zpool_driver zs_zpool_driver = { MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ -static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) -{ - return pages_per_zspage * PAGE_SIZE / size; -} - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static bool is_zspage_isolated(struct zspage *zspage) +{ + return zspage->isolated; +} + static int is_first_page(struct page *page) { return PagePrivate(page); } -static int is_last_page(struct page *page) +/* Protected by class->lock */ +static inline int get_zspage_inuse(struct zspage *zspage) +{ + return zspage->inuse; +} + +static inline void set_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse = val; +} + +static inline void mod_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse += val; +} + +static inline struct page *get_first_page(struct zspage *zspage) +{ + struct page *first_page = zspage->first_page; + + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + return first_page; +} + +static inline int get_first_obj_offset(struct page *page) +{ + return page->units; +} + +static inline void set_first_obj_offset(struct page *page, int offset) +{ + page->units = offset; +} + +static inline unsigned int get_freeobj(struct zspage *zspage) { - return PagePrivate2(page); + return zspage->freeobj; } -static void get_zspage_mapping(struct page *first_page, +static inline void set_freeobj(struct zspage *zspage, unsigned int obj) +{ + zspage->freeobj = obj; +} + +static void get_zspage_mapping(struct zspage *zspage, unsigned int *class_idx, enum fullness_group *fullness) { - unsigned long m; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + BUG_ON(zspage->magic != ZSPAGE_MAGIC); - m = (unsigned long)first_page->mapping; - *fullness = m & FULLNESS_MASK; - *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; + *fullness = zspage->fullness; + *class_idx = zspage->class; } -static void set_zspage_mapping(struct page *first_page, +static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) { - unsigned long m; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | - (fullness & FULLNESS_MASK); - first_page->mapping = (struct address_space *)m; + zspage->class = class_idx; + zspage->fullness = fullness; } /* @@ -464,23 +561,19 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] += cnt; + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] -= cnt; + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - if (type < NR_ZS_STAT_TYPE) - return class->stats.objs[type]; - return 0; + return class->stats.objs[type]; } #ifdef CONFIG_ZSMALLOC_STAT @@ -535,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) freeable = zs_can_compact(class); spin_unlock(&class->lock); - objs_per_zspage = get_maxobj_per_zspage(class->size, - class->pages_per_zspage); + objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; @@ -624,6 +716,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) } #endif + /* * For each size class, zspages are divided into different groups * depending on how "full" they are. This was done so that we could @@ -631,21 +724,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *first_page) +static enum fullness_group get_fullness_group(struct size_class *class, + struct zspage *zspage) { - int inuse, max_objects; + int inuse, objs_per_zspage; enum fullness_group fg; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - inuse = first_page->inuse; - max_objects = first_page->objects; + inuse = get_zspage_inuse(zspage); + objs_per_zspage = class->objs_per_zspage; if (inuse == 0) fg = ZS_EMPTY; - else if (inuse == max_objects) + else if (inuse == objs_per_zspage) fg = ZS_FULL; - else if (inuse <= 3 * max_objects / fullness_threshold_frac) + else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -660,32 +752,25 @@ static enum fullness_group get_fullness_group(struct page *first_page) * identified by <class, fullness_group>. */ static void insert_zspage(struct size_class *class, - enum fullness_group fullness, - struct page *first_page) + struct zspage *zspage, + enum fullness_group fullness) { - struct page **head; - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); - - head = &class->fullness_list[fullness]; - if (!*head) { - *head = first_page; - return; - } + struct zspage *head; + zs_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); /* - * We want to see more ZS_FULL pages and less almost - * empty/full. Put pages with higher ->inuse first. + * We want to see more ZS_FULL pages and less almost empty/full. + * Put pages with higher ->inuse first. */ - list_add_tail(&first_page->lru, &(*head)->lru); - if (first_page->inuse >= (*head)->inuse) - *head = first_page; + if (head) { + if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { + list_add(&zspage->list, &head->list); + return; + } + } + list_add(&zspage->list, &class->fullness_list[fullness]); } /* @@ -693,27 +778,14 @@ static void insert_zspage(struct size_class *class, * by <class, fullness_group>. */ static void remove_zspage(struct size_class *class, - enum fullness_group fullness, - struct page *first_page) + struct zspage *zspage, + enum fullness_group fullness) { - struct page **head; + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); + VM_BUG_ON(is_zspage_isolated(zspage)); - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - VM_BUG_ON_PAGE(!*head, first_page); - if (list_empty(&(*head)->lru)) - *head = NULL; - else if (*head == first_page) - *head = (struct page *)list_entry((*head)->lru.next, - struct page, lru); - - list_del_init(&first_page->lru); - zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + list_del_init(&zspage->list); + zs_stat_dec(class, fullness, 1); } /* @@ -726,19 +798,22 @@ static void remove_zspage(struct size_class *class, * fullness group. */ static enum fullness_group fix_fullness_group(struct size_class *class, - struct page *first_page) + struct zspage *zspage) { int class_idx; enum fullness_group currfg, newfg; - get_zspage_mapping(first_page, &class_idx, &currfg); - newfg = get_fullness_group(first_page); + get_zspage_mapping(zspage, &class_idx, &currfg); + newfg = get_fullness_group(class, zspage); if (newfg == currfg) goto out; - remove_zspage(class, currfg, first_page); - insert_zspage(class, newfg, first_page); - set_zspage_mapping(first_page, class_idx, newfg); + if (!is_zspage_isolated(zspage)) { + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); + } + + set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; @@ -780,64 +855,49 @@ static int get_pages_per_zspage(int class_size) return max_usedpc_order; } -/* - * A single 'zspage' is composed of many system pages which are - * linked together using fields in struct page. This function finds - * the first/head page, given any component page of a zspage. - */ -static struct page *get_first_page(struct page *page) +static struct zspage *get_zspage(struct page *page) { - if (is_first_page(page)) - return page; - else - return (struct page *)page_private(page); + struct zspage *zspage = (struct zspage *)page->private; + + BUG_ON(zspage->magic != ZSPAGE_MAGIC); + return zspage; } static struct page *get_next_page(struct page *page) { - struct page *next; + if (unlikely(PageHugeObject(page))) + return NULL; - if (is_last_page(page)) - next = NULL; - else if (is_first_page(page)) - next = (struct page *)page_private(page); - else - next = list_entry(page->lru.next, struct page, lru); + return page->freelist; +} - return next; +/** + * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @page: page object resides in zspage + * @obj_idx: object index + */ +static void obj_to_location(unsigned long obj, struct page **page, + unsigned int *obj_idx) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); } -/* - * Encode <page, obj_idx> as a single handle value. - * We use the least bit of handle for tagging. +/** + * location_to_obj - get obj value encoded from (<page>, <obj_idx>) + * @page: page object resides in zspage + * @obj_idx: object index */ -static void *location_to_obj(struct page *page, unsigned long obj_idx) +static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) { unsigned long obj; - if (!page) { - VM_BUG_ON(obj_idx); - return NULL; - } - obj = page_to_pfn(page) << OBJ_INDEX_BITS; - obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj |= obj_idx & OBJ_INDEX_MASK; obj <<= OBJ_TAG_BITS; - return (void *)obj; -} - -/* - * Decode <page, obj_idx> pair from the given object handle. We adjust the - * decoded obj_idx back to its original value since it was adjusted in - * location_to_obj(). - */ -static void obj_to_location(unsigned long obj, struct page **page, - unsigned long *obj_idx) -{ - obj >>= OBJ_TAG_BITS; - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); - *obj_idx = (obj & OBJ_INDEX_MASK); + return obj; } static unsigned long handle_to_obj(unsigned long handle) @@ -845,109 +905,146 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct size_class *class, struct page *page, - void *obj) +static unsigned long obj_to_head(struct page *page, void *obj) { - if (class->huge) { + if (unlikely(PageHugeObject(page))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page_private(page); + return page->index; } else return *(unsigned long *)obj; } -static unsigned long obj_idx_to_offset(struct page *page, - unsigned long obj_idx, int class_size) +static inline int testpin_tag(unsigned long handle) { - unsigned long off = 0; - - if (!is_first_page(page)) - off = page->index; - - return off + obj_idx * class_size; + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); } static inline int trypin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void pin_tag(unsigned long handle) { - while (!trypin_tag(handle)); + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void unpin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - clear_bit_unlock(HANDLE_PIN_BIT, ptr); + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void reset_page(struct page *page) { - clear_bit(PG_private, &page->flags); - clear_bit(PG_private_2, &page->flags); + __ClearPageMovable(page); + ClearPagePrivate(page); + ClearPagePrivate2(page); set_page_private(page, 0); - page->mapping = NULL; - page->freelist = NULL; page_mapcount_reset(page); + ClearPageHugeObject(page); + page->freelist = NULL; } -static void free_zspage(struct page *first_page) +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. + */ +void lock_zspage(struct zspage *zspage) { - struct page *nextp, *tmp, *head_extra; + struct page *page = get_first_page(zspage); - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - VM_BUG_ON_PAGE(first_page->inuse, first_page); + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); +} - head_extra = (struct page *)page_private(first_page); +int trylock_zspage(struct zspage *zspage) +{ + struct page *cursor, *fail; - reset_page(first_page); - __free_page(first_page); + for (cursor = get_first_page(zspage); cursor != NULL; cursor = + get_next_page(cursor)) { + if (!trylock_page(cursor)) { + fail = cursor; + goto unlock; + } + } - /* zspage with only 1 system page */ - if (!head_extra) - return; + return 1; +unlock: + for (cursor = get_first_page(zspage); cursor != fail; cursor = + get_next_page(cursor)) + unlock_page(cursor); - list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { - list_del(&nextp->lru); - reset_page(nextp); - __free_page(nextp); + return 0; +} + +static void __free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + struct page *page, *next; + enum fullness_group fg; + unsigned int class_idx; + + get_zspage_mapping(zspage, &class_idx, &fg); + + assert_spin_locked(&class->lock); + + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(fg != ZS_EMPTY); + + next = page = get_first_page(zspage); + do { + VM_BUG_ON_PAGE(!PageLocked(page), page); + next = get_next_page(page); + reset_page(page); + unlock_page(page); + dec_zone_page_state(page, NR_ZSPAGES); + put_page(page); + page = next; + } while (page != NULL); + + cache_free_zspage(pool, zspage); + + zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); +} + +static void free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; } - reset_page(head_extra); - __free_page(head_extra); + + remove_zspage(class, zspage, ZS_EMPTY); + __free_zspage(pool, class, zspage); } /* Initialize a newly allocated zspage */ -static void init_zspage(struct size_class *class, struct page *first_page) +static void init_zspage(struct size_class *class, struct zspage *zspage) { + unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = first_page; - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + struct page *page = get_first_page(zspage); while (page) { struct page *next_page; struct link_free *link; - unsigned int i = 1; void *vaddr; - /* - * page->index stores offset of first object starting - * in the page. For the first page, this is always 0, - * so we use first_page->index (aka ->freelist) to store - * head of corresponding zspage's freelist. - */ - if (page != first_page) - page->index = off; + set_first_obj_offset(page, off); vaddr = kmap_atomic(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = location_to_obj(page, i++); + link->next = freeobj++ << OBJ_TAG_BITS; link += class->size / sizeof(*link); } @@ -957,87 +1054,112 @@ static void init_zspage(struct size_class *class, struct page *first_page) * page (if present) */ next_page = get_next_page(page); - link->next = location_to_obj(next_page, 0); + if (next_page) { + link->next = freeobj++ << OBJ_TAG_BITS; + } else { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + link->next = -1 << OBJ_TAG_BITS; + } kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } + + set_freeobj(zspage, 0); } -/* - * Allocate a zspage for the given size class - */ -static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +static void create_page_chain(struct size_class *class, struct zspage *zspage, + struct page *pages[]) { - int i, error; - struct page *first_page = NULL, *uninitialized_var(prev_page); + int i; + struct page *page; + struct page *prev_page = NULL; + int nr_pages = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. first page->private = first sub-page - * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->private + * 1. all pages are linked together using page->freelist + * 2. each sub-page point to zspage using page->private * - * For each size class, First/Head pages are linked together using - * page->lru. Also, we set PG_private to identify the first page - * (i.e. no other sub-page has this flag set) and PG_private_2 to - * identify the last page. + * we set PG_private to identify the first page (i.e. no other sub-page + * has this flag set) and PG_private_2 to identify the last page. */ - error = -ENOMEM; - for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; - - page = alloc_page(flags); - if (!page) - goto cleanup; - - INIT_LIST_HEAD(&page->lru); - if (i == 0) { /* first page */ + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + set_page_private(page, (unsigned long)zspage); + page->freelist = NULL; + if (i == 0) { + zspage->first_page = page; SetPagePrivate(page); - set_page_private(page, 0); - first_page = page; - first_page->inuse = 0; + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) + SetPageHugeObject(page); + } else { + prev_page->freelist = page; } - if (i == 1) - set_page_private(first_page, (unsigned long)page); - if (i >= 1) - set_page_private(page, (unsigned long)first_page); - if (i >= 2) - list_add(&page->lru, &prev_page->lru); - if (i == class->pages_per_zspage - 1) /* last page */ + if (i == nr_pages - 1) SetPagePrivate2(page); prev_page = page; } +} - init_zspage(class, first_page); +/* + * Allocate a zspage for the given size class + */ +static struct zspage *alloc_zspage(struct zs_pool *pool, + struct size_class *class, + gfp_t gfp) +{ + int i; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zspage *zspage = cache_alloc_zspage(pool, gfp); + + if (!zspage) + return NULL; + + memset(zspage, 0, sizeof(struct zspage)); + zspage->magic = ZSPAGE_MAGIC; + migrate_lock_init(zspage); - first_page->freelist = location_to_obj(first_page, 0); - /* Maximum number of objects we can store in this zspage */ - first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; - error = 0; /* Success */ + page = alloc_page(gfp); + if (!page) { + while (--i >= 0) { + dec_zone_page_state(pages[i], NR_ZSPAGES); + __free_page(pages[i]); + } + cache_free_zspage(pool, zspage); + return NULL; + } -cleanup: - if (unlikely(error) && first_page) { - free_zspage(first_page); - first_page = NULL; + inc_zone_page_state(page, NR_ZSPAGES); + pages[i] = page; } - return first_page; + create_page_chain(class, zspage, pages); + init_zspage(class, zspage); + + return zspage; } -static struct page *find_get_zspage(struct size_class *class) +static struct zspage *find_get_zspage(struct size_class *class) { int i; - struct page *page; + struct zspage *zspage; - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) + for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + zspage = list_first_entry_or_null(&class->fullness_list[i], + struct zspage, list); + if (zspage) break; } - return page; + return zspage; } #ifdef CONFIG_PGTABLE_MAPPING @@ -1219,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void) cpu_notifier_register_done(); } -static void init_zs_size_classes(void) +static void __init init_zs_size_classes(void) { int nr; @@ -1230,23 +1352,19 @@ static void init_zs_size_classes(void) zs_size_classes = nr; } -static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +static bool can_merge(struct size_class *prev, int pages_per_zspage, + int objs_per_zspage) { - if (prev->pages_per_zspage != pages_per_zspage) - return false; + if (prev->pages_per_zspage == pages_per_zspage && + prev->objs_per_zspage == objs_per_zspage) + return true; - if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) - != get_maxobj_per_zspage(size, pages_per_zspage)) - return false; - - return true; + return false; } -static bool zspage_full(struct page *first_page) +static bool zspage_full(struct size_class *class, struct zspage *zspage) { - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - - return first_page->inuse == first_page->objects; + return get_zspage_inuse(zspage) == class->objs_per_zspage; } unsigned long zs_get_total_pages(struct zs_pool *pool) @@ -1272,8 +1390,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; @@ -1294,9 +1414,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + + /* migration cannot move any subpage in this zspage */ + migrate_read_lock(zspage); + + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = &get_cpu_var(zs_map_area); area->vm_mm = mm; @@ -1314,7 +1439,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (!class->huge) + if (likely(!PageHugeObject(page))) ret += ZS_HANDLE_SIZE; return ret; @@ -1323,8 +1448,10 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; @@ -1333,9 +1460,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1350,38 +1478,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + + migrate_read_unlock(zspage); unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); static unsigned long obj_malloc(struct size_class *class, - struct page *first_page, unsigned long handle) + struct zspage *zspage, unsigned long handle) { + int i, nr_page, offset; unsigned long obj; struct link_free *link; struct page *m_page; - unsigned long m_objidx, m_offset; + unsigned long m_offset; void *vaddr; handle |= OBJ_ALLOCATED_TAG; - obj = (unsigned long)first_page->freelist; - obj_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + obj = get_freeobj(zspage); + + offset = obj * class->size; + nr_page = offset >> PAGE_SHIFT; + m_offset = offset & ~PAGE_MASK; + m_page = get_first_page(zspage); + + for (i = 0; i < nr_page; i++) + m_page = get_next_page(m_page); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); - first_page->freelist = link->next; - if (!class->huge) + set_freeobj(zspage, link->next >> OBJ_TAG_BITS); + if (likely(!PageHugeObject(m_page))) /* record handle in the header of allocated chunk */ link->handle = handle; else - /* record handle in first_page->private */ - set_page_private(first_page, handle); + /* record handle to page->index */ + zspage->first_page->index = handle; + kunmap_atomic(vaddr); - first_page->inuse++; + mod_zspage_inuse(zspage, 1); zs_stat_inc(class, OBJ_USED, 1); + obj = location_to_obj(m_page, obj); + return obj; } @@ -1390,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class, * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from * @size: size of block to allocate + * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, * otherwise 0. @@ -1399,12 +1540,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; - struct page *first_page; + enum fullness_group newfg; + struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool, gfp); + handle = cache_alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,29 +1555,37 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { + zspage = find_get_zspage(class); + if (likely(zspage)) { + obj = obj_malloc(class, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); spin_unlock(&class->lock); - first_page = alloc_zspage(class, gfp); - if (unlikely(!first_page)) { - free_handle(pool, handle); - return 0; - } - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - atomic_long_add(class->pages_per_zspage, - &pool->pages_allocated); + return handle; + } - spin_lock(&class->lock); - zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); + spin_unlock(&class->lock); + + zspage = alloc_zspage(pool, class, gfp); + if (!zspage) { + cache_free_handle(pool, handle); + return 0; } - obj = obj_malloc(class, first_page, handle); - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(class, first_page); + spin_lock(&class->lock); + obj = obj_malloc(class, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); spin_unlock(&class->lock); return handle; @@ -1445,36 +1595,38 @@ EXPORT_SYMBOL_GPL(zs_malloc); static void obj_free(struct size_class *class, unsigned long obj) { struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; + struct zspage *zspage; + struct page *f_page; + unsigned long f_offset; + unsigned int f_objidx; void *vaddr; obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + f_offset = (class->size * f_objidx) & ~PAGE_MASK; + zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = first_page->freelist; - if (class->huge) - set_page_private(first_page, 0); + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; kunmap_atomic(vaddr); - first_page->freelist = (void *)obj; - first_page->inuse--; + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) { - struct page *first_page, *f_page; - unsigned long obj, f_objidx; + struct zspage *zspage; + struct page *f_page; + unsigned long obj; + unsigned int f_objidx; int class_idx; struct size_class *class; enum fullness_group fullness; + bool isolated; if (unlikely(!handle)) return; @@ -1482,25 +1634,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle) pin_tag(handle); obj = handle_to_obj(handle); obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); + zspage = get_zspage(f_page); + + migrate_read_lock(zspage); - get_zspage_mapping(first_page, &class_idx, &fullness); + get_zspage_mapping(zspage, &class_idx, &fullness); class = pool->size_class[class_idx]; spin_lock(&class->lock); obj_free(class, obj); - fullness = fix_fullness_group(class, first_page); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); - free_zspage(first_page); + fullness = fix_fullness_group(class, zspage); + if (fullness != ZS_EMPTY) { + migrate_read_unlock(zspage); + goto out; } + + isolated = is_zspage_isolated(zspage); + migrate_read_unlock(zspage); + /* If zspage is isolated, zs_page_putback will free the zspage */ + if (likely(!isolated)) + free_zspage(pool, class, zspage); +out: + spin_unlock(&class->lock); unpin_tag(handle); - - free_handle(pool, handle); + cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1508,7 +1666,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { struct page *s_page, *d_page; - unsigned long s_objidx, d_objidx; + unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; int s_size, d_size, size; @@ -1519,8 +1677,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); - s_off = obj_idx_to_offset(s_page, s_objidx, class->size); - d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + s_off = (class->size * s_objidx) & ~PAGE_MASK; + d_off = (class->size * d_objidx) & ~PAGE_MASK; if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; @@ -1572,19 +1730,19 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. */ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int index) + struct page *page, int *obj_idx) { unsigned long head; int offset = 0; + int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); - if (!is_first_page(page)) - offset = page->index; + offset = get_first_obj_offset(page); offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(class, page, addr + offset); + head = obj_to_head(page, addr + offset); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; if (trypin_tag(handle)) @@ -1597,18 +1755,21 @@ static unsigned long find_alloced_obj(struct size_class *class, } kunmap_atomic(addr); + + *obj_idx = index; + return handle; } struct zs_compact_control { - /* Source page for migration which could be a subpage of zspage. */ + /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; /* Destination page for migration which should be a first page * of zspage. */ struct page *d_page; /* Starting object index within @s_page which used for live object * in the subpage. */ - int index; + int obj_idx; }; static int migrate_zspage(struct zs_pool *pool, struct size_class *class, @@ -1618,30 +1779,30 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, unsigned long handle; struct page *s_page = cc->s_page; struct page *d_page = cc->d_page; - unsigned long index = cc->index; + int obj_idx = cc->obj_idx; int ret = 0; while (1) { - handle = find_alloced_obj(class, s_page, index); + handle = find_alloced_obj(class, s_page, &obj_idx); if (!handle) { s_page = get_next_page(s_page); if (!s_page) break; - index = 0; + obj_idx = 0; continue; } /* Stop if there is no more space */ - if (zspage_full(d_page)) { + if (zspage_full(class, get_zspage(d_page))) { unpin_tag(handle); ret = -ENOMEM; break; } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, d_page, handle); + free_obj = obj_malloc(class, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); - index++; + obj_idx++; /* * record_obj updates handle's value to free_obj and it will * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which @@ -1656,73 +1817,426 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Remember last position in this iteration */ cc->s_page = s_page; - cc->index = index; + cc->obj_idx = obj_idx; return ret; } -static struct page *isolate_target_page(struct size_class *class) +static struct zspage *isolate_zspage(struct size_class *class, bool source) { int i; - struct page *page; + struct zspage *zspage; + enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) { - remove_zspage(class, i, page); - break; + if (!source) { + fg[0] = ZS_ALMOST_FULL; + fg[1] = ZS_ALMOST_EMPTY; + } + + for (i = 0; i < 2; i++) { + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { + VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; } } - return page; + return zspage; } /* - * putback_zspage - add @first_page into right class's fullness list - * @pool: target pool + * putback_zspage - add @zspage into right class's fullness list * @class: destination class - * @first_page: target page + * @zspage: target page * - * Return @fist_page's fullness_group + * Return @zspage's fullness_group */ -static enum fullness_group putback_zspage(struct zs_pool *pool, - struct size_class *class, - struct page *first_page) +static enum fullness_group putback_zspage(struct size_class *class, + struct zspage *zspage) { enum fullness_group fullness; - fullness = get_fullness_group(first_page); - insert_zspage(class, fullness, first_page); - set_zspage_mapping(first_page, class->index, fullness); + VM_BUG_ON(is_zspage_isolated(zspage)); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); + + return fullness; +} + +#ifdef CONFIG_COMPACTION +static struct dentry *zs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; - free_zspage(first_page); + return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); +} + +static struct file_system_type zsmalloc_fs = { + .name = "zsmalloc", + .mount = zs_mount, + .kill_sb = kill_anon_super, +}; + +static int zsmalloc_mount(void) +{ + int ret = 0; + + zsmalloc_mnt = kern_mount(&zsmalloc_fs); + if (IS_ERR(zsmalloc_mnt)) + ret = PTR_ERR(zsmalloc_mnt); + + return ret; +} + +static void zsmalloc_unmount(void) +{ + kern_unmount(zsmalloc_mnt); +} + +static void migrate_lock_init(struct zspage *zspage) +{ + rwlock_init(&zspage->lock); +} + +static void migrate_read_lock(struct zspage *zspage) +{ + read_lock(&zspage->lock); +} + +static void migrate_read_unlock(struct zspage *zspage) +{ + read_unlock(&zspage->lock); +} + +static void migrate_write_lock(struct zspage *zspage) +{ + write_lock(&zspage->lock); +} + +static void migrate_write_unlock(struct zspage *zspage) +{ + write_unlock(&zspage->lock); +} + +/* Number of isolated subpage for *page migration* in this zspage */ +static void inc_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated++; +} + +static void dec_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated--; +} + +static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) +{ + struct page *page; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + int idx = 0; + + page = get_first_page(zspage); + do { + if (page == oldpage) + pages[idx] = newpage; + else + pages[idx] = page; + idx++; + } while ((page = get_next_page(page)) != NULL); + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + if (unlikely(PageHugeObject(oldpage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, page_mapping(oldpage)); +} + +bool zs_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct address_space *mapping; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); + + /* + * Without class lock, fullness could be stale while class_idx is okay + * because class_idx is constant unless page is freed so we should get + * fullness again under class lock. + */ + get_zspage_mapping(zspage, &class_idx, &fullness); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + if (get_zspage_inuse(zspage) == 0) { + spin_unlock(&class->lock); + return false; } - return fullness; + /* zspage is isolated for object migration */ + if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + spin_unlock(&class->lock); + return false; + } + + /* + * If this is first time isolation for the zspage, isolate zspage from + * size_class to prevent further object allocation from the zspage. + */ + if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + get_zspage_mapping(zspage, &class_idx, &fullness); + remove_zspage(class, zspage, fullness); + } + + inc_zspage_isolation(zspage); + spin_unlock(&class->lock); + + return true; +} + +int zs_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; + int offset, pos; + unsigned long handle, head; + unsigned long old_obj, new_obj; + unsigned int obj_idx; + int ret = -EAGAIN; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + + /* Concurrent compactor cannot migrate any subpage in zspage */ + migrate_write_lock(zspage); + get_zspage_mapping(zspage, &class_idx, &fullness); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + offset = get_first_obj_offset(page); + + spin_lock(&class->lock); + if (!get_zspage_inuse(zspage)) { + ret = -EBUSY; + goto unlock_class; + } + + pos = offset; + s_addr = kmap_atomic(page); + while (pos < PAGE_SIZE) { + head = obj_to_head(page, s_addr + pos); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!trypin_tag(handle)) + goto unpin_objects; + } + pos += class->size; + } + + /* + * Here, any user cannot access all objects in the zspage so let's move. + */ + d_addr = kmap_atomic(newpage); + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); + new_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, new_obj); + } + } + + replace_sub_page(class, zspage, newpage, page); + get_page(newpage); + + dec_zspage_isolation(zspage); + + /* + * Page migration is done so let's putback isolated zspage to + * the list if @page is final isolated subpage in the zspage. + */ + if (!is_zspage_isolated(zspage)) + putback_zspage(class, zspage); + + reset_page(page); + put_page(page); + page = newpage; + + ret = MIGRATEPAGE_SUCCESS; +unpin_objects: + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + unpin_tag(handle); + } + } + kunmap_atomic(s_addr); +unlock_class: + spin_unlock(&class->lock); + migrate_write_unlock(zspage); + + return ret; } -static struct page *isolate_source_page(struct size_class *class) +void zs_page_putback(struct page *page) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fg; + struct address_space *mapping; + struct zspage *zspage; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + dec_zspage_isolation(zspage); + if (!is_zspage_isolated(zspage)) { + fg = putback_zspage(class, zspage); + /* + * Due to page_lock, we cannot free zspage immediately + * so let's defer. + */ + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + } + spin_unlock(&class->lock); +} + +const struct address_space_operations zsmalloc_aops = { + .isolate_page = zs_page_isolate, + .migratepage = zs_page_migrate, + .putback_page = zs_page_putback, +}; + +static int zs_register_migration(struct zs_pool *pool) +{ + pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &zsmalloc_aops; + return 0; +} + +static void zs_unregister_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); + iput(pool->inode); +} + +/* + * Caller should hold page_lock of all pages in the zspage + * In here, we cannot use zspage meta data. + */ +static void async_free_zspage(struct work_struct *work) { int i; - struct page *page = NULL; + struct size_class *class; + unsigned int class_idx; + enum fullness_group fullness; + struct zspage *zspage, *tmp; + LIST_HEAD(free_pages); + struct zs_pool *pool = container_of(work, struct zs_pool, + free_work); - for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { - page = class->fullness_list[i]; - if (!page) + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + if (class->index != i) continue; - remove_zspage(class, i, page); - break; + spin_lock(&class->lock); + list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); + spin_unlock(&class->lock); + } + + + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); + + get_zspage_mapping(zspage, &class_idx, &fullness); + VM_BUG_ON(fullness != ZS_EMPTY); + class = pool->size_class[class_idx]; + spin_lock(&class->lock); + __free_zspage(pool, pool->size_class[class_idx], zspage); + spin_unlock(&class->lock); } +}; + +static void kick_deferred_free(struct zs_pool *pool) +{ + schedule_work(&pool->free_work); +} + +static void init_deferred_free(struct zs_pool *pool) +{ + INIT_WORK(&pool->free_work, async_free_zspage); +} + +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); - return page; + do { + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); } +#endif /* * @@ -1739,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class) return 0; obj_wasted = obj_allocated - obj_used; - obj_wasted /= get_maxobj_per_zspage(class->size, - class->pages_per_zspage); + obj_wasted /= class->objs_per_zspage; return obj_wasted * class->pages_per_zspage; } @@ -1748,20 +2261,20 @@ static unsigned long zs_can_compact(struct size_class *class) static void __zs_compact(struct zs_pool *pool, struct size_class *class) { struct zs_compact_control cc; - struct page *src_page; - struct page *dst_page = NULL; + struct zspage *src_zspage; + struct zspage *dst_zspage = NULL; spin_lock(&class->lock); - while ((src_page = isolate_source_page(class))) { + while ((src_zspage = isolate_zspage(class, true))) { if (!zs_can_compact(class)) break; - cc.index = 0; - cc.s_page = src_page; + cc.obj_idx = 0; + cc.s_page = get_first_page(src_zspage); - while ((dst_page = isolate_target_page(class))) { - cc.d_page = dst_page; + while ((dst_zspage = isolate_zspage(class, false))) { + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched * and see if anyone had allocated another zspage. @@ -1769,23 +2282,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) if (!migrate_zspage(pool, class, &cc)) break; - putback_zspage(pool, class, dst_page); + putback_zspage(class, dst_zspage); } /* Stop if we couldn't find slot */ - if (dst_page == NULL) + if (dst_zspage == NULL) break; - putback_zspage(pool, class, dst_page); - if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + putback_zspage(class, dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + free_zspage(pool, class, src_zspage); pool->stats.pages_compacted += class->pages_per_zspage; + } spin_unlock(&class->lock); cond_resched(); spin_lock(&class->lock); } - if (src_page) - putback_zspage(pool, class, src_page); + if (src_zspage) + putback_zspage(class, src_zspage); spin_unlock(&class->lock); } @@ -1874,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool) /** * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata + * @name: pool name to be created * * This function must be called before anything when using * the zsmalloc allocator. @@ -1892,6 +2407,7 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool) return NULL; + init_deferred_free(pool); pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { @@ -1903,7 +2419,7 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; - if (create_handle_cache(pool)) + if (create_cache(pool)) goto err; /* @@ -1913,12 +2429,15 @@ struct zs_pool *zs_create_pool(const char *name) for (i = zs_size_classes - 1; i >= 0; i--) { int size; int pages_per_zspage; + int objs_per_zspage; struct size_class *class; + int fullness = 0; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; pages_per_zspage = get_pages_per_zspage(size); + objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* * size_class is used for normal zsmalloc operation such @@ -1930,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name) * previous size_class if possible. */ if (prev_class) { - if (can_merge(prev_class, size, pages_per_zspage)) { + if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { pool->size_class[i] = prev_class; continue; } @@ -1943,11 +2462,12 @@ struct zs_pool *zs_create_pool(const char *name) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; - if (pages_per_zspage == 1 && - get_maxobj_per_zspage(size, pages_per_zspage) == 1) - class->huge = true; + class->objs_per_zspage = objs_per_zspage; spin_lock_init(&class->lock); pool->size_class[i] = class; + for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; + fullness++) + INIT_LIST_HEAD(&class->fullness_list[fullness]); prev_class = class; } @@ -1955,6 +2475,9 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); + if (zs_register_migration(pool)) + goto err; + /* * Not critical, we still can use the pool * and user can trigger compaction manually. @@ -1974,6 +2497,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); + zs_unregister_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < zs_size_classes; i++) { @@ -1986,8 +2510,8 @@ void zs_destroy_pool(struct zs_pool *pool) if (class->index != i) continue; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { + for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { + if (!list_empty(&class->fullness_list[fg])) { pr_info("Freeing non-empty class with size %db, fullness group %d\n", class->size, fg); } @@ -1995,7 +2519,7 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } - destroy_handle_cache(pool); + destroy_cache(pool); kfree(pool->size_class); kfree(pool->name); kfree(pool); @@ -2004,7 +2528,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { - int ret = zs_register_cpu_notifier(); + int ret; + + ret = zsmalloc_mount(); + if (ret) + goto out; + + ret = zs_register_cpu_notifier(); if (ret) goto notifier_fail; @@ -2021,7 +2551,8 @@ static int __init zs_init(void) notifier_fail: zs_unregister_cpu_notifier(); - + zsmalloc_unmount(); +out: return ret; } @@ -2030,6 +2561,7 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif + zsmalloc_unmount(); zs_unregister_cpu_notifier(); zs_stat_exit(); |