Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig          |  21
-rw-r--r-- | mm/compaction.c     | 158
-rw-r--r-- | mm/filemap.c        |   2
-rw-r--r-- | mm/highmem.c        |  12
-rw-r--r-- | mm/huge_memory.c    |  11
-rw-r--r-- | mm/hugetlb.c        |  37
-rw-r--r-- | mm/internal.h       |   9
-rw-r--r-- | mm/memcontrol.c     |  38
-rw-r--r-- | mm/memory_hotplug.c |  21
-rw-r--r-- | mm/mempolicy.c      |  63
-rw-r--r-- | mm/migrate.c        |   6
-rw-r--r-- | mm/mmap.c           |   5
-rw-r--r-- | mm/mmzone.c         |   2
-rw-r--r-- | mm/mremap.c         |  47
-rw-r--r-- | mm/oom_kill.c       | 112
-rw-r--r-- | mm/page-writeback.c |   8
-rw-r--r-- | mm/page_alloc.c     | 885
-rw-r--r-- | mm/page_isolation.c |  10
-rw-r--r-- | mm/page_owner.c     |   5
-rw-r--r-- | mm/rmap.c           |   4
-rw-r--r-- | mm/shmem.c          | 130
-rw-r--r-- | mm/slab.c           | 771
-rw-r--r-- | mm/slub.c           |  16
-rw-r--r-- | mm/swap_state.c     |   3
-rw-r--r-- | mm/util.c           |  23
-rw-r--r-- | mm/vmscan.c         |  27
-rw-r--r-- | mm/vmstat.c         | 112
27 files changed, 1556 insertions, 982 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 989f8f3d77e0..b0432b71137d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE def_bool y depends on SPARSEMEM && MEMORY_HOTPLUG +config MEMORY_HOTPLUG_DEFAULT_ONLINE + bool "Online the newly added memory blocks by default" + default n + depends on MEMORY_HOTPLUG + help + This option sets the default policy setting for memory hotplug + onlining policy (/sys/devices/system/memory/auto_online_blocks) which + determines what happens to newly added memory regions. Policy setting + can always be changed at runtime. + See Documentation/memory-hotplug.txt for more information. + + Say Y here if you want all hot-plugged memory blocks to appear in + 'online' state by default. + Say N here if you want the default policy to keep all hot-plugged + memory blocks in 'offline' state. + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select MEMORY_ISOLATION @@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -config ZONE_DMA_FLAG - int - default "0" if !ZONE_DMA - default "1" - config BOUNCE bool "Enable bounce buffers" default y diff --git a/mm/compaction.c b/mm/compaction.c index 8fa254043801..eda3c2244f30 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) +#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) +#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone) zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = - round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); + pageblock_start_pfn(zone_end_pfn(zone) - 1); } /* @@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc, LIST_HEAD(freelist); pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn += isolated, block_start_pfn = block_end_pfn, @@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. 
*/ if (pfn >= block_end_pfn) { - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); } @@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; - struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; /* * Ensure that there are not too many pages isolated from the LRU @@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (compact_should_abort(cc)) return 0; + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { bool is_lru; + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. 
Abort async compaction @@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, break; if (!pfn_valid_within(low_pfn)) - continue; + goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (likely(comp_order < MAX_ORDER)) low_pfn += (1UL << comp_order) - 1; - continue; + goto isolate_fail; } if (!is_lru) - continue; + goto isolate_fail; /* * Migration will fail if an anonymous page is pinned in memory, @@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!page_mapping(page) && page_count(page) > page_mapcount(page)) - continue; + goto isolate_fail; /* If we already hold the lock, we can skip some rechecking */ if (!locked) { @@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) - continue; + goto isolate_fail; /* * Page become compound since the non-locked check, @@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page))) { low_pfn += (1UL << compound_order(page)) - 1; - continue; + goto isolate_fail; } } @@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) - continue; + goto isolate_fail; VM_BUG_ON_PAGE(PageCompound(page), page); @@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - list_add(&page->lru, migratelist); + list_add(&page->lru, &cc->migratepages); cc->nr_migratepages++; nr_isolated++; + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that was isolated and likely be + * then freed by migration. + */ + if (!cc->last_migrated_pfn) + cc->last_migrated_pfn = low_pfn; + /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; break; } + + continue; +isolate_fail: + if (!skip_on_failure) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } + acct_isolated(zone, cc); + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + cc->last_migrated_pfn = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } } /* @@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, /* Scan block by block. 
First and last block may be incomplete */ pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn = block_end_pfn, block_start_pfn = block_end_pfn, @@ -924,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = pageblock_start_pfn(cc->free_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); - low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + low_pfn = pageblock_end_pfn(cc->migrate_pfn); /* * Isolate free pages until enough are available to migrate the @@ -1070,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long block_start_pfn; unsigned long block_end_pfn; unsigned long low_pfn; - unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1081,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; - block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(low_pfn); /* * Iterate over whole pageblocks until we find the first suitable. @@ -1125,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ - isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); @@ -1135,15 +1206,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (cc->nr_migratepages && !cc->last_migrated_pfn) - cc->last_migrated_pfn = isolate_start_pfn; - - /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. 
@@ -1251,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_CONTINUE - If compaction should run now */ static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { int fragindex; unsigned long watermark; @@ -1296,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, } unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { unsigned long ret; @@ -1343,7 +1407,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { @@ -1398,6 +1462,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = COMPACT_CONTENDED; goto out; } + /* + * We failed to migrate at least one page in the current + * order-aligned block, so skip the rest of it. + */ + if (cc->direct_compaction && + (cc->mode == MIGRATE_ASYNC)) { + cc->migrate_pfn = block_end_pfn( + cc->migrate_pfn - 1, cc->order); + /* Draining pcplists is useless in this case */ + cc->last_migrated_pfn = 0; + + } } check_drain: @@ -1411,7 +1487,7 @@ check_drain: if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = - cc->migrate_pfn & ~((1UL << cc->order) - 1); + block_start_pfn(cc->migrate_pfn, cc->order); if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); @@ -1436,7 +1512,7 @@ out: cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ - free_pfn &= ~(pageblock_nr_pages-1); + free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() @@ -1456,7 +1532,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1497,8 +1573,8 @@ int sysctl_extfrag_threshold = 500; * This is the main entry point for direct page compaction. 
*/ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; @@ -1526,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, - ac->classzone_idx); + ac_classzone_idx(ac)); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1536,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/filemap.c b/mm/filemap.c index 182b21825255..01690338e3d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) * some other bad page check should catch it later. */ page_mapcount_reset(page); - atomic_sub(mapcount, &page->_count); + page_ref_sub(page, mapcount); } } diff --git a/mm/highmem.c b/mm/highmem.c index 123bcd3ed4f2..50b4ca6787f0 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); unsigned int nr_free_highpages (void) { - pg_data_t *pgdat; + struct zone *zone; unsigned int pages = 0; - for_each_online_pgdat(pgdat) { - pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); - if (zone_movable_is_highmem()) - pages += zone_page_state( - &pgdat->node_zones[ZONE_MOVABLE], - NR_FREE_PAGES); + for_each_populated_zone(zone) { + if (is_highmem(zone)) + pages += zone_page_state(zone, NR_FREE_PAGES); } return pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee126d4d1..66675eed67be 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* @@ -3113,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. 
If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3337,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..949d80609a32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } } - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on @@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else @@ -937,9 +940,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; @@ -1030,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); } -static bool pfn_range_valid_gigantic(unsigned long start_pfn, - unsigned long nr_pages) +static bool pfn_range_valid_gigantic(struct zone *z, + unsigned long start_pfn, unsigned long nr_pages) { unsigned long i, end_pfn = start_pfn + nr_pages; struct page *page; @@ -1042,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn, page = pfn_to_page(i); + if (page_zone(page) != z) + return false; + if (PageReserved(page)) return false; @@ -1074,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) pfn = ALIGN(z->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(pfn, nr_pages)) { + if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -2659,6 +2663,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... 
option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2678,8 +2687,8 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_MEMORY]); + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -2691,11 +2700,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; diff --git a/mm/internal.h b/mm/internal.h index b79abb6721cf..3ac544f1963f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) @@ -102,13 +102,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; + struct zoneref *preferred_zoneref; int migratetype; enum zone_type high_zoneidx; bool spread_dirty_pages; }; +#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -175,7 +176,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5c41bd..d71d387868e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1023,22 +1023,40 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. + * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; + + __update_lru_size(lruvec, lru, nr_pages); if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) @@ -1257,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } @@ -1389,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* - * We call this when we hit limit, not when pages are added to LRU. - * No LRU may hold pages because all pages are UNEVICTABLE or - * memcg is too small and all pages are not on LRU. In that case, - * we use curret node. + * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages + * last time it really checked all the LRUs due to rate limiting. + * Fallback to the current node in that case for simplicity. */ if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431c3f31..caf2a14c37ad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -78,9 +78,24 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; +#else +bool memhp_auto_online = true; +#endif EXPORT_SYMBOL_GPL(memhp_auto_online); +static int __init setup_memhp_default_state(char *str) +{ + if (!strcmp(str, "online")) + memhp_auto_online = true; + else if (!strcmp(str, "offline")) + memhp_auto_online = false; + + return 1; +} +__setup("memhp_default_state=", setup_memhp_default_state); + void get_online_mems(void) { might_sleep(); @@ -1410,7 +1425,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. 
*/ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1433,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..297d6854f849 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> -#include <linux/random.h> #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? z->zone->node : node; } default: @@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void) } } -/* Do static interleaving for a VMA with known offset. */ +/* + * Do static interleaving for a VMA with known offset @n. Returns the n'th + * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the + * number of present nodes. + */ static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) + struct vm_area_struct *vma, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; - int c; - int nid = NUMA_NO_NODE; + int i; + int nid; if (!nnodes) return numa_node_id(); - target = (unsigned int)off % nnodes; - c = 0; - do { + target = (unsigned int)n % nnodes; + nid = first_node(pol->v.nodes); + for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); - c++; - } while (c <= target); return nid; } @@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. 
- * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) @@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. * use current page if in policy nodemask, @@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/migrate.c b/mm/migrate.c index f9dfb18a4eba..53ab6398e7a2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); return MIGRATEPAGE_SUCCESS; } @@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { @@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/mmap.c b/mm/mmap.c index bd2e1a533bc1..fba246b8f1a5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -55,10 +55,6 @@ #define arch_mmap_check(addr, len, flags) (0) #endif -#ifndef arch_rebalance_pgtables -#define arch_rebalance_pgtables(addr, len) (addr) -#endif - #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; @@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (offset_in_page(addr)) return -EINVAL; - addr = arch_rebalance_pgtables(addr, len); error = security_mmap_addr(addr); return error ? 
error : addr; } diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb4de6f..5652be858e5e 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467df66..9dc499977924 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } +static void take_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); +} + +static void drop_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); +} + static pte_t move_soft_dirty_pte(pte_t pte) { /* @@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) { - struct address_space *mapping = NULL; - struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ - if (need_rmap_locks) { - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - } - if (vma->anon_vma) { - anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); - } - } + if (need_rmap_locks) + take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst @@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); - if (anon_vma) - anon_vma_unlock_write(anon_vma); - if (mapping) - i_mmap_unlock_write(mapping); + if (need_rmap_locks) + drop_rmap_locks(vma); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; - VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, - vma); /* See comment in move_ptes() */ if (need_rmap_locks) - anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + take_rmap_locks(vma); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) - anon_vma_unlock_write(vma->anon_vma); + drop_rmap_locks(vma); if (moved) { need_flush = true; continue; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 86349586eacb..415f7eb913fa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. 
+ */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -491,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore. OOM killer can continue - * by selecting other victim if unmapping hasn't led to any - * improvements. This also means that selecting this task doesn't - * make any sense. + * This task can be safely ignored because we cannot do much more + * to release its memory. */ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; - exit_oom_victim(tsk); out: mmput(mm); return ret; @@ -519,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); } + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore or it is not a good candidate + * for the oom victim right now because it cannot release its memory + * itself nor by the oom reaper. + */ + tsk->oom_reaper_list = NULL; + exit_oom_victim(tsk); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } @@ -563,6 +587,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. + */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -653,24 +724,6 @@ void oom_killer_enable(void) } /* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - -/* * Must be called while holding a reference to p, which will be released upon * returning. 
*/ @@ -694,6 +747,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,10 +927,20 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. + */ + if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + return true; + + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bc5149d5ec38..3b88795ab46e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; + int i; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &NODE_DATA(node)->node_zones[i]; - x += zone_dirtyable_memory(z); + if (is_highmem(z)) + x += zone_dirtyable_memory(z); + } } /* * Unreclaimable memory (kernel memory or anonymous memory diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..5c469c1dfb8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int 
get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -784,17 +884,42 @@ out: zone->free_area[order].nr_free++; } -static inline int free_pages_check(struct page *page) +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary. + */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + (unsigned long)page->mem_cgroup | +#endif + (page->flags & check_flags))) + return false; + + return true; +} + +static void free_pages_check_bad(struct page *page) +{ + const char *bad_reason; + unsigned long bad_flags; + + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -803,16 +928,146 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; + bad_page(page, bad_reason, bad_flags); +} + +static inline int free_pages_check(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) + return 0; + + /* Something has gone sideways, find it */ + free_pages_check_bad(page); + return 1; +} + +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. 
+ */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +static __always_inline bool free_pages_prepare(struct page *page, + unsigned int order, bool check_free) +{ + int bad = 0; + + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, order); + kmemcheck_free_shadow(page, order); + kasan_free_pages(page, order); + + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. + */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + } + } + if (PageAnonHead(page)) + page->mapping = NULL; + if (check_free) + bad += free_pages_check(page); + if (bad) + return false; + page_cpupid_reset_last(page); - if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); + + return true; } +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, true); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, false); +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_pages_check(page); +} +#endif /* CONFIG_DEBUG_VM */ + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -829,15 +1084,16 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int to_free = count; unsigned long nr_scanned; + bool isolated_pageblocks; spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - while (to_free) { + while (count) { struct page *page; struct list_head *list; @@ -857,7 +1113,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* This is the only non-empty list. Free them all. 
*/ if (batch_free == MIGRATE_PCPTYPES) - batch_free = to_free; + batch_free = count; do { int mt; /* migratetype of the to-be-freed page */ @@ -870,12 +1126,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ - if (unlikely(has_isolate_pageblock(zone))) + if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); + if (bulkfree_pcp_prepare(page)) + continue; + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - } while (--to_free && --batch_free && !list_empty(list)); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -899,56 +1158,6 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } -static int free_tail_pages_check(struct page *head_page, struct page *page) -{ - int ret = 1; - - /* - * We rely page->lru.next never has bit 0 set, unless the page - * is PageTail(). Let's make sure that's true even for poisoned ->lru. - */ - BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) { - ret = 0; - goto out; - } - switch (page - head_page) { - case 1: - /* the first tail page: ->mapping is compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); - goto out; - } - break; - case 2: - /* - * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. - */ - break; - default: - if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); - goto out; - } - break; - } - if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); - goto out; - } - if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); - goto out; - } - ret = 0; -out: - page->mapping = NULL; - clear_compound_head(page); - return ret; -} - static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { @@ -1022,51 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) } } -static bool free_pages_prepare(struct page *page, unsigned int order) -{ - bool compound = PageCompound(page); - int i, bad = 0; - - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - - trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); - - if (PageAnon(page)) - page->mapping = NULL; - bad += free_pages_check(page); - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); - } - if (bad) - return false; - - reset_page_owner(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE << order); - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } - arch_free_page(page, order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); - - return true; -} - static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order)) + if (!free_pages_prepare(page, order, true)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1076,8 +1247,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) 
local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1154,7 +1324,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, pfn, order); + return __free_pages_boot_core(page, order); } /* @@ -1239,12 +1409,12 @@ static void __init deferred_free_range(struct page *page, if (nr_pages == MAX_ORDER_NR_PAGES && (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pfn, MAX_ORDER-1); + __free_pages_boot_core(page, MAX_ORDER-1); return; } - for (i = 0; i < nr_pages; i++, page++, pfn++) - __free_pages_boot_core(page, pfn, 0); + for (i = 0; i < nr_pages; i++, page++) + __free_pages_boot_core(page, 0); } /* Completion tracking for deferred_init_memmap() threads */ @@ -1477,10 +1647,7 @@ static inline void expand(struct zone *zone, struct page *page, } } -/* - * This page is about to be returned from the page allocator - */ -static inline int check_new_page(struct page *page) +static void check_new_page_bad(struct page *page) { const char *bad_reason = NULL; unsigned long bad_flags = 0; @@ -1503,11 +1670,20 @@ static inline int check_new_page(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - return 0; + bad_page(page, bad_reason, bad_flags); +} + +/* + * This page is about to be returned from the page allocator + */ +static inline int check_new_page(struct page *page) +{ + if (likely(page_expected_state(page, + PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) + return 0; + + check_new_page_bad(page); + return 1; } static inline bool free_pages_prezeroed(bool poisoned) @@ -1516,16 +1692,48 @@ static inline bool free_pages_prezeroed(bool poisoned) page_poisoning_enabled() && poisoned; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) +#ifdef CONFIG_DEBUG_VM +static bool check_pcp_refill(struct page *page) +{ + return false; +} + +static bool check_new_pcp(struct page *page) +{ + return check_new_page(page); +} +#else +static bool check_pcp_refill(struct page *page) +{ + return check_new_page(page); +} +static bool check_new_pcp(struct page *page) +{ + return false; +} +#endif /* CONFIG_DEBUG_VM */ + +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + + if (unlikely(check_new_page(p))) + return true; + } + + return false; +} + +static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { int i; bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; - if (unlikely(check_new_page(p))) - return 1; if (poisoned) poisoned &= page_is_poisoned(p); } @@ -1557,8 +1765,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_pfmemalloc(page); else clear_page_pfmemalloc(page); - - return 0; } /* @@ -1980,6 +2186,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + if (unlikely(check_pcp_refill(page))) + continue; + /* * Split buddy pages returned by expand() are received here * in 
physical page order. The page is added to the callers and @@ -2157,6 +2366,10 @@ void mark_free_pages(struct zone *zone) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } @@ -2187,7 +2400,7 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_pages_prepare(page, 0)) + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -2343,12 +2556,44 @@ int split_free_page(struct page *page) } /* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + +/* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2359,21 +2604,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct list_head *list; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } + do { + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + goto failed; + } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + } while (page && check_new_pcp(page)); + __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; } else { @@ -2384,22 +2632,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); 
spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) set_bit(ZONE_FAIR_DEPLETED, &zone->flags); @@ -2501,12 +2751,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, + unsigned long mark, int classzone_idx, + unsigned int alloc_flags, long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2569,12 +2820,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } +static inline bool zone_watermark_fast(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, unsigned int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + /* + * Fast check for order-0 only. If this fails then the reserves + * need to be calculated. There is a corner case where the check + * passes but only the high-order atomic reserve are free. If + * the caller is !atomic then it'll uselessly search the free + * list. That corner case is then slower but it is harmless. + */ + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + return true; + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx) { @@ -2630,27 +2907,24 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zonelist *zonelist = ac->zonelist; - struct zoneref *z; - struct page *page = NULL; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - int nr_fair_skipped = 0; - bool zonelist_rescan; + bool fair_skipped = false; + bool apply_fair = (alloc_flags & ALLOC_FAIR); zonelist_scan: - zonelist_rescan = false; - /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 
*/ - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { + struct page *page; unsigned long mark; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual @@ -2658,13 +2932,16 @@ zonelist_scan: * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. */ - if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(ac->preferred_zone, zone)) - break; + if (apply_fair) { if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - nr_fair_skipped++; + fair_skipped = true; continue; } + if (!zone_local(ac->preferred_zoneref->zone, zone)) { + if (fair_skipped) + goto reset_fair; + apply_fair = false; + } } /* * When allocating a page cache page for writing, we @@ -2696,8 +2973,8 @@ zonelist_scan: continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) { + if (!zone_watermark_fast(zone, order, mark, + ac_classzone_idx(ac), alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2706,7 +2983,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2720,7 +2997,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) + ac_classzone_idx(ac), alloc_flags)) goto try_this_zone; continue; @@ -2728,11 +3005,10 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { - if (prep_new_page(page, order, gfp_mask, alloc_flags)) - goto try_this_zone; + prep_new_page(page, order, gfp_mask, alloc_flags); /* * If this is a high-order atomic allocation then check @@ -2753,18 +3029,13 @@ try_this_zone: * include remote zones now, before entering the slowpath and waking * kswapd: prefer spilling to a remote zone over swapping locally. */ - if (alloc_flags & ALLOC_FAIR) { - alloc_flags &= ~ALLOC_FAIR; - if (nr_fair_skipped) { - zonelist_rescan = true; - reset_alloc_batches(ac->preferred_zone); - } - if (nr_online_nodes > 1) - zonelist_rescan = true; - } - - if (zonelist_rescan) + if (fair_skipped) { +reset_fair: + apply_fair = false; + fair_skipped = false; + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; + } return NULL; } @@ -2872,22 +3143,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for IO-less reclaim */ - if (!(gfp_mask & __GFP_FS)) { - /* - * XXX: Page reclaim didn't yield anything, - * and the OOM killer can't be invoked, but - * keep looping as per tradition. - * - * But do not keep looping if oom_killer_disable() - * was already called, for the system is trying to - * enter a quiescent state during suspend. - */ - *did_some_progress = !oom_killer_disabled; - goto out; - } if (pm_suspended_storage()) goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. 
+ * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; @@ -2917,7 +3184,7 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -2973,7 +3240,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3013,7 +3280,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3049,13 +3316,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, ac_classzone_idx(ac)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3116,7 +3383,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; + unsigned int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; @@ -3153,17 +3420,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. - */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. 
*/ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3278,7 +3534,7 @@ retry: if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); goto retry; } @@ -3316,17 +3572,24 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; - struct page *page = NULL; + struct page *page; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_mask |= __GFP_HARDWALL; + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3350,49 +3613,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); - if (!ac.preferred_zone) - goto out; - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { + page = NULL; + goto no_zone; + } /* First allocation attempt */ - alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); - if (unlikely(!page)) { - /* - * Runtime PM, block IO and its error handling path - * can deadlock because I/O on the device might not - * complete. - */ - alloc_mask = memalloc_noio_flags(gfp_mask); - ac.spread_dirty_pages = false; - - page = __alloc_pages_slowpath(alloc_mask, order, &ac); - } + if (likely(page)) + goto out; - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); + /* + * Runtime PM, block IO and its error handling path can deadlock + * because I/O on the device might not complete. + */ + alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + /* + * Restore the original nodemask if it was potentially replaced with + * &cpuset_current_mems_allowed to optimize the fast-path attempt. + */ + if (cpusets_enabled()) + ac.nodemask = nodemask; + page = __alloc_pages_slowpath(alloc_mask, order, &ac); -out: +no_zone: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. 
*/ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { + alloc_mask = gfp_mask; goto retry_cpuset; + } + +out: + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; } @@ -3790,6 +4058,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) @@ -3798,12 +4068,19 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; - val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone->managed_pages; + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #else - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } @@ -4390,13 +4667,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif @@ -6725,98 +7001,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct zone *zone, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return zone->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** 
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages @@ -6864,7 +7048,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) @@ -7177,7 +7361,8 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be isolated before calling this. + * All pages in the range must be in a single zone and isolated + * before calling this. */ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..612122bf6a42 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } +/* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { @@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. 
*/ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/page_owner.c b/mm/page_owner.c index ac3d8d129974..792b56da13d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", @@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + /* * We are safe to check buddy flag and order, because * this is init stage and only single thread runs. diff --git a/mm/rmap.c b/mm/rmap.c index 307b555024ef..8a839935b18c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - BUG_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->degree); put_anon_vma(anon_vma); list_del(&avc->same_vma); @@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page, int nr = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - SetPageSwapBacked(page); + __SetPageSwapBacked(page); if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ diff --git a/mm/shmem.c b/mm/shmem.c index e684a9140228..e418a995427d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); static inline int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, int *fault_type) + struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. 
* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ @@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (partial_start) { struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + shmem_getpage(inode, start - 1, &page, SGP_READ); if (page) { unsigned int top = PAGE_SIZE; if (start > end) { @@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } if (partial_end) { struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ, NULL); + shmem_getpage(inode, end, &page, SGP_READ); if (page) { zero_user_segment(page, 0, partial_end); set_page_dirty(page); @@ -947,8 +947,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); SetPageUptodate(newpage); - SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap - * entry since a page cannot live in both the swap and page cache + * entry since a page cannot live in both the swap and page cache. + * + * fault_mm and fault_type are only supplied by shmem_fault: + * otherwise they are NULL. 
*/ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) + struct page **pagep, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; @@ -1155,7 +1146,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1183,14 +1174,19 @@ repeat: */ info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = fault_mm ? : current->mm; if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - /* here we actually do the io */ - if (fault_type) + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + } + /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); if (!page) { error = -ENOMEM; @@ -1217,7 +1213,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1275,13 +1271,10 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageSwapBacked(page); - __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (error) goto decused; @@ -1321,12 +1314,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1372,6 +1363,7 @@ unlock: static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); int error; int ret = VM_FAULT_LOCKED; @@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - - if (ret & VM_FAULT_MAJOR) { - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - } return ret; } @@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + return shmem_getpage(inode, index, pagep, SGP_WRITE); } static int @@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. 
*/ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp, NULL); + error = shmem_getpage(inode, index, &page, sgp); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) @@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, - SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC, - NULL); + error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, @@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + error = shmem_getpage(inode, 0, &page, SGP_WRITE); if (error) { iput(inode); return error; @@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); unlock_page(page); @@ -3496,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, int error; BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + gfp, NULL, NULL); if (error) page = ERR_PTR(error); else diff --git a/mm/slab.c b/mm/slab.c index 17e2848979c5..c11bf5007952 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list); +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list); static int slab_early_init = 1; #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) @@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = 
next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } @@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) @@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags) } #endif +static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) +{ + struct kmem_cache_node *n; + + /* + * Set up the kmem_cache_node for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + n = get_node(cachep, node); + if (n) { + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; + spin_unlock_irq(&n->list_lock); + + return 0; + } + + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) + return -ENOMEM; + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + n->free_limit = + (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; + + /* + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient + * protection here. + */ + cachep->node[node] = n; + + return 0; +} + /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node @@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags) */ static int init_cache_node_node(int node) { + int ret; struct kmem_cache *cachep; - struct kmem_cache_node *n; - const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (!n) { - n = kmalloc_node(memsize, GFP_KERNEL, node); - if (!n) - return -ENOMEM; - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex is sufficient - * protection here. 
- */ - cachep->node[node] = n; - } - - spin_lock_irq(&n->list_lock); - n->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + ret = init_cache_node(cachep, node, GFP_KERNEL); + if (ret) + return ret; } + return 0; } -static inline int slabs_tofree(struct kmem_cache *cachep, - struct kmem_cache_node *n) +static int setup_kmem_cache_node(struct kmem_cache *cachep, + int node, gfp_t gfp, bool force_change) { - return (n->free_objects + cachep->num - 1) / cachep->num; + int ret = -ENOMEM; + struct kmem_cache_node *n; + struct array_cache *old_shared = NULL; + struct array_cache *new_shared = NULL; + struct alien_cache **new_alien = NULL; + LIST_HEAD(list); + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); + if (!new_shared) + goto fail; + } + + ret = init_cache_node(cachep, node, gfp); + if (ret) + goto fail; + + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); + n->shared->avail = 0; + } + + if (!n->shared || force_change) { + old_shared = n->shared; + n->shared = new_shared; + new_shared = NULL; + } + + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + + /* + * To protect lockless access to n->shared during irq disabled context. + * If n->shared isn't NULL in irq disabled context, accessing it is + * guaranteed to be valid until irq is re-enabled, because it will be + * freed after synchronize_sched(). 
+ */ + if (force_change) + synchronize_sched(); + +fail: + kfree(old_shared); + kfree(new_shared); + free_alien_cache(new_alien); + + return ret; } static void cpuup_canceled(long cpu) @@ -967,14 +1039,13 @@ free_slab: n = get_node(cachep, node); if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); } } static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu) * array caches */ list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *shared = NULL; - struct alien_cache **alien = NULL; - - if (cachep->shared) { - shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, - 0xbaadf00d, GFP_KERNEL); - if (!shared) - goto bad; - } - if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); - if (!alien) { - kfree(shared); - goto bad; - } - } - n = get_node(cachep, node); - BUG_ON(!n); - - spin_lock_irq(&n->list_lock); - if (!n->shared) { - /* - * We are serialised from CPU_DEAD or - * CPU_UP_CANCELLED by the cpucontrol lock - */ - n->shared = shared; - shared = NULL; - } -#ifdef CONFIG_NUMA - if (!n->alien) { - n->alien = alien; - alien = NULL; - } -#endif - spin_unlock_irq(&n->list_lock); - kfree(shared); - free_alien_cache(alien); + err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); + if (err) + goto bad; } return 0; @@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node) if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); if (!list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial)) { @@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void) sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; - if (num_possible_nodes() == 1) + if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) @@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, /* * Needed to avoid possible looping condition - * in cache_grow() + * in cache_grow_begin() */ if (OFF_SLAB(freelist_cache)) continue; @@ -2138,7 +2229,7 @@ done: cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; - if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2180,6 +2271,11 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } +static void check_mutex_acquired(void) +{ + BUG_ON(!mutex_is_locked(&slab_mutex)); +} + static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP @@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) +#define check_mutex_acquired() do { } while(0) #define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, - int force, int node); +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, + int node, bool free_all, struct list_head *list) +{ + int tofree; + + if (!ac || !ac->avail) + return; + + tofree = free_all ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + + free_block(cachep, ac->entry, tofree, node, list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); +} static void do_drain(void *arg) { @@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_cache_node *n; int node; + LIST_HEAD(list); on_each_cpu(do_drain, cachep, 1); check_irq_on(); @@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep) if (n->alien) drain_alien_cache(cachep, n->alien); - for_each_kmem_cache_node(cachep, node, n) - drain_array(cachep, n, n->shared, 1, node); + for_each_kmem_cache_node(cachep, node, n) { + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } } /* @@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) check_irq_on(); for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); ret += !list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial); @@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct 
rnd_state rnd_state; }; + +/* + * Initialize the state based on the randomization method available. + * return true if the pre-computed list is available, false otherwise. + */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. + */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); - } -} - -static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) -{ - if (CONFIG_ZONE_DMA_FLAG) { - if (flags & GFP_DMA) - BUG_ON(!(cachep->allocflags & GFP_DMA)); - else - BUG_ON(cachep->allocflags & GFP_DMA); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, struct page *page) +static struct page *cache_grow_begin(struct kmem_cache *cachep, + gfp_t flags, int nodeid) { void *freelist; size_t offset; gfp_t local_flags; + int page_node; struct kmem_cache_node *n; + struct page *page; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep, } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = get_node(cachep, nodeid); - spin_lock(&n->list_lock); - - /* Get colour for the slab, and cal the next value. */ - offset = n->colour_next; - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - spin_unlock(&n->list_lock); - - offset *= cachep->colour_off; - if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* - * The test for missing atomic flag is performed here, rather than - * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is seriously mis-behaving they - * will eventually be caught here (where it matters). - */ - kmem_flagcheck(cachep, flags); - - /* * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!page) - page = kmem_getpages(cachep, local_flags, nodeid); + page = kmem_getpages(cachep, local_flags, nodeid); if (!page) goto failed; + page_node = page_to_nid(page); + n = get_node(cachep, page_node); + + /* Get colour for the slab, and cal the next value. */ + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + + offset = n->colour_next; + if (offset >= cachep->colour) + offset = 0; + + offset *= cachep->colour_off; + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, - local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, page_node); if (OFF_SLAB(cachep) && !freelist) goto opps1; @@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep, if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - check_irq_off(); - spin_lock(&n->list_lock); - /* Make slab active. */ - list_add_tail(&page->lru, &(n->slabs_free)); - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num; - spin_unlock(&n->list_lock); - return 1; + return page; + opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return 0; + return NULL; +} + +static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +{ + struct kmem_cache_node *n; + void *list = NULL; + + check_irq_off(); + + if (!page) + return; + + INIT_LIST_HEAD(&page->lru); + n = get_node(cachep, page_to_nid(page)); + + spin_lock(&n->list_lock); + if (!page->active) + list_add_tail(&page->lru, &(n->slabs_free)); + else + fixup_slab_list(cachep, n, page, &list); + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; + spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); } #if DEBUG @@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, return obj; } +/* + * Slab list should be fixed up by fixup_slab_list() for existing slab + * or cache_grow_end() for new slab + */ +static __always_inline int alloc_block(struct kmem_cache *cachep, + struct array_cache *ac, struct page *page, int batchcount) +{ + /* + * There must be at least one object available for + * allocation. 
+ */ + BUG_ON(page->active >= cachep->num); + + while (page->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, page); + } + + return batchcount; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; - struct array_cache *ac; + struct array_cache *ac, *shared; int node; void *list = NULL; + struct page *page; check_irq_off(); node = numa_mem_id(); -retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2810,16 +3056,20 @@ retry: n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); + shared = READ_ONCE(n->shared); + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ - if (n->shared && transfer_objects(ac, n->shared, batchcount)) { - n->shared->touched = 1; + if (shared && transfer_objects(ac, shared, batchcount)) { + shared->touched = 1; goto alloc_done; } while (batchcount > 0) { - struct page *page; /* Get slab alloc is to come from. */ page = get_first_slab(n, false); if (!page) @@ -2827,21 +3077,7 @@ retry: check_spinlock_acquired(cachep); - /* - * The slab was either on partial or free list so - * there must be at least one object available for - * allocation. - */ - BUG_ON(page->active >= cachep->num); - - while (page->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, page); - } - + batchcount = alloc_block(cachep, ac, page, batchcount); fixup_slab_list(cachep, n, page, &list); } @@ -2851,9 +3087,8 @@ alloc_done: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); +direct_grow: if (unlikely(!ac->avail)) { - int x; - /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { void *obj = cache_alloc_pfmemalloc(cachep, n, flags); @@ -2862,18 +3097,19 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); + page = cache_grow_begin(cachep, gfp_exact_node(flags), node); - /* cache_grow can reenable interrupts, then ac could change. */ + /* + * cache_grow_begin() can reenable interrupts, + * then ac could change. + */ ac = cpu_cache_get(cachep); - node = numa_mem_id(); + if (!ac->avail && page) + alloc_block(cachep, ac, page, batchcount); + cache_grow_end(cachep, page); - /* no objects in sight? abort */ - if (!x && ac->avail == 0) + if (!ac->avail) return NULL; - - if (!ac->avail) /* objects refilled by interrupt? 
*/ - goto retry; } ac->touched = 1; @@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(gfpflags_allow_blocking(flags)); -#if DEBUG - kmem_flagcheck(cachep, flags); -#endif } #if DEBUG @@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; - gfp_t local_flags; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; + struct page *page; int nid; unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); @@ -3040,33 +3271,19 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - struct page *page; - - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - kmem_flagcheck(cache, flags); - page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); + page = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, page); if (page) { + nid = page_to_nid(page); + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + /* - * Insert into the appropriate per node queues + * Another processor may allocate the objects in + * the slab since we are not holding any locks. */ - nid = page_to_nid(page); - if (cache_grow(cache, flags, nid, page)) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (!obj) - /* - * Another processor may allocate the - * objects in the slab since we are - * not holding any locks. - */ - goto retry; - } else { - /* cache_grow already freed obj */ - obj = NULL; - } + if (!obj) + goto retry; } } @@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct page *page; struct kmem_cache_node *n; - void *obj; + void *obj = NULL; void *list = NULL; - int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); -retry: check_irq_off(); spin_lock(&n->list_lock); page = get_first_slab(n, false); @@ -3113,18 +3328,18 @@ retry: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); - goto done; + return obj; must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); - if (x) - goto retry; - - return fallback_alloc(cachep, flags); + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ + obj = slab_get_obj(cachep, page); + } + cache_grow_end(cachep, page); -done: - return obj; + return obj ? 
obj : fallback_alloc(cachep, flags); } static __always_inline void * @@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp, { int i; struct kmem_cache_node *n = get_node(cachep, node); + struct page *page; + + n->free_objects += nr_objects; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); - n->free_objects++; /* fixup slab chains */ - if (page->active == 0) { - if (n->free_objects > n->free_limit) { - n->free_objects -= cachep->num; - list_add_tail(&page->lru, list); - } else { - list_add(&page->lru, &n->slabs_free); - } - } else { + if (page->active == 0) + list_add(&page->lru, &n->slabs_free); + else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp, list_add_tail(&page->lru, &n->slabs_partial); } } + + while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { + n->free_objects -= cachep->num; + + page = list_last_entry(&n->slabs_free, struct page, lru); + list_del(&page->lru); + list_add(&page->lru, list); + } } static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) @@ -3645,72 +3865,19 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) +static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) { + int ret; int node; struct kmem_cache_node *n; - struct array_cache *new_shared; - struct alien_cache **new_alien = NULL; for_each_online_node(node) { - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - new_shared = NULL; - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d, gfp); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; - } - } - - n = get_node(cachep, node); - if (n) { - struct array_cache *shared = n->shared; - LIST_HEAD(list); - - spin_lock_irq(&n->list_lock); - - if (shared) - free_block(cachep, shared->entry, - shared->avail, node, &list); - - n->shared = new_shared; - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - kfree(shared); - free_alien_cache(new_alien); - continue; - } - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) { - free_alien_cache(new_alien); - kfree(new_shared); + ret = setup_kmem_cache_node(cachep, node, gfp, true); + if (ret) goto fail; - } - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - n->shared = new_shared; - n->alien = new_alien; - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - cachep->node[node] = n; } + return 0; fail: @@ -3752,7 +3919,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; if (!prev) - goto alloc_node; + goto setup_node; for_each_online_cpu(cpu) { LIST_HEAD(list); @@ -3769,8 +3936,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } free_percpu(prev); -alloc_node: - return 
alloc_kmem_cache_node(cachep, gfp); +setup_node: + return setup_kmem_cache_nodes(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3804,6 +3971,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3857,6 +4028,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); @@ -3869,29 +4041,26 @@ skip_setup: * if drain_array() is used on the shared array. */ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int force, int node) + struct array_cache *ac, int node) { LIST_HEAD(list); - int tofree; + + /* ac from n->shared can be freed if we don't hold the slab_mutex. */ + check_mutex_acquired(); if (!ac || !ac->avail) return; - if (ac->touched && !force) { + + if (ac->touched) { ac->touched = 0; - } else { - spin_lock_irq(&n->list_lock); - if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node, &list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); - } - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); + return; } + + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); } /** @@ -3929,7 +4098,7 @@ static void cache_reap(struct work_struct *w) reap_alien(searchp, n); - drain_array(searchp, n, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), node); /* * These are racy checks but it does not matter @@ -3940,7 +4109,7 @@ static void cache_reap(struct work_struct *w) n->next_reap = jiffies + REAPTIMEOUT_NODE; - drain_array(searchp, n, n->shared, 0, node); + drain_array(searchp, n, n->shared, node); if (n->free_touched) n->free_touched = 0; diff --git a/mm/slub.c b/mm/slub.c index 4dbb109eb8cd..cf1faa4d3992 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; @@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. 
This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes. + * This means scanning over all nodes to look for partial slabs which + * may be expensive if we do it every time we are trying to find a slab * with available objects. */ if (!s->remote_node_defrag_ratio || @@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) * s->cpu_partial is checked locklessly (see put_cpu_partial), * so we have to make sure the change is visible. */ - kick_all_cpus_sync(); + synchronize_sched(); } flush_all(s); diff --git a/mm/swap_state.c b/mm/swap_state.c index 366ce3518703..0d457e7db8d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); @@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return new_page; } radix_tree_preload_end(); - ClearPageSwapBacked(new_page); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely diff --git a/mm/util.c b/mm/util.c index 6cc81e7b8705..8a1b3a1fb595 100644 --- a/mm/util.c +++ b/mm/util.c @@ -346,6 +346,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. + */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61f4822..dcfdfc1a0942 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. 
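The page_mapped() helper added to mm/util.c above checks the compound mapcount first and only then scans the per-subpage mapcounts, so a PMD-mapped huge page is answered without touching every tail page. The following standalone sketch models that check order; struct model_page, MODEL_NR_SUBPAGES and the plain int mapcount fields are illustrative stand-ins for the kernel's struct page and its atomic counters (the hugetlbfs early return is omitted), so this is a model of the logic only, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct page: plain ints model the atomic
 * _mapcount fields; -1 means "not mapped", as in the kernel. */
#define MODEL_NR_SUBPAGES 8

struct model_page {
	bool compound;				/* head of a compound page? */
	int compound_mapcount;			/* -1 when not mapped as a whole */
	int subpage_mapcount[MODEL_NR_SUBPAGES];/* -1 when a subpage is unmapped */
};

static bool model_page_mapped(const struct model_page *page)
{
	int i;

	if (!page->compound)
		return page->subpage_mapcount[0] >= 0;

	/* Mapped as a whole (e.g. by a PMD): no need to scan subpages. */
	if (page->compound_mapcount >= 0)
		return true;

	/* Otherwise any PTE-mapped subpage keeps the compound page "mapped". */
	for (i = 0; i < MODEL_NR_SUBPAGES; i++)
		if (page->subpage_mapcount[i] >= 0)
			return true;

	return false;
}

int main(void)
{
	struct model_page thp = { .compound = true, .compound_mapcount = -1 };
	int i;

	for (i = 0; i < MODEL_NR_SUBPAGES; i++)
		thp.subpage_mapcount[i] = -1;	/* nothing mapped yet */

	printf("mapped: %d\n", model_page_mapped(&thp));	/* 0 */
	thp.subpage_mapcount[3] = 0;		/* one subpage gains a PTE mapping */
	printf("mapped: %d\n", model_page_mapped(&thp));	/* 1 */
	return 0;
}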
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. */ @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 5e4300482897..5b72a8ad2813 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -570,49 +570,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #ifdef CONFIG_NUMA /* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. 
- * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. - */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - if (z->zone_pgdat == preferred_zone->zone_pgdat) { - __inc_zone_state(z, NUMA_HIT); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } - if (z->node == ((flags & __GFP_OTHER_NODE) ? - preferred_zone->node : numa_node_id())) - __inc_zone_state(z, NUMA_LOCAL); - else - __inc_zone_state(z, NUMA_OTHER); -} - -/* * Determine the per node value of a stat item. */ unsigned long node_page_state(int node, enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_page_state(zones + i, item); + + return count; } #endif @@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!memmap_valid_within(pfn, page, zone)) continue; + if (page_zone(page) != zone) + continue; + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) continue; page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (PageBuddy(page)) { pfn += (1UL << page_order(page)) - 1; continue; @@ -1378,6 +1354,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. 
+ */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { |
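The NUMA node_page_state() rework above replaces the hand-maintained #ifdef chain with a loop over every zone of the node; zones that are not populated on a given configuration simply contribute zero. A minimal standalone model of that summation follows; MODEL_MAX_NR_ZONES, MODEL_NR_ITEMS and the 2-D counter array are illustrative stand-ins for the kernel's per-zone counters and zone_page_state(), not the real data structures.

#include <stdio.h>

#define MODEL_MAX_NR_ZONES 5	/* e.g. DMA, DMA32, NORMAL, HIGHMEM, MOVABLE */
#define MODEL_NR_ITEMS 3

static unsigned long model_zone_state[MODEL_MAX_NR_ZONES][MODEL_NR_ITEMS];

/* Mirrors the reworked node_page_state(): sum one item over all zones of
 * the node instead of naming each zone behind its own #ifdef. */
static unsigned long model_node_page_state(int item)
{
	unsigned long count = 0;
	int i;

	for (i = 0; i < MODEL_MAX_NR_ZONES; i++)
		count += model_zone_state[i][item];
	return count;
}

int main(void)
{
	model_zone_state[2][1] = 100;	/* a "NORMAL"-like zone */
	model_zone_state[4][1] = 7;	/* a "MOVABLE"-like zone */
	printf("%lu\n", model_node_page_state(1));	/* prints 107 */
	return 0;
}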
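The vmstat_refresh() handler added above backs /proc/sys/vm/stat_refresh; as its comment notes, the file can be written to (by root) to fold the per-cpu deltas into the global counters right before reading them. A small userspace sketch of that sequence, assuming a kernel carrying this patch, might look like the following; /proc/vmstat is the long-standing stats file, and the error handling is deliberately minimal.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f;
	char line[256];

	/* Poke the sysctl added by this patch so per-cpu vm counters are
	 * flushed into the global ones (requires root). */
	f = fopen("/proc/sys/vm/stat_refresh", "w");
	if (!f) {
		perror("stat_refresh");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);
	fclose(f);

	/* The values read from /proc/vmstat are now up to date. */
	f = fopen("/proc/vmstat", "r");
	if (!f) {
		perror("vmstat");
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return 0;
}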