Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig            |  10
-rw-r--r-- | mm/cma.c              |   3
-rw-r--r-- | mm/compaction.c       |  56
-rw-r--r-- | mm/debug.c            |   4
-rw-r--r-- | mm/filemap.c          |  75
-rw-r--r-- | mm/gup.c              |  22
-rw-r--r-- | mm/huge_memory.c      |  67
-rw-r--r-- | mm/hugetlb.c          |  91
-rw-r--r-- | mm/kasan/kasan.c      |  19
-rw-r--r-- | mm/kasan/kasan.h      |   4
-rw-r--r-- | mm/kasan/quarantine.c |  94
-rw-r--r-- | mm/kasan/report.c     |   5
-rw-r--r-- | mm/khugepaged.c       |  39
-rw-r--r-- | mm/kmemleak.c         |  10
-rw-r--r-- | mm/list_lru.c         |   2
-rw-r--r-- | mm/madvise.c          |   1
-rw-r--r-- | mm/memcontrol.c       |  48
-rw-r--r-- | mm/memory-failure.c   |  12
-rw-r--r-- | mm/memory.c           | 100
-rw-r--r-- | mm/memory_hotplug.c   |  49
-rw-r--r-- | mm/mempolicy.c        |  30
-rw-r--r-- | mm/migrate.c          |  19
-rw-r--r-- | mm/mlock.c            |   7
-rw-r--r-- | mm/mprotect.c         |  19
-rw-r--r-- | mm/mremap.c           |  34
-rw-r--r-- | mm/nommu.c            |   2
-rw-r--r-- | mm/page_alloc.c       | 263
-rw-r--r-- | mm/percpu.c           |  16
-rw-r--r-- | mm/readahead.c        |  39
-rw-r--r-- | mm/rmap.c             |  69
-rw-r--r-- | mm/shmem.c            |  32
-rw-r--r-- | mm/slab.c             | 124
-rw-r--r-- | mm/slab.h             |  19
-rw-r--r-- | mm/slab_common.c      |  37
-rw-r--r-- | mm/slob.c             |   2
-rw-r--r-- | mm/slub.c             |  21
-rw-r--r-- | mm/swapfile.c         |  15
-rw-r--r-- | mm/truncate.c         |  29
-rw-r--r-- | mm/vmalloc.c          | 196
-rw-r--r-- | mm/vmscan.c           |  46
-rw-r--r-- | mm/vmstat.c           |  95
-rw-r--r-- | mm/workingset.c       | 114
-rw-r--r-- | mm/zsmalloc.c         |  67
-rw-r--r-- | mm/zswap.c            | 172
44 files changed, 1116 insertions(+), 1062 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig index be0ee11fa0d9..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 + depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG depends on NUMA default n help @@ -187,7 +187,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on !KASAN + depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator @@ -385,6 +385,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); + if (bitmap_count > bitmap_maxno) + return NULL; + for (;;) { mutex_lock(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, diff --git a/mm/compaction.c b/mm/compaction.c index 0409a4ad6ea1..223464227299 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } -/* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) -{ - struct page *page; - unsigned int count[2] = { 0, }; - - if (list_empty(&cc->migratepages)) - return; - - list_for_each_entry(page, &cc->migratepages, lru) - count[!!page_is_file_cache(page)]++; - - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); -} - /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { @@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -902,7 +888,6 @@ isolate_fail: spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } - acct_isolated(zone, cc); putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; cc->last_migrated_pfn = 0; @@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; } - acct_isolated(cc->zone, cc); return pfn; } @@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) { - acct_isolated(zone, cc); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; - } /* * Either we isolated something and proceed with migration. Or @@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, break; } - acct_isolated(zone, cc); /* Record where migration scanner will be restarted. 
*/ cc->migrate_pfn = low_pfn; @@ -2043,33 +2024,38 @@ void kcompactd_stop(int nid) * away, we get changed to run anywhere: as the first one comes back, * restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int kcompactd_cpu_online(unsigned int cpu) { int nid; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; - mask = cpumask_of_node(pgdat->node_id); + mask = cpumask_of_node(pgdat->node_id); - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kcompactd, mask); - } + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); } - return NOTIFY_OK; + return 0; } static int __init kcompactd_init(void) { int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/compaction:online", + kcompactd_cpu_online, NULL); + if (ret < 0) { + pr_err("kcompactd: failed to register hotplug callbacks.\n"); + return ret; + } for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); - hotcpu_notifier(cpu_callback, 0); return 0; } subsys_initcall(kcompactd_init) diff --git a/mm/debug.c b/mm/debug.c index 9feb699c5d25..db1cd26d8752 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (reason) pr_alert("page dumped because: %s\n", reason); diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..5b4dd03130da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + int i, nr; + + /* hugetlb pages are represented by one entry in the radix tree */ + nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - - if (!node) { - VM_BUG_ON_PAGE(nr != 1, page); - /* - * We need a node to properly account shadow - * entries. Don't plant any without. XXX - */ - shadow = NULL; - } - - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON_PAGE(!node && nr != 1, page); - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { @@ -790,9 +742,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); @@ -1734,6 +1684,9 @@ find_page: if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iter->type & ITER_PIPE)) + goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; /* Did it get truncated before we got the lock? */ @@ -526,7 +526,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -631,9 +631,9 @@ next_page: } while (nr_pages); return i; } -EXPORT_SYMBOL(__get_user_pages); -bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); @@ -858,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to - * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for + * tsk, mm to be specified. * * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). 
"FOLL_GET", - * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed - * according to the parameters "pages", "write", "force" - * respectively. + * caller if required (just like with __get_user_pages). "FOLL_GET" + * is set implicitly if "pages" is non-NULL. */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -895,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * get_user_pages_unlocked(tsk, mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead, if the two parameters - * "tsk" and "mm" are respectively equal to current and current->mm, - * or if "force" shall be set to 1 (get_user_pages_fast misses the - * "force" parameter). + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cdcd25cb30fe..cee42cf05477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &shmem_enabled_attr.attr, #endif @@ -737,8 +747,9 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - if (track_pfn_insert(vma, &pgprot, pfn)) - return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } @@ -1322,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; @@ -1377,12 +1390,23 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; @@ -1398,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); 
atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); @@ -1416,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1424,13 +1450,29 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. + */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd) + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; + bool force_flush = false; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || @@ -1456,10 +1498,11 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); + if (pmd_present(pmd) && pmd_dirty(pmd)) + force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl) && - vma_is_anonymous(vma)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); @@ -1467,6 +1510,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); if (new_ptl != old_ptl) spin_unlock(new_ptl); + if (force_flush) + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + else + *need_flush = true; spin_unlock(old_ptl); return true; } @@ -1581,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec49d9ef1eef..3edb759c5c7d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1826,11 +1826,17 @@ static void return_unused_surplus_pages(struct hstate *h, * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. + * + * vma_add_reservation is used in error paths where a reservation must + * be restored when a newly allocated huge page must be freed. It is + * to be called after calling vma_needs_reservation to determine if a + * reservation exists. 
*/ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, + VMA_ADD_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -1856,6 +1862,14 @@ static long __vma_reservation_common(struct hstate *h, region_abort(resv, idx, idx + 1); ret = 0; break; + case VMA_ADD_RESV: + if (vma->vm_flags & VM_MAYSHARE) + ret = region_add(resv, idx, idx + 1); + else { + region_abort(resv, idx, idx + 1); + ret = region_del(resv, idx, idx + 1); + } + break; default: BUG(); } @@ -1903,6 +1917,56 @@ static void vma_end_reservation(struct hstate *h, (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } +static long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); +} + +/* + * This routine is called to restore a reservation on error paths. In the + * specific error paths, a huge page was allocated (via alloc_huge_page) + * and is about to be freed. If a reservation for the page existed, + * alloc_huge_page would have consumed the reservation and set PagePrivate + * in the newly allocated page. When the page is freed via free_huge_page, + * the global reservation count will be incremented if PagePrivate is set. + * However, free_huge_page can not adjust the reserve map. Adjust the + * reserve map here to be consistent with global reserve count adjustments + * to be made by free_huge_page. + */ +static void restore_reserve_on_error(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, + struct page *page) +{ + if (unlikely(PagePrivate(page))) { + long rc = vma_needs_reservation(h, vma, address); + + if (unlikely(rc < 0)) { + /* + * Rare out of memory condition in reserve map + * manipulation. Clear PagePrivate so that + * global reserve count will not be incremented + * by free_huge_page. This will make it appear + * as though the reservation for this page was + * consumed. This may prevent the task from + * faulting in the page at a later time. This + * is better than inconsistent global huge page + * accounting of reserve counts. + */ + ClearPagePrivate(page); + } else if (rc) { + rc = vma_add_reservation(h, vma, address); + if (unlikely(rc < 0)) + /* + * See above comment about rare out of + * memory condition. + */ + ClearPagePrivate(page); + } else + vma_end_reservation(h, vma, address); + } +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -3222,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; @@ -3272,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } pte = huge_ptep_get_and_clear(mm, address, ptep); - tlb_remove_tlb_entry(tlb, ptep, address); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -3386,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. 
*/ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + unsigned long address, pte_t *ptep, + struct page *pagecache_page, spinlock_t *ptl) { + pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: @@ -3498,6 +3569,7 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: + restore_reserve_on_error(h, vma, address, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -3646,8 +3718,7 @@ retry: vma_end_reservation(h, vma, address); } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -3668,7 +3739,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3680,6 +3751,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); + restore_reserve_on_error(h, vma, address, page); put_page(page); goto out; } @@ -3822,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, + pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); @@ -4264,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + ptl = huge_pte_lock(hstate_vma(vma), mm, spte); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 70c009741aab..0e9505f66ec1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -764,6 +764,25 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); +/* Emitted by compiler to poison large objects when they go out of scope. */ +void __asan_poison_stack_memory(const void *addr, size_t size) +{ + /* + * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded + * by redzones, so we simply round up size to simplify logic. + */ + kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), + KASAN_USE_AFTER_SCOPE); +} +EXPORT_SYMBOL(__asan_poison_stack_memory); + +/* Emitted by compiler to unpoison large objects when they go into scope. 
*/ +void __asan_unpoison_stack_memory(const void *addr, size_t size) +{ + kasan_unpoison_shadow(addr, size); +} +EXPORT_SYMBOL(__asan_unpoison_stack_memory); + #ifdef CONFIG_MEMORY_HOTPLUG static int kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e5c2181fee6f..1c260e6b3b3c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -21,6 +21,7 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_USE_AFTER_SCOPE 0xF8 /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION @@ -53,6 +54,9 @@ struct kasan_global { #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif }; /** diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index baabaad4a4aa..dae929c02bbb 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) qlist_init(from); } -static void qlist_move(struct qlist_head *from, struct qlist_node *last, - struct qlist_head *to, size_t size) -{ - if (unlikely(last == from->tail)) { - qlist_move_all(from, to); - return; - } - if (qlist_empty(to)) - to->head = from->head; - else - to->tail->next = from->head; - to->tail = last; - from->head = last->next; - last->next = NULL; - from->bytes -= size; - to->bytes += size; -} - +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) /* * The object quarantine consists of per-cpu queues and a global queue, @@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last, */ static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); -static struct qlist_head global_quarantine; +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); /* Maximum size of the global queue. */ -static unsigned long quarantine_size; +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; /* * The fraction of physical memory the quarantine is allowed to occupy. 
@@ -124,9 +120,6 @@ static unsigned long quarantine_size; */ #define QUARANTINE_FRACTION 32 -#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) -#define QUARANTINE_PERCPU_SIZE (1 << 20) - static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { return virt_to_head_page(qlink)->slab_cache; @@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(!qlist_empty(&temp))) { spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_all(&temp, &global_quarantine); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } spin_unlock_irqrestore(&quarantine_lock, flags); } } void quarantine_reduce(void) { - size_t new_quarantine_size, percpu_quarantines; + size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; struct qlist_head to_free = QLIST_INIT; - size_t size_to_free = 0; - struct qlist_node *last; - if (likely(READ_ONCE(global_quarantine.bytes) <= - READ_ONCE(quarantine_size))) + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) return; spin_lock_irqsave(&quarantine_lock, flags); @@ -214,24 +216,23 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? - 0 : new_quarantine_size - percpu_quarantines; - WRITE_ONCE(quarantine_size, new_quarantine_size); - - last = global_quarantine.head; - while (last) { - struct kmem_cache *cache = qlink_to_cache(last); - - size_to_free += cache->size; - if (!last->next || size_to_free > - global_quarantine.bytes - QUARANTINE_LOW_SIZE) - break; - last = last->next; + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. 
*/ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; } - qlist_move(&global_quarantine, last, &to_free, size_to_free); spin_unlock_irqrestore(&quarantine_lock, flags); @@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg) void quarantine_remove_cache(struct kmem_cache *cache) { - unsigned long flags; + unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_cache(&global_quarantine, &to_free, cache); + for (i = 0; i < QUARANTINE_BATCHES; i++) + qlist_move_cache(&global_quarantine[i], &to_free, cache); spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 24c1211fe9d5..b82b3e215157 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -90,6 +90,9 @@ static void print_error_description(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; + case KASAN_USE_AFTER_SCOPE: + bug_type = "use-after-scope"; + break; } pr_err("BUG: KASAN: %s in %pS at addr %p\n", @@ -133,6 +136,8 @@ static void kasan_end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); kasan_enable_current(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 728d7790dc2d..09460955e818 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -103,6 +103,7 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +#ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -295,6 +296,7 @@ struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; +#endif /* CONFIG_SYSFS */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -1240,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; + bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1264,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); + /* + * now deposit the pgtable for arch that need it + * otherwise free it. + */ + if (arch_needs_pgtable_deposit()) { + /* + * The deposit should be visibile only after + * collapse is seen by others. 
+ */ + smp_wmb(); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, + pmd_pgtable(_pmd)); + deposited = true; + } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + if (!deposited) { + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } } } i_mmap_unlock_write(mapping); @@ -1401,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm, spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_lookup_slot(&mapping->page_tree, index); + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, + &mapping->tree_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1421,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); + slot = radix_tree_iter_next(&iter); index++; continue; out_lru: @@ -1519,9 +1542,11 @@ tree_unlocked: if (!page || iter.index < page->index) { if (!nr_none) break; - /* Put holes back where they were */ - radix_tree_replace_slot(slot, NULL); nr_none--; + /* Put holes back where they were */ + radix_tree_delete(&mapping->page_tree, + iter.index); + slot = radix_tree_iter_next(&iter); continue; } @@ -1530,11 +1555,13 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a5e453cf05c4..da3436953022 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -19,7 +19,7 @@ * * * For more information on the algorithm and kmemleak usage, please see - * Documentation/kmemleak.txt. + * Documentation/dev-tools/kmemleak.rst. 
* * Notes on locking * ---------------- @@ -1414,6 +1414,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); + scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ @@ -1453,8 +1454,11 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL); + void *stack = try_get_task_stack(p); + if (stack) { + scan_block(stack, stack + THREAD_SIZE, NULL); + put_task_stack(p); + } } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/mm/list_lru.c b/mm/list_lru.c index 1d05cb9d363d..234676e31edd 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -554,6 +554,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, err = memcg_init_list_lru(lru, memcg_aware); if (err) { kfree(lru->node); + /* Do this so a list_lru_destroy() doesn't crash: */ + lru->node = NULL; goto out; } diff --git a/mm/madvise.c b/mm/madvise.c index 93fb63e88b5e..0e3828eae9f8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae052b5e3315..175ec51c346d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1816,22 +1816,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static int memcg_cpu_hotplug_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +static int memcg_hotplug_cpu_dead(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - if (action == CPU_ONLINE) - return NOTIFY_OK; - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - return NOTIFY_OK; + return 0; } static void reclaim_high(struct mem_cgroup *memcg, @@ -1917,6 +1908,15 @@ retry: current->flags & PF_EXITING)) goto force; + /* + * Prevent unbounded recursion when reclaim operations need to + * allocate memory. This might exceed the limits temporarily, + * but we prefer facilitating memory reclaim and getting back + * under the limit over triggering OOM kills in these cases. + */ + if (unlikely(current->flags & PF_MEMALLOC)) + goto force; + if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; +static struct workqueue_struct *memcg_kmem_cache_create_wq; + static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - schedule_work(&cw->work); + queue_work(memcg_kmem_cache_create_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5765,16 +5767,28 @@ __setup("cgroup.memory=", cgroup_memory); /* * subsys_initcall() for memory controller. 
* - * Some parts like hotcpu_notifier() have to be initialized from this context - * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically - * everything that doesn't depend on a specific mem_cgroup structure should - * be initialized from here. + * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this + * context because of lock dependencies (cgroup_lock -> cpu hotplug) but + * basically everything that doesn't depend on a specific mem_cgroup structure + * should be initialized from here. */ static int __init mem_cgroup_init(void) { int cpu, node; - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); +#ifndef CONFIG_SLOB + /* + * Kmem cache creation is mostly done with the slab_mutex held, + * so use a special workqueue to avoid stalling all worker + * threads in case lots of cgroups are created simultaneously. + */ + memcg_kmem_cache_create_wq = + alloc_ordered_workqueue("memcg_kmem_cache_create", 0); + BUG_ON(!memcg_kmem_cache_create_wq); +#endif + + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, + memcg_hotplug_cpu_dead); for_each_possible_cpu(cpu) INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index de88f33519c0..19e796d36a62 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1112,10 +1112,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(p); + if (!PageAnon(p) || unlikely(split_huge_page(p))) { + unlock_page(p); + if (!PageAnon(p)) pr_err("Memory failure: %#lx: non anonymous thp\n", pfn); else @@ -1126,9 +1126,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(p); - put_hwpoison_page(hpage); + unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } diff --git a/mm/memory.c b/mm/memory.c index e18c57bdc75c..32e9b7aec366 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); - - if (!tlb->page_size) - tlb->page_size = page_size; - else { - if (page_size != tlb->page_size) - return true; - } + VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. 
+ */ + batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; @@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - batch->pages[batch->nr++] = page; return false; } @@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb, end -= PMD_SIZE; if (addr > end - 1) return; - + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; - struct page *pending_page = NULL; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1172,7 +1174,6 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; - pending_page = page; addr += PAGE_SIZE; break; } @@ -1213,11 +1214,6 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - if (pending_page) { - /* remove the page with new size */ - __tlb_remove_pte_page(tlb, pending_page); - pending_page = NULL; - } if (addr != end) goto again; } @@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1637,8 +1633,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); @@ -1655,8 +1651,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, pfn); /* * If we don't have pte special, then we have to use the pfn_valid() @@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } +static void deposit_prealloc_pte(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + fe->prealloc_pte = 0; +} + static int do_set_pmd(struct fault_env *fe, struct page *page) { struct vm_area_struct *vma = fe->vma; @@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. 
+ */ + if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { + fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); + if (!fe->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_none(*fe->pmd))) goto out; @@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(fe); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); @@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = 0; count_vm_event(THP_FILE_MAPPED); out: + /* + * If we are going to fallback to pte mapping, do a + * withdraw with pmd lock held. + */ + if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + fe->pmd); spin_unlock(fe->ptl); return ret; } @@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, ret = do_set_pmd(fe, page); if (ret != VM_FAULT_FALLBACK) - return ret; + goto fault_handled; } if (!fe->pte) { ret = pte_alloc_one_map(fe); if (ret) - return ret; + goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) - return VM_FAULT_NOPAGE; + if (unlikely(!pte_none(*fe->pte))) { + ret = VM_FAULT_NOPAGE; + goto fault_handled; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, fe->address, fe->pte); + ret = 0; - return 0; +fault_handled: + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + return ret; } static unsigned long fault_around_bytes __read_mostly = @@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? 
Page fault is solved */ if (pmd_trans_huge(*fe->pmd)) { ret = VM_FAULT_NOPAGE; @@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - split_huge_pmd(fe->vma, fe->pmd, fe->address); + __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); return VM_FAULT_FALLBACK; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..e43142c15631 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -1741,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); @@ -2131,7 +2097,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long section_nr = pfn_to_section_nr(pfn); @@ -2158,20 +2123,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. 
- */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b859af06b87..6d3639e1f254 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) @@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { get_page(page); spin_unlock(ptl); @@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, int nd) { - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: + if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + else { /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + return node_zonelist(nd, gfp); } diff --git a/mm/migrate.c b/mm/migrate.c index 99250aee1ac1..0ed24b1fa77b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use * __PageMovable because LRU page's mapping cannot have @@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { putback_lru_page(page); + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); @@ -1121,8 +1121,15 @@ out: * restored. */ list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. 
They can be recognized + * as __PageMovable + */ + if (likely(!__PageMovable(page))) + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } /* diff --git a/mm/mlock.c b/mm/mlock.c index 145a4258ddbc..cdbed8aaa426 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -190,10 +190,13 @@ unsigned int munlock_vma_page(struct page *page) */ spin_lock_irq(zone_lru_lock(zone)); - nr_pages = hpage_nr_pages(page); - if (!TestClearPageMlocked(page)) + if (!TestClearPageMlocked(page)) { + /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ + nr_pages = 1; goto unlock_out; + } + nr_pages = hpage_nr_pages(page); __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 11936526b08b..cc2459c57f60 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + int target_node = NUMA_NO_NODE; pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); if (!pte) return 0; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + if (target_node == page_to_nid(page)) + continue; } ptent = ptep_modify_prot_start(mm, addr, pte); @@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); if (pmd_trans_unstable(pmd)) continue; } else { @@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return do_mprotect_pkey(start, len, prot, -1); } +#ifdef CONFIG_ARCH_HAS_PKEYS + SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, unsigned long, prot, int, pkey) { @@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) */ return ret; } + +#endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/mm/mremap.c b/mm/mremap.c index da22ad2a5678..30d7d2482eea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) + unsigned long new_addr, bool need_rmap_locks, bool *need_flush) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + bool force_flush = false; + unsigned long len = old_end - old_addr; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma @@ -146,7 +148,19 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(*old_pte)) continue; + pte = ptep_get_and_clear(mm, old_addr, old_pte); + /* + * If we are remapping a dirty PTE, make sure + * to flush TLB before we drop the PTL for the + * old PTE or we may race with page_mkclean(). 
+ * + * This check has to be done after we removed the + * old PTE from page tables or another thread may + * dirty it after the check and before the removal. + */ + if (pte_present(pte) && pte_dirty(pte)) + force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); @@ -156,6 +170,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); + else + *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -201,13 +219,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd); + old_end, old_pmd, new_pmd, + &need_flush); if (need_rmap_locks) drop_rmap_locks(vma); - if (moved) { - need_flush = true; + if (moved) continue; - } } split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) @@ -220,11 +237,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, extent = next - new_addr; if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr, need_rmap_locks); - need_flush = true; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, + new_pmd, new_addr, need_rmap_locks, &need_flush); } - if (likely(need_flush)) + if (need_flush) flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); diff --git a/mm/nommu.c b/mm/nommu.c index db5fd1795298..8b8faaf2a9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,7 +109,7 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..f64e7bcb43b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy __latent_entropy; +volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif @@ -2058,8 +2058,12 @@ out_unlock: * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. + * + * If @force is true, try to unreserve a pageblock even though highatomic + * pageblock is exhausted. */ -static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct zone *zone; struct page *page; int order; + bool ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { - /* Preserve at least one pageblock */ - if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + /* + * Preserve at least one pageblock unless memory pressure + * is really high. 
+ */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can encounter several free pages in a pageblock + * in this loop although we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal @@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - move_freepages_block(zone, page, ac->migratetype); - spin_unlock_irqrestore(&zone->lock, flags); - return; + ret = move_freepages_block(zone, page, ac->migratetype); + if (ret) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } } spin_unlock_irqrestore(&zone->lock, flags); } + + return false; } /* Remove an element from the buddy allocator from the fallback list */ @@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, bool cold) { - int i; + int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; + alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } + + /* + * i pages were removed from the buddy list even if some leak due + * to check_pcp_refill failing so adjust NR_FREE_PAGES based + * on i. Do not confuse with 'alloced' which is the number of + * pages added to the pcp list. 
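The distinction the comment above draws between 'i' (pages taken off the buddy list) and 'alloced' (pages that actually reach the pcp list) can be shown with a small stand-alone sketch. This is an illustration only, not kernel code; the helper and counter names below are invented.

#include <stdbool.h>
#include <stdio.h>

static long nr_free_pages = 1024;          /* stand-in for NR_FREE_PAGES */

static bool passes_refill_check(int n)     /* stand-in for check_pcp_refill() */
{
        return (n % 7) != 3;               /* pretend the occasional page fails */
}

/* Take 'count' pages off a free list; return how many reach the pcp list. */
static int bulk_remove(int count)
{
        int i, alloced = 0;

        for (i = 0; i < count; i++) {
                if (!passes_refill_check(i))
                        continue;          /* removed from buddy, but leaked */
                alloced++;                 /* actually added to the pcp list */
        }
        nr_free_pages -= i;                /* adjust by i, not by alloced */
        return alloced;
}

int main(void)
{
        int added = bulk_remove(10);

        printf("added to pcp list: %d, free pages now: %ld\n", added, nr_free_pages);
        return 0;
}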
+ */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return i; + return alloced; } #ifdef CONFIG_NUMA @@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3305,7 +3340,7 @@ retry: * Shrink them them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac); + unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) - return false; + if (*no_progress_loops > MAX_RECLAIM_RETRIES) { + /* Before OOM, exhaust highatomic_reserve */ + return unreserve_highatomic_pageblock(ac, true); + } /* * Keep reclaiming pages while there is a chance this will lead @@ -3658,7 +3695,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { warn_alloc(gfp_mask, - "page alloction stalls for %ums, order:%u\n", + "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; } @@ -4224,7 +4261,7 @@ static void show_migration_types(unsigned char type) } *p = '\0'; - printk("(%s) ", tmp); + printk(KERN_CONT "(%s) ", tmp); } /* @@ -4335,7 +4372,8 @@ void show_free_areas(unsigned int filter) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); - printk("%s" + printk(KERN_CONT + "%s" " free:%lukB" " min:%lukB" " low:%lukB" @@ -4382,8 +4420,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %ld", zone->lowmem_reserve[i]); - printk("\n"); + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { @@ -4394,7 +4432,7 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); - printk("%s: ", zone->name); + printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -4412,11 +4450,12 @@ void show_free_areas(unsigned int filter) } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - printk("%lu*%lukB ", nr[order], K(1UL) << order); + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } - printk("= %lukB\n", K(total)); + printk(KERN_CONT "= %lukB\n", K(total)); } hugetlb_show_meminfo(); @@ -4977,72 +5016,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) } /* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. 
- * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -5304,49 +5277,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. 
- */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5297,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5309,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } @@ -6508,8 +6436,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } @@ -6600,38 +6528,39 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -static int page_alloc_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int page_alloc_cpu_dead(unsigned int cpu) { - int cpu = (unsigned long)hcpu; - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - lru_add_drain_cpu(cpu); - drain_pages(cpu); + lru_add_drain_cpu(cpu); + drain_pages(cpu); - /* - * Spill the event counters of the dead processor - * into the current processors event counters. - * This artificially elevates the count of the current - * processor. - */ - vm_events_fold_cpu(cpu); + /* + * Spill the event counters of the dead processor + * into the current processors event counters. + * This artificially elevates the count of the current + * processor. + */ + vm_events_fold_cpu(cpu); - /* - * Zero the differential counters of the dead processor - * so that the vm statistics are consistent. - * - * This is only okay since the processor is dead and cannot - * race with what we are doing. - */ - cpu_vm_stats_fold(cpu); - } - return NOTIFY_OK; + /* + * Zero the differential counters of the dead processor + * so that the vm statistics are consistent. + * + * This is only okay since the processor is dead and cannot + * race with what we are doing. 
+ */ + cpu_vm_stats_fold(cpu); + return 0; } void __init page_alloc_init(void) { - hotcpu_notifier(page_alloc_cpu_notify, 0); + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, + "mm/page_alloc:dead", NULL, + page_alloc_cpu_dead); + WARN_ON(ret < 0); } /* diff --git a/mm/percpu.c b/mm/percpu.c index 255714302394..f696385bcc44 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, size_t pages_size; struct page **pages; int unit, i, j, rc; + int upa; + int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size, if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* allocate pages */ j = 0; - for (unit = 0; unit < num_possible_cpus(); unit++) + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { - unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", - psize_str, cpu); + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } + } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; diff --git a/mm/readahead.c b/mm/readahead.c index c8a955b1297e..c4ca70239233 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -207,12 +207,21 @@ out: * memory at once. 
*/ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long max_pages; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { int err; @@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = ra->ra_pages; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages = ra->ra_pages; pgoff_t prev_offset; /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + + /* * start of file */ if (!offset) @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping, if ((offset == (ra->start + ra->size - ra->async_size) || offset == (ra->start + ra->size))) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max); + start = page_cache_next_hole(mapping, offset + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max) + if (!start || start - offset > max_pages) return 0; ra->start = start; ra->size = start - offset; /* old async_size */ ra->size += req_size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping, /* * oversize read */ - if (req_size > max) + if (req_size > max_pages) goto initial_readahead; /* @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max)) + if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) goto readit; /* @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping, initial_readahead: ra->start = offset; - ra->size = get_init_ra_size(req_size, max); + ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: @@ -454,7 +471,7 @@ readit: * the resulted next readahead window into the current one. 
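The readahead sizing introduced above amounts to a simple clamp: an explicit request larger than the default window may grow up to the device's optimal IO size. A minimal stand-alone sketch of that rule, with a made-up struct standing in for the bdi/file_ra_state fields:

#include <stdio.h>

struct ra_limits {
        unsigned long ra_pages;   /* default readahead window, in pages */
        unsigned long io_pages;   /* optimal hardware IO size, in pages */
};

/* Let a large explicit request grow past ra_pages, but never past io_pages. */
static unsigned long clamp_readahead(const struct ra_limits *l, unsigned long req)
{
        unsigned long max_pages = l->ra_pages;

        if (req > max_pages && l->io_pages > max_pages)
                max_pages = req < l->io_pages ? req : l->io_pages;
        return req < max_pages ? req : max_pages;
}

int main(void)
{
        struct ra_limits l = { .ra_pages = 32, .io_pages = 256 };

        printf("%lu\n", clamp_readahead(&l, 16));    /* small request: unchanged */
        printf("%lu\n", clamp_readahead(&l, 128));   /* grows past ra_pages */
        printf("%lu\n", clamp_readahead(&l, 1024));  /* capped at io_pages */
        return 0;
}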
*/ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max); + ra->async_size = get_next_ra_size(ra, max_pages); ra->size += ra->async_size; } diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..91619fd70939 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, } /** - * anon_vma_prepare - attach an anon_vma to a memory region + * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * - * The common case will be that we already have one, but if + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we @@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * This must be called with the mmap_sem held for reading. */ -int anon_vma_prepare(struct vm_area_struct *vma) +int __anon_vma_prepare(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); - if (unlikely(!anon_vma)) { - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_enomem; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + allocated = anon_vma; + } - anon_vma = find_mergeable_anon_vma(vma); + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* vma reference or self-parent link for new root */ + anon_vma->degree++; allocated = NULL; - if (!anon_vma) { - anon_vma = anon_vma_alloc(); - if (unlikely(!anon_vma)) - goto out_enomem_free_avc; - allocated = anon_vma; - } + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); - anon_vma_lock_write(anon_vma); - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { - vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; - allocated = NULL; - avc = NULL; - } - spin_unlock(&mm->page_table_lock); - anon_vma_unlock_write(anon_vma); + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); - if (unlikely(allocated)) - put_anon_vma(allocated); - if (unlikely(avc)) - anon_vma_chain_free(avc); - } return 0; out_enomem_free_avc: diff --git a/mm/shmem.c b/mm/shmem.c index ad7813d73ea7..abd7403aba41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages) static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { + struct radix_tree_node *node; void **pslot; void *item; VM_BUG_ON(!expected); 
VM_BUG_ON(!replacement); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (!pslot) + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + if (!item) return -ENOENT; - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - radix_tree_replace_slot(pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } @@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, int shmem_huge __read_mostly; +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge) return "bad_val"; } } +#endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) @@ -1483,6 +1486,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); + __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1537,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct mem_cgroup *memcg; @@ -1587,7 +1592,6 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); charge_mm = fault_mm ? : current->mm; @@ -1835,7 +1839,6 @@ unlock: put_page(page); } if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1846,6 +1849,18 @@ unlock: return error; } +/* + * This is like autoremove_wake_function, but it removes the wait queue + * entry unconditionally - even if something else had already woken the + * target. 
+ */ +static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + list_del_init(&wait->task_list); + return ret; +} + static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); @@ -1881,7 +1896,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT(shmem_fault_wait); + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && @@ -2663,6 +2678,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); spin_unlock(&inode->i_lock); error = 0; goto out; diff --git a/mm/slab.c b/mm/slab.c index 090fb26b3a39..87b29e76cafd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,6 +227,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); + parent->total_slabs = 0; + parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; @@ -966,7 +968,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * guaranteed to be valid until irq is re-enabled, because it will be * freed after synchronize_sched(). */ - if (force_change) + if (old_shared && force_change) synchronize_sched(); fail: @@ -1365,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { #if DEBUG struct kmem_cache_node *n; - struct page *page; unsigned long flags; int node; static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -1380,29 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, num_objs = 0, free_objects = 0; - unsigned long active_slabs = 0, num_slabs = 0; + unsigned long total_slabs, free_slabs, free_objs; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) { - active_objs += cachep->num; - active_slabs++; - } - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; - active_slabs++; - } - list_for_each_entry(page, &n->slabs_free, lru) - num_slabs++; - - free_objects += n->free_objects; + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_slabs += active_slabs; - num_objs = num_slabs * cachep->num; - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, num_objs, - free_objects); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); } #endif } @@ -2314,6 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); + n->free_slabs--; + n->total_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. 
@@ -2327,7 +2319,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2347,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) @@ -2748,10 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); - if (!page->active) + n->total_slabs++; + if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); - else + n->free_slabs++; + } else fixup_slab_list(cachep, n, page, &list); + STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -2896,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (!page->active) + if (!page->active) { list_add_tail(&page->lru, &n->slabs_free); - else + n->free_slabs++; + } else list_add_tail(&page->lru, &n->slabs_partial); list_for_each_entry(page, &n->slabs_partial, lru) { @@ -2906,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, return page; } + n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { - if (!PageSlabPfmemalloc(page)) + if (!PageSlabPfmemalloc(page)) { + n->free_slabs--; return page; + } } return NULL; @@ -2918,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); + assert_spin_locked(&n->list_lock); + page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_free, struct page, + lru); + if (page) + n->free_slabs--; } if (sk_memalloc_socks()) - return get_valid_first_slab(n, page, pfmemalloc); + page = get_valid_first_slab(n, page, pfmemalloc); return page; } @@ -3427,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) + if (page->active == 0) { list_add(&page->lru, &n->slabs_free); - else { + n->free_slabs++; + } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
@@ -3443,6 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); + n->free_slabs--; + n->total_slabs--; } } @@ -4094,61 +4098,33 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; - const char *name; - char *error = NULL; + unsigned long active_objs, num_objs, active_slabs; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; int node; struct kmem_cache_node *n; - active_objs = 0; - num_slabs = 0; for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) { - if (page->active != cachep->num && !error) - error = "slabs_full accounting error"; - active_objs += cachep->num; - active_slabs++; - } - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; - active_slabs++; - } - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; - num_slabs++; - } - free_objects += n->free_objects; + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; + free_objs += n->free_objects; + if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_slabs += active_slabs; - num_objs = num_slabs * cachep->num; - if (num_objs - active_objs != free_objects && !error) - error = "free_objects accounting error"; - - name = cachep->name; - if (error) - pr_err("slab: cache %s error: %s\n", name, error); + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; + active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; sinfo->active_slabs = active_slabs; - sinfo->num_slabs = num_slabs; + sinfo->num_slabs = total_slabs; sinfo->shared_avail = shared_avail; sinfo->limit = cachep->limit; sinfo->batchcount = cachep->batchcount; diff --git a/mm/slab.h b/mm/slab.h index 9653f2e2591a..de6579dc362c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_CACHE_FLAGS (0) #endif +/* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_NOTRACK | \ + SLAB_ACCOUNT) + int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -432,6 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + 
unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 71f0b28a1bec..ae323841adb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -533,8 +539,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, s = create_cache(cache_name, root_cache->object_size, root_cache->size, root_cache->align, - root_cache->flags, root_cache->ctor, - memcg, root_cache); + root_cache->flags & CACHE_CREATE_MASK, + root_cache->ctor, memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); +#ifdef CONFIG_SLUB + /* + * In case of SLUB, we need to disable empty slab caching to + * avoid pinning the offline memory cgroup by freeable kmem + * pages charged to it. SLAB doesn't need this, as it + * periodically purges unused slabs. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; + if (c) { + c->cpu_partial = 0; + c->min_partial = 0; + } + } + mutex_unlock(&slab_mutex); + /* + * kmem_cache->cpu_partial is checked locklessly (see + * put_cpu_partial()). Make sure the change is visible. + */ + synchronize_sched(); +#endif + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) @@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c, true); + __kmem_cache_shrink(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 2b3e740609e9..067598a00849 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (unlikely(!df.page)) + if (!df.page) continue; slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); @@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. 
*/ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2210de290b54..1c6e0321205d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { + cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; + cond_resched(); } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; @@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - if (frontswap) { - if (frontswap_test(si, i)) - break; - else - continue; - } count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - break; + if (!frontswap || frontswap_test(si, i)) + break; + if ((i % LATENCY_LIMIT) == 0) + cond_resched(); } return i; } @@ -2224,6 +2223,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } diff --git a/mm/truncate.c b/mm/truncate.c index a01cce450a26..fd97f1dbce29 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. 
- */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -283,7 +268,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -371,7 +356,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -492,7 +477,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); /* Middle of THP: skip */ if (PageTransTail(page)) { @@ -612,7 +597,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b2..a5584384eabc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + might_sleep(); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual critical section protected + * by this lock, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -615,59 +622,40 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; + bool do_free = false; - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. 
- */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); - - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; + do_free = true; } - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!do_free) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); } - spin_unlock(&purge_lock); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -676,9 +664,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (mutex_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + } } /* @@ -686,9 +675,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -711,22 +701,13 @@ static void free_vmap_area_noflush(struct vmap_area *va) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. 
- */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); -} - -/* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) @@ -740,16 +721,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1070,6 +1041,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1094,7 +1067,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -1107,7 +1084,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1116,10 +1095,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1455,6 +1438,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; @@ -1510,7 +1495,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. 
+ */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1533,11 +1550,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -2574,32 +2589,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_area_lock) { - loff_t n = *pos; - struct vmap_area *va; - spin_lock(&vmap_area_lock); - va = list_first_entry(&vmap_area_list, typeof(*va), list); - while (n > 0 && &va->list != &vmap_area_list) { - n--; - va = list_next_entry(va, list); - } - if (!n && &va->list != &vmap_area_list) - return va; - - return NULL; - + return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vmap_area *va = p, *next; - - ++*pos; - next = list_next_entry(va, list); - if (&next->list != &vmap_area_list) - return next; - - return NULL; + return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) @@ -2634,9 +2630,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vmap_area *va = p; + struct vmap_area *va; struct vm_struct *v; + va = list_entry(p, struct vmap_area, list); + /* * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. diff --git a/mm/vmscan.c b/mm/vmscan.c index 744f926af442..6aa5b01d3e75 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. 
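The deferred-work bookkeeping added above (next_deferred, scanned) carries unfinished scan work into the shrinker's next pass. A minimal stand-alone sketch of that arithmetic, with invented names and a simplified scan target:

#include <stdio.h>

static long nr_deferred;       /* stand-in for shrinker->nr_deferred[nid] */

/* One shrink pass: scan in batches, carry unfinished work to the next pass. */
static void shrink_pass(long freeable, long batch_size)
{
        long total_scan = nr_deferred + freeable / 2;  /* simplified target */
        long next_deferred = total_scan;
        long scanned = 0;

        while (total_scan >= batch_size) {
                /* pretend shrinker->scan_objects() freed a batch here */
                total_scan -= batch_size;
                scanned += batch_size;
        }

        /* only the work we did not get to is deferred to the next pass */
        if (next_deferred >= scanned)
                next_deferred -= scanned;
        else
                next_deferred = 0;
        nr_deferred = next_deferred;
}

int main(void)
{
        shrink_pass(1000, 128);
        printf("deferred to next pass: %ld\n", nr_deferred);
        return 0;
}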
*/ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); @@ -2354,6 +2362,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc } } + cond_resched(); + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) continue; @@ -3043,7 +3053,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + current->flags |= PF_MEMALLOC; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3554,24 +3566,21 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int kswapd_cpu_online(unsigned int cpu) { int nid; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; - mask = cpumask_of_node(pgdat->node_id); + mask = cpumask_of_node(pgdat->node_id); - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kswapd, mask); - } + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kswapd, mask); } - return NOTIFY_OK; + return 0; } /* @@ -3613,12 +3622,15 @@ void kswapd_stop(int nid) static int __init kswapd_init(void) { - int nid; + int nid, ret; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - hotcpu_notifier(cpu_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/vmscan:online", kswapd_cpu_online, + NULL); + WARN_ON(ret < 0); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 604f26a4f696..7c28df36f50f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1720,75 +1720,66 @@ static void __init start_shepherd_timer(void) static void __init init_cpu_node_state(void) { - int cpu; + int node; - get_online_cpus(); - for_each_online_cpu(cpu) - node_set_state(cpu_to_node(cpu), N_CPU); - put_online_cpus(); + for_each_online_node(node) { + if (cpumask_weight(cpumask_of_node(node)) > 0) + node_set_state(node, N_CPU); + } } -static void vmstat_cpu_dead(int node) +static int vmstat_cpu_online(unsigned int cpu) { - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (cpu_to_node(cpu) == node) - goto end; + refresh_zone_stat_thresholds(); + node_set_state(cpu_to_node(cpu), N_CPU); + return 0; +} - node_clear_state(node, N_CPU); -end: - put_online_cpus(); +static int vmstat_cpu_down_prep(unsigned int cpu) +{ + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + return 0; } -/* - * Use the cpu notifier to insure that the thresholds are recalculated - * when necessary. 
- */ -static int vmstat_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - refresh_zone_stat_thresholds(); - node_set_state(cpu_to_node(cpu), N_CPU); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - refresh_zone_stat_thresholds(); - vmstat_cpu_dead(cpu_to_node(cpu)); - break; - default: - break; - } - return NOTIFY_OK; +static int vmstat_cpu_dead(unsigned int cpu) +{ + const struct cpumask *node_cpus; + int node; + + node = cpu_to_node(cpu); + + refresh_zone_stat_thresholds(); + node_cpus = cpumask_of_node(node); + if (cpumask_weight(node_cpus) > 0) + return 0; + + node_clear_state(node, N_CPU); + return 0; } -static struct notifier_block vmstat_notifier = - { &vmstat_cpuup_callback, NULL, 0 }; #endif static int __init setup_vmstat(void) { #ifdef CONFIG_SMP - cpu_notifier_register_begin(); - __register_cpu_notifier(&vmstat_notifier); + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", + NULL, vmstat_cpu_dead); + if (ret < 0) + pr_err("vmstat: failed to register 'dead' hotplug state\n"); + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online", + vmstat_cpu_online, + vmstat_cpu_down_prep); + if (ret < 0) + pr_err("vmstat: failed to register 'online' hotplug state\n"); + + get_online_cpus(); init_cpu_node_state(); + put_online_cpus(); start_shepherd_timer(); - cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); diff --git a/mm/workingset.c b/mm/workingset.c index 617475f529f4..241fa5d6b3b2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> @@ -334,48 +335,81 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. 
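The tracking rule described in the comment above reduces to one decision per radix-tree node: keep it on the shadow-node LRU only while it is non-empty and holds nothing but shadow entries. A stand-alone sketch with a made-up node type, for illustration only:

#include <stdbool.h>
#include <stdio.h>

struct fake_node {
        int count;            /* total populated slots */
        int exceptional;      /* slots holding shadow (exceptional) entries */
        bool on_lru;          /* stand-in for !list_empty(&node->private_list) */
};

/* Track only non-empty nodes made up entirely of shadow entries. */
static void update_tracking(struct fake_node *node)
{
        if (node->count && node->count == node->exceptional) {
                if (!node->on_lru)
                        node->on_lru = true;   /* list_lru_add() in the patch */
        } else {
                if (node->on_lru)
                        node->on_lru = false;  /* list_lru_del() in the patch */
        }
}

int main(void)
{
        struct fake_node n = { .count = 3, .exceptional = 3, .on_lru = false };

        update_tracking(&n);
        printf("shadow-only node tracked: %d\n", n.on_lru);

        n.count = 4;           /* a real page was inserted alongside the shadows */
        update_tracking(&n);
        printf("mixed node tracked: %d\n", n.on_lru);
        return 0;
}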
+ */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; - unsigned long pages; + unsigned long nodes; + unsigned long cache; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); - if (memcg_kmem_enabled()) { - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - /* - * Active cache pages are limited to 50% of memory, and shadow - * entries that represent a refault distance bigger than that - * do not have any effect. Limit the number of shadow nodes - * such that shadow entries do not exceed the number of active - * cache pages, assuming a worst-case node population density - * of 1/8th on average. + * Approximate a reasonable limit for the radix tree nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. * * On 64-bit with 7 radix_tree_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume - * ~2% of available memory: + * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ - max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + if (sc->memcg) { + cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + } else { + cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } + max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. 
*/ - BUG_ON(!workingset_node_shadows(node)); - BUG_ON(workingset_node_pages(node)); - + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(node->count != node->exceptional)) + goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(!mapping->nrexceptional)) + goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); - BUG_ON(!mapping->nrexceptional); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - BUG_ON(workingset_node_shadows(node)); + if (WARN_ON_ONCE(node->exceptional)) + goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - if (!__radix_tree_delete_node(&mapping->page_tree, node)) - BUG(); + __radix_tree_delete_node(&mapping->page_tree, node); +out_invalid: spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: @@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -492,7 +532,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -500,7 +540,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b0bc023d25c5..9cc3c0b2c2c1 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1284,61 +1284,21 @@ out: #endif /* CONFIG_PGTABLE_MAPPING */ -static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, - void *pcpu) +static int zs_cpu_prepare(unsigned int cpu) { - int ret, cpu = (long)pcpu; struct mapping_area *area; - switch (action) { - case CPU_UP_PREPARE: - area = &per_cpu(zs_map_area, cpu); - ret = __zs_cpu_up(area); - if (ret) - return notifier_from_errno(ret); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - break; - } - - return NOTIFY_OK; + area = &per_cpu(zs_map_area, cpu); + return __zs_cpu_up(area); } -static struct notifier_block zs_cpu_nb = { - .notifier_call = zs_cpu_notifier -}; - -static int zs_register_cpu_notifier(void) +static int zs_cpu_dead(unsigned int cpu) { - int cpu, uninitialized_var(ret); - - cpu_notifier_register_begin(); - - __register_cpu_notifier(&zs_cpu_nb); - for_each_online_cpu(cpu) { - ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) - break; - } - - cpu_notifier_register_done(); - return notifier_to_errno(ret); -} - -static void zs_unregister_cpu_notifier(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - __unregister_cpu_notifier(&zs_cpu_nb); + struct mapping_area *area; - 
cpu_notifier_register_done(); + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + return 0; } static void __init init_zs_size_classes(void) @@ -2534,10 +2494,10 @@ static int __init zs_init(void) if (ret) goto out; - ret = zs_register_cpu_notifier(); - + ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", + zs_cpu_prepare, zs_cpu_dead); if (ret) - goto notifier_fail; + goto hp_setup_fail; init_zs_size_classes(); @@ -2549,8 +2509,7 @@ static int __init zs_init(void) return 0; -notifier_fail: - zs_unregister_cpu_notifier(); +hp_setup_fail: zsmalloc_unmount(); out: return ret; @@ -2562,7 +2521,7 @@ static void __exit zs_exit(void) zpool_unregister_driver(&zs_zpool_driver); #endif zsmalloc_unmount(); - zs_unregister_cpu_notifier(); + cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); } diff --git a/mm/zswap.c b/mm/zswap.c index 275b22cc8df4..067a0d62f318 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -118,7 +118,7 @@ struct zswap_pool { struct kref kref; struct list_head list; struct work_struct work; - struct notifier_block notifier; + struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -352,143 +352,58 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, **********************************/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); -static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu) +static int zswap_dstmem_prepare(unsigned int cpu) { u8 *dst; - switch (action) { - case CPU_UP_PREPARE: - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); - return NOTIFY_BAD; - } - per_cpu(zswap_dstmem, cpu) = dst; - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - dst = per_cpu(zswap_dstmem, cpu); - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - break; - default: - break; + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + return -ENOMEM; } - return NOTIFY_OK; + per_cpu(zswap_dstmem, cpu) = dst; + return 0; } -static int zswap_cpu_dstmem_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) +static int zswap_dstmem_dead(unsigned int cpu) { - return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu); -} + u8 *dst; -static struct notifier_block zswap_dstmem_notifier = { - .notifier_call = zswap_cpu_dstmem_notifier, -}; + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; -static int __init zswap_cpu_dstmem_init(void) -{ - unsigned long cpu; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) == - NOTIFY_BAD) - goto cleanup; - __register_cpu_notifier(&zswap_dstmem_notifier); - cpu_notifier_register_done(); return 0; - -cleanup: - for_each_online_cpu(cpu) - __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); - cpu_notifier_register_done(); - return -ENOMEM; -} - -static void zswap_cpu_dstmem_destroy(void) -{ - unsigned long cpu; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); - __unregister_cpu_notifier(&zswap_dstmem_notifier); - cpu_notifier_register_done(); } -static int __zswap_cpu_comp_notifier(struct zswap_pool *pool, - unsigned long action, unsigned long cpu) +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_comp *tfm; - switch (action) { - case 
CPU_UP_PREPARE: - if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) - break; - tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - pr_err("could not alloc crypto comp %s : %ld\n", - pool->tfm_name, PTR_ERR(tfm)); - return NOTIFY_BAD; - } - *per_cpu_ptr(pool->tfm, cpu) = tfm; - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - tfm = *per_cpu_ptr(pool->tfm, cpu); - if (!IS_ERR_OR_NULL(tfm)) - crypto_free_comp(tfm); - *per_cpu_ptr(pool->tfm, cpu) = NULL; - break; - default: - break; - } - return NOTIFY_OK; -} - -static int zswap_cpu_comp_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - unsigned long cpu = (unsigned long)pcpu; - struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier); - - return __zswap_cpu_comp_notifier(pool, action, cpu); -} + if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) + return 0; -static int zswap_cpu_comp_init(struct zswap_pool *pool) -{ - unsigned long cpu; - - memset(&pool->notifier, 0, sizeof(pool->notifier)); - pool->notifier.notifier_call = zswap_cpu_comp_notifier; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) == - NOTIFY_BAD) - goto cleanup; - __register_cpu_notifier(&pool->notifier); - cpu_notifier_register_done(); + tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); + if (IS_ERR_OR_NULL(tfm)) { + pr_err("could not alloc crypto comp %s : %ld\n", + pool->tfm_name, PTR_ERR(tfm)); + return -ENOMEM; + } + *per_cpu_ptr(pool->tfm, cpu) = tfm; return 0; - -cleanup: - for_each_online_cpu(cpu) - __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); - cpu_notifier_register_done(); - return -ENOMEM; } -static void zswap_cpu_comp_destroy(struct zswap_pool *pool) +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { - unsigned long cpu; + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_comp *tfm; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); - __unregister_cpu_notifier(&pool->notifier); - cpu_notifier_register_done(); + tfm = *per_cpu_ptr(pool->tfm, cpu); + if (!IS_ERR_OR_NULL(tfm)) + crypto_free_comp(tfm); + *per_cpu_ptr(pool->tfm, cpu) = NULL; + return 0; } /********************************* @@ -569,6 +484,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -593,7 +509,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } - if (zswap_cpu_comp_init(pool)) + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) goto error; pr_debug("using %s compressor\n", pool->tfm_name); @@ -647,7 +565,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) { zswap_pool_debug("destroying", pool); - zswap_cpu_comp_destroy(pool); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->tfm); zpool_destroy_pool(pool->zpool); kfree(pool); @@ -1238,6 +1156,7 @@ static void __exit zswap_debugfs_exit(void) { } static int __init init_zswap(void) { struct zswap_pool *pool; + int ret; zswap_init_started = true; @@ -1246,11 +1165,20 @@ static int __init init_zswap(void) goto cache_fail; } - if (zswap_cpu_dstmem_init()) { + ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", + 
zswap_dstmem_prepare, zswap_dstmem_dead); + if (ret) { pr_err("dstmem alloc failed\n"); goto dstmem_fail; } + ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, + "mm/zswap_pool:prepare", + zswap_cpu_comp_prepare, + zswap_cpu_comp_dead); + if (ret) + goto hp_fail; + pool = __zswap_pool_create_fallback(); if (!pool) { pr_err("pool creation failed\n"); @@ -1267,7 +1195,9 @@ static int __init init_zswap(void) return 0; pool_fail: - zswap_cpu_dstmem_destroy(); + cpuhp_remove_state_nocalls(CPUHP_MM_ZSWP_POOL_PREPARE); +hp_fail: + cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); dstmem_fail: zswap_entry_cache_destroy(); cache_fail: |
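
Editor's note (not part of the commit): the vmscan, vmstat, zsmalloc and zswap hunks above all make the same structural change, replacing a CPU notifier (a callback switching on CPU_UP_PREPARE, CPU_DEAD, etc., registered with register_cpu_notifier() and primed with a manual for_each_online_cpu() loop) with a pair of cpuhp state-machine callbacks. Below is a minimal, hedged sketch of that pattern; the demo_* identifiers and the "mm/demo:online" state name are hypothetical, and it uses a dynamic CPUHP_AP_ONLINE_DYN state where the patch instead adds fixed CPUHP_MM_*_PREPARE states to enum cpuhp_state.

#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/topology.h>

static DEFINE_PER_CPU(void *, demo_buf);	/* hypothetical per-CPU resource */
static enum cpuhp_state demo_state;

/* Replaces the CPU_UP_PREPARE / CPU_ONLINE arm of the old notifier. */
static int demo_cpu_online(unsigned int cpu)
{
	void *buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));

	if (!buf)
		return -ENOMEM;	/* the hotplug core unwinds CPUs already set up */
	per_cpu(demo_buf, cpu) = buf;
	return 0;
}

/* Replaces the CPU_DEAD / CPU_UP_CANCELED arm. */
static int demo_cpu_offline(unsigned int cpu)
{
	kfree(per_cpu(demo_buf, cpu));
	per_cpu(demo_buf, cpu) = NULL;
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/*
	 * One call replaces register_cpu_notifier() plus the manual
	 * bring-up loop: the online callback also runs for every CPU
	 * that is already up at registration time.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/demo:online",
				demo_cpu_online, demo_cpu_offline);
	if (ret < 0)
		return ret;
	demo_state = ret;	/* dynamic states return the allocated state */
	return 0;
}

static void __exit demo_exit(void)
{
	/* Invokes demo_cpu_offline() on all online CPUs, then unregisters. */
	cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");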
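
Editor's note (not part of the commit): the zswap pool hunks use the multi-instance flavour of the same API, because each pool carries its own per-CPU crypto_comp transforms and is created or destroyed independently of the others. A hedged sketch under the same assumptions (hypothetical demo_pool names, a dynamic state instead of the fixed CPUHP_MM_ZSWP_POOL_PREPARE the patch adds):

#include <linux/cpuhotplug.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/topology.h>

struct demo_pool {
	void * __percpu *scratch;	/* one buffer per CPU, per pool */
	struct hlist_node node;		/* links this pool into the cpuhp state */
};

static enum cpuhp_state demo_multi_state;

static int demo_pool_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct demo_pool *pool = hlist_entry(node, struct demo_pool, node);
	void *buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));

	if (!buf)
		return -ENOMEM;
	*per_cpu_ptr(pool->scratch, cpu) = buf;
	return 0;
}

static int demo_pool_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct demo_pool *pool = hlist_entry(node, struct demo_pool, node);

	kfree(*per_cpu_ptr(pool->scratch, cpu));
	*per_cpu_ptr(pool->scratch, cpu) = NULL;
	return 0;
}

/* Registered once, e.g. from the subsystem's init function. */
static int demo_multi_setup(void)
{
	int ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
					  "mm/demo_pool:online",
					  demo_pool_cpu_online,
					  demo_pool_cpu_offline);
	if (ret < 0)
		return ret;
	demo_multi_state = ret;
	return 0;
}

/* Per-pool setup/teardown, analogous to zswap_pool_create()/_destroy(). */
static int demo_pool_register(struct demo_pool *pool)
{
	pool->scratch = alloc_percpu(void *);
	if (!pool->scratch)
		return -ENOMEM;
	/* Runs the online callback for this instance on every online CPU. */
	return cpuhp_state_add_instance(demo_multi_state, &pool->node);
}

static void demo_pool_unregister(struct demo_pool *pool)
{
	cpuhp_state_remove_instance(demo_multi_state, &pool->node);
	free_percpu(pool->scratch);
}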
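
Editor's note on the shadow-node limit in the mm/workingset.c hunk above: with RADIX_TREE_MAP_SHIFT = 6 on 64-bit, max_nodes = cache >> 3, i.e. one tracked node per eight page-cache pages. At the assumed worst-case density of 8 shadow entries per 64-slot node, and with roughly 7 radix_tree_nodes fitting in a page, those max_nodes nodes occupy about (cache / 8) * (PAGE_SIZE / 7) bytes = cache * PAGE_SIZE / 56, i.e. 1/56 ≈ 1.8% of the memory occupied by the cache pages themselves, which is where the "~1.8% of available memory" figure in the updated comment comes from.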