diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 29 | ||||
-rw-r--r-- | mm/gup.c | 48 | ||||
-rw-r--r-- | mm/huge_memory.c | 40 | ||||
-rw-r--r-- | mm/hugetlb.c | 15 | ||||
-rw-r--r-- | mm/kasan/Makefile | 3 | ||||
-rw-r--r-- | mm/kasan/common.c | 43 | ||||
-rw-r--r-- | mm/kasan/report.c | 10 | ||||
-rw-r--r-- | mm/kmemleak.c | 42 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory.c | 6 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 1 | ||||
-rw-r--r-- | mm/mmap.c | 7 | ||||
-rw-r--r-- | mm/mmu_gather.c | 129 | ||||
-rw-r--r-- | mm/page_alloc.c | 64 | ||||
-rw-r--r-- | mm/page_owner.c | 82 | ||||
-rw-r--r-- | mm/percpu.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 58 | ||||
-rw-r--r-- | mm/slab.c | 52 | ||||
-rw-r--r-- | mm/slub.c | 21 | ||||
-rw-r--r-- | mm/swapfile.c | 32 | ||||
-rw-r--r-- | mm/util.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 113 | ||||
-rw-r--r-- | mm/vmscan.c | 29 | ||||
-rw-r--r-- | mm/vmstat.c | 5 |
25 files changed, 484 insertions, 377 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index f171a83707ce..3319e0872d01 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -242,6 +242,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, bool check_target) { struct page *page = pfn_to_online_page(pfn); + struct page *block_page; struct page *end_page; unsigned long block_pfn; @@ -267,20 +268,26 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, get_pageblock_migratetype(page) != MIGRATE_MOVABLE) return false; + /* Ensure the start of the pageblock or zone is online and valid */ + block_pfn = pageblock_start_pfn(pfn); + block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn)); + if (block_page) { + page = block_page; + pfn = block_pfn; + } + + /* Ensure the end of the pageblock or zone is online and valid */ + block_pfn += pageblock_nr_pages; + block_pfn = min(block_pfn, zone_end_pfn(zone) - 1); + end_page = pfn_to_online_page(block_pfn); + if (!end_page) + return false; + /* * Only clear the hint if a sample indicates there is either a * free page or an LRU page in the block. One or other condition * is necessary for the block to be a migration source/target. */ - block_pfn = pageblock_start_pfn(pfn); - pfn = max(block_pfn, zone->zone_start_pfn); - page = pfn_to_page(pfn); - if (zone != page_zone(page)) - return false; - pfn = block_pfn + pageblock_nr_pages; - pfn = min(pfn, zone_end_pfn(zone)); - end_page = pfn_to_page(pfn); - do { if (pfn_valid_within(pfn)) { if (check_source && PageLRU(page)) { @@ -309,7 +316,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, static void __reset_isolation_suitable(struct zone *zone) { unsigned long migrate_pfn = zone->zone_start_pfn; - unsigned long free_pfn = zone_end_pfn(zone); + unsigned long free_pfn = zone_end_pfn(zone) - 1; unsigned long reset_migrate = free_pfn; unsigned long reset_free = migrate_pfn; bool source_set = false; @@ -1363,7 +1370,7 @@ fast_isolate_freepages(struct compact_control *cc) count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ - order = -1; + order = cc->search_order + 1; page = NULL; } } @@ -160,8 +160,12 @@ retry: goto retry; } - if (flags & FOLL_GET) - get_page(page); + if (flags & FOLL_GET) { + if (unlikely(!try_get_page(page))) { + page = ERR_PTR(-ENOMEM); + goto out; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -298,7 +302,10 @@ retry_locked: if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { - get_page(page); + if (unlikely(!try_get_page(page))) { + spin_unlock(ptl); + return ERR_PTR(-ENOMEM); + } spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); @@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if (is_device_public_page(*page)) goto unmap; } - get_page(*page); + if (unlikely(!try_get_page(*page))) { + ret = -ENOMEM; + goto unmap; + } out: ret = 0; unmap: @@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } +/* + * Return the compund head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) @@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) + head = try_get_compound_head(page, 1); + if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { @@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pmd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pmd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pud_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pud_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pgd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pgd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 404acdcd0455..b6a34b32d8ac 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pmd_lock(mm, pmd); + if (!pmd_none(*pmd)) { + if (write) { + if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); + goto out_unlock; + } + entry = pmd_mkyoung(*pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + + goto out_unlock; + } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); @@ -766,11 +781,16 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); + +out_unlock: spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); } vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -821,6 +841,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pud_lock(mm, pud); + if (!pud_none(*pud)) { + if (write) { + if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + goto out_unlock; + } + entry = pud_mkyoung(*pud); + entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); + if (pudp_set_access_flags(vma, addr, pud, entry, 1)) + update_mmu_cache_pud(vma, addr, pud); + } + goto out_unlock; + } + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); @@ -830,6 +864,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); + +out_unlock: spin_unlock(ptl); } @@ -1641,7 +1677,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1717,7 +1753,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 97b1e0290c66..641cedfc8c0f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3353,7 +3353,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * This is a hugetlb vma, all the pte entries should point * to huge page. */ - tlb_remove_check_page_size_change(tlb, sz); + tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); /* @@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index f06ee820d356..08b43de2383b 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -2,11 +2,13 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_common.o := n UBSAN_SANITIZE_generic.o := n +UBSAN_SANITIZE_generic_report.o := n UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 @@ -14,6 +16,7 @@ CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-$(CONFIG_KASAN) := common.o init.o report.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 80bbe62b16cd..36afcf64e016 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,6 +36,7 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> +#include <linux/uaccess.h> #include "kasan.h" #include "../slab.h" @@ -48,37 +49,28 @@ static inline int in_irqentry_text(unsigned long ptr) ptr < (unsigned long)&__softirqentry_text_end); } -static inline void filter_irq_stacks(struct stack_trace *trace) +static inline unsigned int filter_irq_stacks(unsigned long *entries, + unsigned int nr_entries) { - int i; + unsigned int i; - if (!trace->nr_entries) - return; - for (i = 0; i < trace->nr_entries; i++) - if (in_irqentry_text(trace->entries[i])) { + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { /* Include the irqentry function into the stack. */ - trace->nr_entries = i + 1; - break; + return i + 1; } + } + return nr_entries; } static inline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = KASAN_STACK_DEPTH, - .skip = 0 - }; + unsigned int nr_entries; - save_stack_trace(&trace); - filter_irq_stacks(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; - - return depot_save_stack(&trace, flags); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + nr_entries = filter_irq_stacks(entries, nr_entries); + return stack_depot_save(entries, nr_entries, flags); } static inline void set_track(struct kasan_track *track, gfp_t flags) @@ -614,6 +606,15 @@ void kasan_free_shadow(const struct vm_struct *vm) vfree(kasan_mem_to_shadow(vm->addr)); } +extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); + +void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +{ + unsigned long flags = user_access_save(); + __kasan_report(addr, size, is_write, ip); + user_access_restore(flags); +} + #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ca9418fe9232..03a443579386 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -100,10 +100,11 @@ static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { - struct stack_trace trace; + unsigned long *entries; + unsigned int nr_entries; - depot_fetch_stack(track->stack, &trace); - print_stack_trace(&trace, 0); + nr_entries = stack_depot_fetch(track->stack, &entries); + stack_trace_print(entries, nr_entries, 0); } else { pr_err("(stack is not available)\n"); } @@ -281,8 +282,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void kasan_report(unsigned long addr, size_t size, - bool is_write, unsigned long ip) +void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { struct kasan_access_info info; void *tagged_addr; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 707fa5579f66..e57bf810f798 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -410,11 +410,6 @@ static void print_unreferenced(struct seq_file *seq, */ static void dump_object_info(struct kmemleak_object *object) { - struct stack_trace trace; - - trace.nr_entries = object->trace_len; - trace.entries = object->trace; - pr_notice("Object 0x%08lx (size %zu):\n", object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", @@ -424,7 +419,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); - print_stack_trace(&trace, 4); + stack_trace_print(object->trace, object->trace_len, 4); } /* @@ -553,15 +548,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali */ static int __save_stack_trace(unsigned long *trace) { - struct stack_trace stack_trace; - - stack_trace.max_entries = MAX_TRACE; - stack_trace.nr_entries = 0; - stack_trace.entries = trace; - stack_trace.skip = 2; - save_stack_trace(&stack_trace); - - return stack_trace.nr_entries; + return stack_trace_save(trace, MAX_TRACE, 2); } /* @@ -1401,6 +1388,7 @@ static void scan_block(void *_start, void *_end, /* * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. */ +#ifdef CONFIG_SMP static void scan_large_block(void *start, void *end) { void *next; @@ -1412,6 +1400,7 @@ static void scan_large_block(void *start, void *end) cond_resched(); } } +#endif /* * Scan a memory block corresponding to a kmemleak_object. A condition is @@ -1529,11 +1518,6 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - /* data/bss scanning */ - scan_large_block(_sdata, _edata); - scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_ro_after_init, __end_ro_after_init); - #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) @@ -2024,13 +2008,8 @@ early_param("kmemleak", kmemleak_boot_config); static void __init print_log_trace(struct early_log *log) { - struct stack_trace trace; - - trace.nr_entries = log->trace_len; - trace.entries = log->trace; - pr_notice("Early log backtrace:\n"); - print_stack_trace(&trace, 2); + stack_trace_print(log->trace, log->trace_len, 2); } /* @@ -2071,6 +2050,17 @@ void __init kmemleak_init(void) } local_irq_restore(flags); + /* register the data/bss sections */ + create_object((unsigned long)_sdata, _edata - _sdata, + KMEMLEAK_GREY, GFP_ATOMIC); + create_object((unsigned long)__bss_start, __bss_stop - __bss_start, + KMEMLEAK_GREY, GFP_ATOMIC); + /* only register .data..ro_after_init if not within .data */ + if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + create_object((unsigned long)__start_ro_after_init, + __end_ro_after_init - __start_ro_after_init, + KMEMLEAK_GREY, GFP_ATOMIC); + /* * This is the point where tracking allocations is safe. Automatic * scanning is started during the late initcall. Add the early logged diff --git a/mm/madvise.c b/mm/madvise.c index 21a7881a2db4..bb3a4554d5d5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 532e0e2a4817..81a0d3914ec9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3882,6 +3882,22 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page(). + */ +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->stat[idx]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; + if (x < 0) + x = 0; + return x; +} + /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -3907,10 +3923,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; diff --git a/mm/memory.c b/mm/memory.c index ab650c21bccd..36aac6844662 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -356,7 +356,7 @@ void free_pgd_range(struct mmu_gather *tlb, * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1046,7 +1046,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *pte; swp_entry_t entry; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1155,7 +1155,7 @@ again: */ if (force_flush) { force_flush = 0; - tlb_flush_mmu_free(tlb); + tlb_flush_mmu(tlb); if (addr != end) goto again; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0082d699be94..b236069ff0d8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,6 +874,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ mem = find_memory_block(__pfn_to_section(pfn)); nid = mem->nid; + put_device(&mem->dev); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); diff --git a/mm/mmap.c b/mm/mmap.c index 41eb48d9b527..bd7b9f293b39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index f2f03c655807..99740e1dd273 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -11,7 +11,7 @@ #include <asm/pgalloc.h> #include <asm/tlb.h> -#ifdef HAVE_GENERIC_MMU_GATHER +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { @@ -41,35 +41,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } -void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - - /* Is it from 0 to ~0? */ - tlb->fullmm = !(start | (end+1)); - tlb->need_flush_all = 0; - tlb->local.next = NULL; - tlb->local.nr = 0; - tlb->local.max = ARRAY_SIZE(tlb->__pages); - tlb->active = &tlb->local; - tlb->batch_count = 0; - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; -#endif - tlb->page_size = 0; - - __tlb_reset_range(tlb); -} - -void tlb_flush_mmu_free(struct mmu_gather *tlb) +static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -77,31 +52,10 @@ void tlb_flush_mmu_free(struct mmu_gather *tlb) tlb->active = &tlb->local; } -void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) +static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; - if (force) { - __tlb_reset_range(tlb); - __tlb_adjust_range(tlb, start, end - start); - } - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); @@ -109,19 +63,15 @@ void arch_tlb_finish_mmu(struct mmu_gather *tlb, tlb->local.next = NULL; } -/* __tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while - * handling the additional races in SMP caused by other CPUs caching valid - * mappings in their TLBs. Returns the number of free page slots left. - * When out of page slots we must call tlb_flush_mmu(). - *returns true if the caller should flush. - */ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); +#endif batch = tlb->active; /* @@ -139,7 +89,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ return false; } -#endif /* HAVE_GENERIC_MMU_GATHER */ +#endif /* HAVE_MMU_GATHER_NO_GATHER */ #ifdef CONFIG_HAVE_RCU_TABLE_FREE @@ -152,7 +102,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { -#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE +#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE /* * Invalidate page-table caches used by hardware walkers. Then we still * need to RCU-sched wait while freeing the pages because software @@ -193,7 +143,7 @@ static void tlb_remove_table_rcu(struct rcu_head *head) free_page((unsigned long)batch); } -void tlb_table_flush(struct mmu_gather *tlb) +static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; @@ -225,6 +175,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb_table_flush(tlb); +#endif +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb_batch_pages_flush(tlb); +#endif +} + +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize @@ -240,10 +206,40 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { - arch_tlb_gather_mmu(tlb, mm, start, end); + tlb->mm = mm; + + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); + +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb->need_flush_all = 0; + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + tlb->batch_count = 0; +#endif + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb->batch = NULL; +#endif +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE + tlb->page_size = 0; +#endif + + __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } +/** + * tlb_finish_mmu - finish an mmu_gather structure + * @tlb: the mmu_gather structure to finish + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called at the end of the shootdown operation to free up any resources that + * were required. + */ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { @@ -254,8 +250,17 @@ void tlb_finish_mmu(struct mmu_gather *tlb, * the TLB by observing pte_none|!pte_dirty, for example so flush TLB * forcefully if we detect parallel PTE batching threads. */ - bool force = mm_tlb_flush_nested(tlb->mm); + if (mm_tlb_flush_nested(tlb->mm)) { + __tlb_reset_range(tlb); + __tlb_adjust_range(tlb, start, end - start); + } - arch_tlb_finish_mmu(tlb, start, end, force); + tlb_flush_mmu(tlb); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb_batch_list_free(tlb); +#endif dec_tlb_flush_pending(tlb->mm); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d96ca5bc555b..59661106da16 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -266,7 +266,20 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_DISCONTIGMEM +/* + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges + * are not on separate NUMA nodes. Functionally this works but with + * watermark_boost_factor, it can reclaim prematurely as the ranges can be + * quite small. By default, do not boost watermarks on discontigmem as in + * many cases very high-order allocations like THP are likely to be + * unsupported and the premature reclaim offsets the advantage of long-term + * fragmentation avoidance. + */ +int watermark_boost_factor __read_mostly; +#else int watermark_boost_factor __read_mostly = 15000; +#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; @@ -1131,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page, } arch_free_page(page, order); kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); + if (debug_pagealloc_enabled()) + kernel_map_pages(page, 1 << order, 0); + kasan_free_nondeferred_pages(page, order); return true; @@ -2001,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); + if (debug_pagealloc_enabled()) + kernel_map_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); @@ -3419,8 +3435,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) alloc_flags |= ALLOC_KSWAPD; #ifdef CONFIG_ZONE_DMA32 + if (!zone) + return alloc_flags; + if (zone_idx(zone) != ZONE_NORMAL) - goto out; + return alloc_flags; /* * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and @@ -3429,9 +3448,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) */ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); if (nr_online_nodes > 1 && !populated_zone(--zone)) - goto out; + return alloc_flags; -out: + alloc_flags |= ALLOC_NOFRAGMENT; #endif /* CONFIG_ZONE_DMA32 */ return alloc_flags; } @@ -3773,11 +3792,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) { - WARN_ON_ONCE(page); - return NULL; - } - /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall @@ -8005,7 +8019,10 @@ void *__init alloc_large_system_hash(const char *tablename, bool has_unmovable_pages(struct zone *zone, struct page *page, int count, int migratetype, int flags) { - unsigned long pfn, iter, found; + unsigned long found; + unsigned long iter = 0; + unsigned long pfn = page_to_pfn(page); + const char *reason = "unmovable page"; /* * TODO we could make this much more efficient by not checking every @@ -8015,17 +8032,20 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * can still lead to having bootmem allocations in zone_movable. */ - /* - * CMA allocations (alloc_contig_range) really need to mark isolate - * CMA pageblocks even when they are not movable in fact so consider - * them movable here. - */ - if (is_migrate_cma(migratetype) && - is_migrate_cma(get_pageblock_migratetype(page))) - return false; + if (is_migrate_cma_page(page)) { + /* + * CMA allocations (alloc_contig_range) really need to mark + * isolate CMA pageblocks even when they are not movable in fact + * so consider them movable here. + */ + if (is_migrate_cma(migratetype)) + return false; + + reason = "CMA page"; + goto unmovable; + } - pfn = page_to_pfn(page); - for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + for (found = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; if (!pfn_valid_within(check)) @@ -8105,7 +8125,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); if (flags & REPORT_FAILURE) - dump_page(pfn_to_page(pfn+iter), "unmovable page"); + dump_page(pfn_to_page(pfn + iter), reason); return true; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 925b6f44a444..addcbb2ae4e4 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -58,15 +58,10 @@ static bool need_page_owner(void) static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; - struct stack_trace dummy; + unsigned int nr_entries; - dummy.nr_entries = 0; - dummy.max_entries = ARRAY_SIZE(entries); - dummy.entries = &entries[0]; - dummy.skip = 0; - - save_stack_trace(&dummy); - return depot_save_stack(&dummy, GFP_KERNEL); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + return stack_depot_save(entries, nr_entries, GFP_KERNEL); } static noinline void register_dummy_stack(void) @@ -120,49 +115,39 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -static inline bool check_recursive_alloc(struct stack_trace *trace, - unsigned long ip) +static inline bool check_recursive_alloc(unsigned long *entries, + unsigned int nr_entries, + unsigned long ip) { - int i; - - if (!trace->nr_entries) - return false; + unsigned int i; - for (i = 0; i < trace->nr_entries; i++) { - if (trace->entries[i] == ip) + for (i = 0; i < nr_entries; i++) { + if (entries[i] == ip) return true; } - return false; } static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 2 - }; depot_stack_handle_t handle; + unsigned int nr_entries; - save_stack_trace(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); /* - * We need to check recursion here because our request to stackdepot - * could trigger memory allocation to save new entry. New memory - * allocation would reach here and call depot_save_stack() again - * if we don't catch it. There is still not enough memory in stackdepot - * so it would try to allocate memory again and loop forever. + * We need to check recursion here because our request to + * stackdepot could trigger memory allocation to save new + * entry. New memory allocation would reach here and call + * stack_depot_save_entries() again if we don't catch it. There is + * still not enough memory in stackdepot so it would try to + * allocate memory again and loop forever. */ - if (check_recursive_alloc(&trace, _RET_IP_)) + if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) return dummy_handle; - handle = depot_save_stack(&trace, flags); + handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; @@ -340,16 +325,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { - int ret; - int pageblock_mt, page_mt; + int ret, pageblock_mt, page_mt; + unsigned long *entries; + unsigned int nr_entries; char *kbuf; - unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 0 - }; count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); @@ -378,8 +357,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - depot_fetch_stack(handle, &trace); - ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); + nr_entries = stack_depot_fetch(handle, &entries); + ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0); if (ret >= count) goto err; @@ -410,14 +389,9 @@ void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; - unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 0 - }; depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; gfp_t gfp_mask; int mt; @@ -441,10 +415,10 @@ void __dump_page_owner(struct page *page) return; } - depot_fetch_stack(handle, &trace); + nr_entries = stack_depot_fetch(handle, &entries); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); - print_stack_trace(&trace, 0); + stack_trace_print(entries, nr_entries, 0); if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", diff --git a/mm/percpu.c b/mm/percpu.c index 2e6fc8d552c9..68dd2e7e73b5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2567,8 +2567,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, ai->groups[group].base_offset = areas[group] - base; } - pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); @@ -2692,8 +2692,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, ai->static_size, + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); diff --git a/mm/shmem.c b/mm/shmem.c index b3db3779a30a..2275a0ff7c30 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode) } spin_unlock(&sbinfo->shrinklist_lock); } - if (!list_empty(&info->swaplist)) { + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); mutex_lock(&shmem_swaplist_mutex); - list_del_init(&info->swaplist); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } } @@ -1099,10 +1104,11 @@ extern struct swap_info_struct *swap_info[]; static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, - bool frontswap) + unsigned int type, bool frontswap) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; + swp_entry_t entry; unsigned int ret = 0; if (!nr_entries) @@ -1116,13 +1122,12 @@ static int shmem_find_swap_entries(struct address_space *mapping, if (!xa_is_value(page)) continue; - if (frontswap) { - swp_entry_t entry = radix_to_swp_entry(page); - - if (!frontswap_test(swap_info[swp_type(entry)], - swp_offset(entry))) - continue; - } + entry = radix_to_swp_entry(page); + if (swp_type(entry) != type) + continue; + if (frontswap && + !frontswap_test(swap_info[type], swp_offset(entry))) + continue; indices[ret] = xas.xa_index; entries[ret] = page; @@ -1194,7 +1199,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, pvec.pages, indices, - frontswap); + type, frontswap); if (pvec.nr == 0) { ret = 0; break; @@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse) { struct shmem_inode_info *info, *next; - struct inode *inode; - struct inode *prev_inode = NULL; int error = 0; if (list_empty(&shmem_swaplist)) return 0; mutex_lock(&shmem_swaplist_mutex); - - /* - * The extra refcount on the inode is necessary to safely dereference - * p->next after re-acquiring the lock. New shmem inodes with swap - * get added to the end of the list and we will scan them all. - */ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } - - inode = igrab(&info->vfs_inode); - if (!inode) - continue; - + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). + */ + atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - prev_inode = inode; - error = shmem_unuse_inode(inode, type, frontswap, + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, fs_pages_to_unuse); cond_resched(); @@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap, next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); if (error) break; } mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - return error; } @@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->shrinklist); diff --git a/mm/slab.c b/mm/slab.c index 329bfe67f2ca..284ab737faee 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) } #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, - unsigned long caller) -{ - int size = cachep->object_size; - - addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; - - if (size < 5 * sizeof(unsigned long)) - return; - - *addr++ = 0x12345678; - *addr++ = caller; - *addr++ = smp_processor_id(); - size -= 3 * sizeof(unsigned long); - { - unsigned long *sptr = &caller; - unsigned long svalue; - - while (!kstack_end(sptr)) { - svalue = *sptr++; - if (kernel_text_address(svalue)) { - *addr++ = svalue; - size -= sizeof(unsigned long); - if (size <= sizeof(unsigned long)) - break; - } - } - - } - *addr++ = 0x87654321; -} - -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) { if (!is_debug_pagealloc_cache(cachep)) return; - if (caller) - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) {} + int map) {} #endif @@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2374,7 +2338,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2434,7 +2397,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, 0); + slab_kernel_map(cachep, objp, 0); } } #endif @@ -2813,7 +2776,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, caller); + slab_kernel_map(cachep, objp, 0); } return objp; } @@ -3077,7 +3040,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -4308,7 +4271,8 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, + root_caches_node); struct page *page; struct kmem_cache_node *n; const char *name; diff --git a/mm/slub.c b/mm/slub.c index d30ede89f4a6..6b28cd2b5a58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -552,31 +552,22 @@ static void set_track(struct kmem_cache *s, void *object, if (addr) { #ifdef CONFIG_STACKTRACE - struct stack_trace trace; - int i; + unsigned int nr_entries; - trace.nr_entries = 0; - trace.max_entries = TRACK_ADDRS_COUNT; - trace.entries = p->addrs; - trace.skip = 3; metadata_access_enable(); - save_stack_trace(&trace); + nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3); metadata_access_disable(); - /* See rant in lockdep.c */ - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries - 1] == ULONG_MAX) - trace.nr_entries--; - - for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) - p->addrs[i] = 0; + if (nr_entries < TRACK_ADDRS_COUNT) + p->addrs[nr_entries] = 0; #endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; p->when = jiffies; - } else + } else { memset(p, 0, sizeof(struct track)); + } } static void init_tracking(struct kmem_cache *s, void *object) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2b8d9c3fbb47..cf63b5f01adf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2023,7 +2023,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * If the boolean frontswap is true, only unuse pages_to_unuse pages; * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -#define SWAP_UNUSE_MAX_TRIES 3 int try_to_unuse(unsigned int type, bool frontswap, unsigned long pages_to_unuse) { @@ -2035,7 +2034,6 @@ int try_to_unuse(unsigned int type, bool frontswap, struct page *page; swp_entry_t entry; unsigned int i; - int retries = 0; if (!si->inuse_pages) return 0; @@ -2053,11 +2051,9 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while ((p = p->next) != &init_mm.mmlist) { - if (signal_pending(current)) { - retval = -EINTR; - break; - } + while (si->inuse_pages && + !signal_pending(current) && + (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) @@ -2084,7 +2080,9 @@ retry: mmput(prev_mm); i = 0; - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + while (si->inuse_pages && + !signal_pending(current) && + (i = find_next_to_unuse(si, i, frontswap)) != 0) { entry = swp_entry(type, i); page = find_get_page(swap_address_space(entry), i); @@ -2117,14 +2115,18 @@ retry: * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. - * Its not worth continuosuly retrying to unuse the swap in this case. - * So we try SWAP_UNUSE_MAX_TRIES times. + * + * Limit the number of retries? No: when mmget_not_zero() above fails, + * that mm is likely to be freeing swap from exit_mmap(), which proceeds + * at its own independent pace; and even shmem_writepage() could have + * been preempted after get_swap_page(), temporarily hiding that swap. + * It's easy and robust (though cpu-intensive) just to keep retrying. */ - if (++retries >= SWAP_UNUSE_MAX_TRIES) - retval = -EBUSY; - else if (si->inuse_pages) - goto retry; - + if (si->inuse_pages) { + if (!signal_pending(current)) + goto retry; + retval = -EINTR; + } out: return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; } diff --git a/mm/util.c b/mm/util.c index d559bde497a9..43a2984bccaa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -204,7 +204,7 @@ EXPORT_SYMBOL(vmemdup_user); * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * - * Return: newly allocated copy of @s or %NULL in case of error + * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86ba6e74b50..e5e9e1fcac01 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> @@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size) spin_unlock(&vb->lock); } -/** - * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer - * - * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily - * to amortize TLB flushing overheads. What this means is that any page you - * have now, may, in a former life, have been mapped into kernel virtual - * address by the vmap layer and so there might be some CPUs with TLB entries - * still referencing that page (additional to the regular 1:1 kernel mapping). - * - * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can - * be sure that none of the pages we have control over will have any aliases - * from the vmap layer. - */ -void vm_unmap_aliases(void) +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { - unsigned long start = ULONG_MAX, end = 0; int cpu; - int flush = 0; if (unlikely(!vmap_initialized)) return; @@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int flush = 0; + + _vm_unmap_aliases(start, end, flush); +} EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** @@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr) return NULL; } +static inline void set_area_direct_map(const struct vm_struct *area, + int (*set_direct_map)(struct page *page)) +{ + int i; + + for (i = 0; i < area->nr_pages; i++) + if (page_address(area->pages[i])) + set_direct_map(area->pages[i]); +} + +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long start = ULONG_MAX, end = 0; + int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; + int i; + + /* + * The below block can be removed when all architectures that have + * direct map permissions also have set_direct_map_() implementations. + * This is concerned with resetting the direct map any an vm alias with + * execute permissions, without leaving a RW+X window. + */ + if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + set_memory_nx(addr, area->nr_pages); + set_memory_rw(addr, area->nr_pages); + } + + remove_vm_area(area->addr); + + /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ + if (!flush_reset) + return; + + /* + * If not deallocating pages, just do the flush of the VM area and + * return. + */ + if (!deallocate_pages) { + vm_unmap_aliases(); + return; + } + + /* + * If execution gets here, flush the vm mapping and reset the direct + * map. Find the start and end range of the direct mappings to make sure + * the vm_unmap_aliases() flush includes the direct map. + */ + for (i = 0; i < area->nr_pages; i++) { + if (page_address(area->pages[i])) { + start = min(addr, start); + end = max(addr, end); + } + } + + /* + * Set direct map to something invalid so that it won't be cached if + * there are any accesses after the TLB flush, then flush the TLB and + * reset the direct map permissions to the default. + */ + set_area_direct_map(area, set_direct_map_invalid_noflush); + _vm_unmap_aliases(start, end, 1); + set_area_direct_map(area, set_direct_map_default_noflush); +} + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - remove_vm_area(addr); + vm_remove_mappings(area, deallocate_pages); + if (deallocate_pages) { int i; @@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node); */ void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) diff --git a/mm/vmscan.c b/mm/vmscan.c index a5ad0b35ab8e..a815f73ee4d5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2176,7 +2176,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct mem_cgroup *memcg, struct scan_control *sc, bool actual_reclaim) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; @@ -2197,16 +2196,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - /* * When refaults are being observed, it means a new workingset * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ + refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); if (file && actual_reclaim && lruvec->refaults != refaults) { inactive_ratio = 0; } else { @@ -2227,12 +2222,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc) + struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), - memcg, sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2332,7 +2325,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anonymous pages on the LRU in eligible zones. * Otherwise, the small LRU gets thrashed. */ - if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, false, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_ANON; @@ -2350,7 +2343,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, true, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2503,7 +2496,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, memcg, sc); + lruvec, sc); } } @@ -2570,7 +2563,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2969,12 +2962,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) unsigned long refaults; struct lruvec *lruvec; - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - lruvec = mem_cgroup_lruvec(pgdat, memcg); + refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); } @@ -3339,7 +3328,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); diff --git a/mm/vmstat.c b/mm/vmstat.c index 36b56f858f0f..a7d493366a65 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1274,13 +1274,8 @@ const char * const vmstat_text[] = { #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ |