diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/cma.c | 3 | ||||
-rw-r--r-- | mm/filemap.c | 7 | ||||
-rw-r--r-- | mm/frame_vector.c | 9 | ||||
-rw-r--r-- | mm/gup.c | 67 | ||||
-rw-r--r-- | mm/huge_memory.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 66 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 41 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 4 | ||||
-rw-r--r-- | mm/kasan/report.c | 3 | ||||
-rw-r--r-- | mm/khugepaged.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 8 | ||||
-rw-r--r-- | mm/list_lru.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 9 | ||||
-rw-r--r-- | mm/memory-failure.c | 12 | ||||
-rw-r--r-- | mm/memory.c | 16 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 29 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/mlock.c | 7 | ||||
-rw-r--r-- | mm/mprotect.c | 1 | ||||
-rw-r--r-- | mm/mremap.c | 34 | ||||
-rw-r--r-- | mm/nommu.c | 40 | ||||
-rw-r--r-- | mm/page_alloc.c | 135 | ||||
-rw-r--r-- | mm/process_vm_access.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 17 | ||||
-rw-r--r-- | mm/slab.c | 45 | ||||
-rw-r--r-- | mm/slab.h | 1 | ||||
-rw-r--r-- | mm/slab_common.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 8 | ||||
-rw-r--r-- | mm/util.c | 12 | ||||
-rw-r--r-- | mm/vmscan.c | 4 | ||||
-rw-r--r-- | mm/workingset.c | 2 |
33 files changed, 330 insertions, 280 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index be0ee11fa0d9..86e3e0e74d20 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on !KASAN + depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y @@ -385,6 +385,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); + if (bitmap_count > bitmap_maxno) + return NULL; + for (;;) { mutex_lock(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..50b52fe51937 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); @@ -1734,6 +1732,9 @@ find_page: if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iter->type & ITER_PIPE)) + goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; /* Did it get truncated before we got the lock? */ diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 381bb07ed14f..db77dcb38afd 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -11,10 +11,7 @@ * get_vaddr_frames() - map virtual addresses to pfns * @start: starting user address * @nr_frames: number of pages / pfns from start to map - * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. See description of the same argument of - get_user_pages(). + * @gup_flags: flags modifying lookup behaviour * @vec: structure which receives pages / pfns of the addresses mapped. * It should have space for at least nr_frames entries. * @@ -34,7 +31,7 @@ * This function takes care of grabbing mmap_sem as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, - bool write, bool force, struct frame_vector *vec) + unsigned int gup_flags, struct frame_vector *vec) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -59,7 +56,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, vec->got_ref = true; vec->is_pfns = false; ret = get_user_pages_locked(start, nr_frames, - write, force, (struct page **)(vec->ptrs), &locked); + gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -60,6 +60,16 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } +/* + * FOLL_FORCE can write to even unwritable pte's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +{ + return pte_write(pte) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -95,7 +105,7 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) { + if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } @@ -412,7 +422,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags &= ~FOLL_WRITE; + *flags |= FOLL_COW; return 0; } @@ -516,7 +526,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -621,7 +631,6 @@ next_page: } while (nr_pages); return i; } -EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { @@ -729,7 +738,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, struct vm_area_struct **vmas, int *locked, bool notify_drop, @@ -747,10 +755,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, if (pages) flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; pages_done = 0; lock_dropped = false; @@ -843,12 +847,12 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * up_read(&mm->mmap_sem); */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { return __get_user_pages_locked(current, current->mm, start, nr_pages, - write, force, pages, NULL, locked, true, - FOLL_TOUCH); + pages, NULL, locked, true, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); @@ -864,14 +868,14 @@ EXPORT_SYMBOL(get_user_pages_locked); */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; int locked = 1; + down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, &locked, false, gup_flags); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, NULL, + &locked, false, gup_flags); if (locked) up_read(&mm->mmap_sem); return ret; @@ -896,10 +900,10 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * "force" parameter). */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, FOLL_TOUCH); + pages, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -910,9 +914,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). + * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -941,9 +943,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. * * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual @@ -960,12 +962,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, - FOLL_TOUCH | FOLL_REMOTE); + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, false, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -976,12 +978,12 @@ EXPORT_SYMBOL(get_user_pages_remote); * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { return __get_user_pages_locked(current, current->mm, start, nr_pages, - write, force, pages, vmas, NULL, false, - FOLL_TOUCH); + pages, vmas, NULL, false, + gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -1505,7 +1507,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cdcd25cb30fe..d4a6e4001512 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,11 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd) + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; + bool force_flush = false; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || @@ -1456,6 +1457,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); + if (pmd_present(pmd) && pmd_dirty(pmd)) + force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl) && @@ -1467,6 +1470,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); if (new_ptl != old_ptl) spin_unlock(new_ptl); + if (force_flush) + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + else + *need_flush = true; spin_unlock(old_ptl); return true; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec49d9ef1eef..418bf01a50ed 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1826,11 +1826,17 @@ static void return_unused_surplus_pages(struct hstate *h, * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. + * + * vma_add_reservation is used in error paths where a reservation must + * be restored when a newly allocated huge page must be freed. It is + * to be called after calling vma_needs_reservation to determine if a + * reservation exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, + VMA_ADD_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -1856,6 +1862,14 @@ static long __vma_reservation_common(struct hstate *h, region_abort(resv, idx, idx + 1); ret = 0; break; + case VMA_ADD_RESV: + if (vma->vm_flags & VM_MAYSHARE) + ret = region_add(resv, idx, idx + 1); + else { + region_abort(resv, idx, idx + 1); + ret = region_del(resv, idx, idx + 1); + } + break; default: BUG(); } @@ -1903,6 +1917,56 @@ static void vma_end_reservation(struct hstate *h, (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } +static long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); +} + +/* + * This routine is called to restore a reservation on error paths. In the + * specific error paths, a huge page was allocated (via alloc_huge_page) + * and is about to be freed. If a reservation for the page existed, + * alloc_huge_page would have consumed the reservation and set PagePrivate + * in the newly allocated page. When the page is freed via free_huge_page, + * the global reservation count will be incremented if PagePrivate is set. + * However, free_huge_page can not adjust the reserve map. Adjust the + * reserve map here to be consistent with global reserve count adjustments + * to be made by free_huge_page. + */ +static void restore_reserve_on_error(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, + struct page *page) +{ + if (unlikely(PagePrivate(page))) { + long rc = vma_needs_reservation(h, vma, address); + + if (unlikely(rc < 0)) { + /* + * Rare out of memory condition in reserve map + * manipulation. Clear PagePrivate so that + * global reserve count will not be incremented + * by free_huge_page. This will make it appear + * as though the reservation for this page was + * consumed. This may prevent the task from + * faulting in the page at a later time. This + * is better than inconsistent global huge page + * accounting of reserve counts. + */ + ClearPagePrivate(page); + } else if (rc) { + rc = vma_add_reservation(h, vma, address); + if (unlikely(rc < 0)) + /* + * See above comment about rare out of + * memory condition. + */ + ClearPagePrivate(page); + } else + vma_end_reservation(h, vma, address); + } +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -3498,6 +3562,7 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: + restore_reserve_on_error(h, vma, address, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -3680,6 +3745,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); + restore_reserve_on_error(h, vma, address, page); put_page(page); goto out; } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 88af13c00d3c..0e9505f66ec1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -34,6 +34,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> +#include <linux/bug.h> #include "kasan.h" #include "../slab.h" @@ -62,7 +63,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } -static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; @@ -77,9 +78,24 @@ void kasan_unpoison_task_stack(struct task_struct *task) } /* Unpoison the stack for the current task beyond a watermark sp value. */ -asmlinkage void kasan_unpoison_remaining_stack(void *sp) +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, sp); + __kasan_unpoison_stack(current, watermark); +} + +/* + * Clear all poison for the region between the current SP and a provided + * watermark value, as is sometimes required prior to hand-crafted asm function + * returns in the middle of functions. + */ +void kasan_unpoison_stack_above_sp_to(const void *watermark) +{ + const void *sp = __builtin_frame_address(0); + size_t size = watermark - sp; + + if (WARN_ON(sp > watermark)) + return; + kasan_unpoison_shadow(sp, size); } /* @@ -748,6 +764,25 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); +/* Emitted by compiler to poison large objects when they go out of scope. */ +void __asan_poison_stack_memory(const void *addr, size_t size) +{ + /* + * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded + * by redzones, so we simply round up size to simplify logic. + */ + kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), + KASAN_USE_AFTER_SCOPE); +} +EXPORT_SYMBOL(__asan_poison_stack_memory); + +/* Emitted by compiler to unpoison large objects when they go into scope. */ +void __asan_unpoison_stack_memory(const void *addr, size_t size) +{ + kasan_unpoison_shadow(addr, size); +} +EXPORT_SYMBOL(__asan_unpoison_stack_memory); + #ifdef CONFIG_MEMORY_HOTPLUG static int kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e5c2181fee6f..1c260e6b3b3c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -21,6 +21,7 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_USE_AFTER_SCOPE 0xF8 /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION @@ -53,6 +54,9 @@ struct kasan_global { #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif }; /** diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 24c1211fe9d5..073325aedc68 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -90,6 +90,9 @@ static void print_error_description(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; + case KASAN_USE_AFTER_SCOPE: + bug_type = "use-after-scope"; + break; } pr_err("BUG: KASAN: %s in %pS at addr %p\n", diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 728d7790dc2d..87e1a7ca3846 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -103,6 +103,7 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +#ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -295,6 +296,7 @@ struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; +#endif /* CONFIG_SYSFS */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a5e453cf05c4..d1380ed93fdf 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1414,6 +1414,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); + scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ @@ -1453,8 +1454,11 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL); + void *stack = try_get_task_stack(p); + if (stack) { + scan_block(stack, stack + THREAD_SIZE, NULL); + put_task_stack(p); + } } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/mm/list_lru.c b/mm/list_lru.c index 1d05cb9d363d..234676e31edd 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -554,6 +554,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, err = memcg_init_list_lru(lru, memcg_aware); if (err) { kfree(lru->node); + /* Do this so a list_lru_destroy() doesn't crash: */ + lru->node = NULL; goto out; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae052b5e3315..0f870ba43942 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1917,6 +1917,15 @@ retry: current->flags & PF_EXITING)) goto force; + /* + * Prevent unbounded recursion when reclaim operations need to + * allocate memory. This might exceed the limits temporarily, + * but we prefer facilitating memory reclaim and getting back + * under the limit over triggering OOM kills in these cases. + */ + if (unlikely(current->flags & PF_MEMALLOC)) + goto force; + if (unlikely(task_in_memcg_oom(current))) goto nomem; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index de88f33519c0..19e796d36a62 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1112,10 +1112,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(p); + if (!PageAnon(p) || unlikely(split_huge_page(p))) { + unlock_page(p); + if (!PageAnon(p)) pr_err("Memory failure: %#lx: non anonymous thp\n", pfn); else @@ -1126,9 +1126,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(p); - put_hwpoison_page(hpage); + unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } diff --git a/mm/memory.c b/mm/memory.c index fc1987dfd8cc..e18c57bdc75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3869,10 +3869,11 @@ EXPORT_SYMBOL_GPL(generic_access_phys); * given task for page fault accounting. */ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, int write) + unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; + int write = gup_flags & FOLL_WRITE; down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -3882,7 +3883,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - write, 1, &page, &vma); + gup_flags, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; @@ -3934,14 +3935,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, write); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* @@ -3950,7 +3951,7 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; @@ -3959,7 +3960,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + mmput(mm); return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..cad4b9125695 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2131,7 +2117,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long section_nr = pfn_to_section_nr(pfn); @@ -2158,20 +2143,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ad1c96ac313c..0b859af06b87 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -850,7 +850,7 @@ static int lookup_node(unsigned long addr) struct page *p; int err; - err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); diff --git a/mm/mlock.c b/mm/mlock.c index 145a4258ddbc..cdbed8aaa426 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -190,10 +190,13 @@ unsigned int munlock_vma_page(struct page *page) */ spin_lock_irq(zone_lru_lock(zone)); - nr_pages = hpage_nr_pages(page); - if (!TestClearPageMlocked(page)) + if (!TestClearPageMlocked(page)) { + /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ + nr_pages = 1; goto unlock_out; + } + nr_pages = hpage_nr_pages(page); __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { diff --git a/mm/mprotect.c b/mm/mprotect.c index bcdbe62f3e6d..11936526b08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -25,7 +25,6 @@ #include <linux/perf_event.h> #include <linux/pkeys.h> #include <linux/ksm.h> -#include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> diff --git a/mm/mremap.c b/mm/mremap.c index da22ad2a5678..30d7d2482eea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) + unsigned long new_addr, bool need_rmap_locks, bool *need_flush) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + bool force_flush = false; + unsigned long len = old_end - old_addr; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma @@ -146,7 +148,19 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(*old_pte)) continue; + pte = ptep_get_and_clear(mm, old_addr, old_pte); + /* + * If we are remapping a dirty PTE, make sure + * to flush TLB before we drop the PTL for the + * old PTE or we may race with page_mkclean(). + * + * This check has to be done after we removed the + * old PTE from page tables or another thread may + * dirty it after the check and before the removal. + */ + if (pte_present(pte) && pte_dirty(pte)) + force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); @@ -156,6 +170,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); + else + *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -201,13 +219,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd); + old_end, old_pmd, new_pmd, + &need_flush); if (need_rmap_locks) drop_rmap_locks(vma); - if (moved) { - need_flush = true; + if (moved) continue; - } } split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) @@ -220,11 +237,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, extent = next - new_addr; if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr, need_rmap_locks); - need_flush = true; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, + new_pmd, new_addr, need_rmap_locks, &need_flush); } - if (likely(need_flush)) + if (need_flush) flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); diff --git a/mm/nommu.c b/mm/nommu.c index 95daf81a4855..8b8faaf2a9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -109,7 +109,7 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) @@ -160,33 +160,25 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ long get_user_pages(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(current, current->mm, start, nr_pages, flags, - pages, vmas, NULL); + return __get_user_pages(current, current->mm, start, nr_pages, + gup_flags, pages, vmas, NULL); } EXPORT_SYMBOL(get_user_pages); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, int *locked) { - return get_user_pages(start, nr_pages, write, force, pages, NULL); + return get_user_pages(start, nr_pages, gup_flags, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - unsigned int gup_flags) + struct page **pages, unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -198,10 +190,10 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) + struct page **pages, unsigned int gup_flags) { return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - write, force, pages, 0); + pages, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -1817,9 +1809,10 @@ void filemap_map_pages(struct fault_env *fe, EXPORT_SYMBOL(filemap_map_pages); static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, int write) + unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; + int write = gup_flags & FOLL_WRITE; down_read(&mm->mmap_sem); @@ -1854,21 +1847,22 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer - * @write: whether the access is a write + * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, write); + return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * - source/target buffer must be kernel space */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) { struct mm_struct *mm; @@ -1879,7 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, write); + len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..6de9440e3ae2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy __latent_entropy; +volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif @@ -3658,7 +3658,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { warn_alloc(gfp_mask, - "page alloction stalls for %ums, order:%u\n", + "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; } @@ -4224,7 +4224,7 @@ static void show_migration_types(unsigned char type) } *p = '\0'; - printk("(%s) ", tmp); + printk(KERN_CONT "(%s) ", tmp); } /* @@ -4335,7 +4335,8 @@ void show_free_areas(unsigned int filter) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); - printk("%s" + printk(KERN_CONT + "%s" " free:%lukB" " min:%lukB" " low:%lukB" @@ -4382,8 +4383,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %ld", zone->lowmem_reserve[i]); - printk("\n"); + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { @@ -4394,7 +4395,7 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); - printk("%s: ", zone->name); + printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -4412,11 +4413,12 @@ void show_free_areas(unsigned int filter) } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - printk("%lu*%lukB ", nr[order], K(1UL) << order); + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } - printk("= %lukB\n", K(total)); + printk(KERN_CONT "= %lukB\n", K(total)); } hugetlb_show_meminfo(); @@ -4977,72 +4979,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) } /* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -5304,49 +5240,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. - */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5260,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5272,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 07514d41ebcc..be8dc8d1edb9 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,12 +88,16 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); + unsigned int flags = FOLL_REMOTE; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + if (vm_write) + flags |= FOLL_WRITE; + while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); size_t bytes; @@ -104,8 +108,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ pages = __get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages, - FOLL_REMOTE); + process_pages, flags); if (pages <= 0) return -EFAULT; diff --git a/mm/shmem.c b/mm/shmem.c index ad7813d73ea7..9d32e1cb9f38 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1483,6 +1483,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); + __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1846,6 +1848,18 @@ unlock: return error; } +/* + * This is like autoremove_wake_function, but it removes the wait queue + * entry unconditionally - even if something else had already woken the + * target. + */ +static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + list_del_init(&wait->task_list); + return ret; +} + static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); @@ -1881,7 +1895,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT(shmem_fault_wait); + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && @@ -2663,6 +2677,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); spin_unlock(&inode->i_lock); error = 0; goto out; diff --git a/mm/slab.c b/mm/slab.c index 090fb26b3a39..0b0550ca85b4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -233,6 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; + parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -966,7 +967,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * guaranteed to be valid until irq is re-enabled, because it will be * freed after synchronize_sched(). */ - if (force_change) + if (old_shared && force_change) synchronize_sched(); fail: @@ -1382,24 +1383,27 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) { - active_objs += cachep->num; - active_slabs++; - } + num_slabs = n->num_slabs; list_for_each_entry(page, &n->slabs_partial, lru) { active_objs += page->active; - active_slabs++; + num_slabs_partial++; } list_for_each_entry(page, &n->slabs_free, lru) - num_slabs++; + num_slabs_free++; free_objects += n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - + (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); @@ -2314,6 +2318,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); + n->num_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2752,6 +2757,8 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) list_add_tail(&page->lru, &(n->slabs_free)); else fixup_slab_list(cachep, n, page, &list); + + n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -3443,6 +3450,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); + n->num_slabs--; } } @@ -4099,6 +4107,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) unsigned long num_objs; unsigned long active_slabs = 0; unsigned long num_slabs, free_objects = 0, shared_avail = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full = 0; const char *name; char *error = NULL; int node; @@ -4111,33 +4121,34 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) { - if (page->active != cachep->num && !error) - error = "slabs_full accounting error"; - active_objs += cachep->num; - active_slabs++; - } + num_slabs += n->num_slabs; + list_for_each_entry(page, &n->slabs_partial, lru) { if (page->active == cachep->num && !error) error = "slabs_partial accounting error"; if (!page->active && !error) error = "slabs_partial accounting error"; active_objs += page->active; - active_slabs++; + num_slabs_partial++; } + list_for_each_entry(page, &n->slabs_free, lru) { if (page->active && !error) error = "slabs_free accounting error"; - num_slabs++; + num_slabs_free++; } + free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + if (num_objs - active_objs != free_objects && !error) error = "free_objects accounting error"; diff --git a/mm/slab.h b/mm/slab.h index 9653f2e2591a..bc05fdc3edce 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -432,6 +432,7 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; + unsigned long num_slabs; unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 71f0b28a1bec..329b03843863 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -533,8 +533,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, s = create_cache(cache_name, root_cache->object_size, root_cache->size, root_cache->align, - root_cache->flags, root_cache->ctor, - memcg, root_cache); + root_cache->flags & CACHE_CREATE_MASK, + root_cache->ctor, memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/swapfile.c b/mm/swapfile.c index 2210de290b54..f30438970cd1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2224,6 +2224,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } diff --git a/mm/truncate.c b/mm/truncate.c index a01cce450a26..8d8c62d89e6d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -283,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -371,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -492,7 +492,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); /* Middle of THP: skip */ if (PageTransTail(page)) { @@ -612,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; diff --git a/mm/util.c b/mm/util.c index 662cddf914af..1a41553db866 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,8 +230,10 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) +int vma_is_stack_for_current(struct vm_area_struct *vma) { + struct task_struct * __maybe_unused t = current; + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } @@ -283,7 +285,8 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, pages, + write ? FOLL_WRITE : 0); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -623,7 +626,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen) len = buflen; - res = access_process_vm(task, arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then @@ -638,7 +641,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, - buffer+res, len, 0); + buffer+res, len, + FOLL_FORCE); res = strnlen(buffer, res); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 744f926af442..d75cdf360730 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2354,6 +2354,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc } } + cond_resched(); + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) continue; @@ -3043,7 +3045,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + current->flags |= PF_MEMALLOC; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); diff --git a/mm/workingset.c b/mm/workingset.c index 617475f529f4..fb1f9183d89a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -348,7 +348,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - if (memcg_kmem_enabled()) { + if (sc->memcg) { pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, LRU_ALL_FILE); } else { |