author      Linus Torvalds <torvalds@linux-foundation.org>   2017-08-19 02:06:33 +0300
committer   Linus Torvalds <torvalds@linux-foundation.org>   2017-08-19 02:06:33 +0300
commit      58d4e450a490d5f02183f6834c12550ba26d3b47
tree        4c1ada2d3cb98f5fb8c546a0c95fc28a3f733b83
parent      cc28fcdc017e553375c999ca12107ceb27f34ab3
parent      c715b72c1ba406f133217b509044c38d8e714a37
download    linux-58d4e450a490d5f02183f6834c12550ba26d3b47.tar.xz
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
"14 fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes
mm/vmalloc.c: don't unconditionally use __GFP_HIGHMEM
mm/mempolicy: fix use after free when calling get_mempolicy
mm/cma_debug.c: fix stack corruption due to sprintf usage
signal: don't remove SIGNAL_UNKILLABLE for traced tasks.
mm, oom: fix potential data corruption when oom_reaper races with writer
mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS
slub: fix per memcg cache leak on css offline
mm: discard memblock data later
test_kmod: fix description for -s and -c parameters
kmod: fix wait on recursive loop
wait: add wait_event_killable_timeout()
kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog
mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
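Two of the fixes listed above (the kmod recursive-loop fix and the new wait_event_killable_timeout() helper) change how __request_module() throttles concurrent modprobe requests. Below is a minimal sketch of the intended calling pattern; the wait queue, slot counter and 5-second timeout are illustrative stand-ins, not taken from the patches themselves.

#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

/* Illustrative only: a bounded pool of "slots" guarded by a wait queue. */
static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static atomic_t demo_slots = ATOMIC_INIT(4);

static int demo_grab_slot(void)
{
	long ret;

	/* Sleep killably until a slot frees up, or give up after 5 seconds. */
	ret = wait_event_killable_timeout(demo_wq,
			atomic_dec_if_positive(&demo_slots) >= 0,
			5 * HZ);
	if (!ret)			/* timed out, condition still false */
		return -ETIME;
	if (ret == -ERESTARTSYS)	/* interrupted by a fatal signal */
		return ret;
	return 0;			/* slot acquired */
}

static void demo_release_slot(void)
{
	atomic_inc(&demo_slots);
	wake_up(&demo_wq);
}

The three-way return handling (0 on timeout, -ERESTARTSYS on SIGKILL, remaining jiffies on success) is what lets __request_module() give up with -ETIME instead of blocking forever when a recursive module dependency keeps every modprobe slot busy.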
-rw-r--r-- | arch/arm64/include/asm/elf.h | 4
-rw-r--r-- | arch/powerpc/Kconfig | 2
-rw-r--r-- | arch/x86/Kconfig | 2
-rw-r--r-- | arch/x86/include/asm/elf.h | 4
-rw-r--r-- | include/linux/memblock.h | 6
-rw-r--r-- | include/linux/memcontrol.h | 10
-rw-r--r-- | include/linux/oom.h | 22
-rw-r--r-- | include/linux/wait.h | 37
-rw-r--r-- | kernel/kmod.c | 25
-rw-r--r-- | kernel/signal.c | 6
-rw-r--r-- | mm/cma_debug.c | 2
-rw-r--r-- | mm/huge_memory.c | 30
-rw-r--r-- | mm/memblock.c | 38
-rw-r--r-- | mm/memcontrol.c | 43
-rw-r--r-- | mm/memory.c | 36
-rw-r--r-- | mm/mempolicy.c | 5
-rw-r--r-- | mm/nobootmem.c | 16
-rw-r--r-- | mm/page-writeback.c | 15
-rw-r--r-- | mm/page_alloc.c | 4
-rw-r--r-- | mm/slub.c | 3
-rw-r--r-- | mm/vmalloc.c | 13
-rwxr-xr-x | tools/testing/selftests/kmod/kmod.sh | 4
22 files changed, 224 insertions(+), 103 deletions(-)
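A large share of the diff below wires the new check_stable_address_space() helper from include/linux/oom.h into the anonymous and THP fault paths, so that a fault racing with the oom reaper returns VM_FAULT_SIGBUS instead of silently installing a zero page over reaped content. A hedged sketch of that pattern follows; the surrounding function is a made-up stand-in for the real fault handlers, only check_stable_address_space() itself comes from this series.

#include <linux/mm.h>
#include <linux/oom.h>

static int demo_install_anon_pte(struct vm_fault *vmf)
{
	int ret;

	/*
	 * Under the page table lock, refuse to install a new private
	 * PTE once the oom reaper has marked the mm MMF_UNSTABLE:
	 * returning VM_FAULT_SIGBUS avoids handing userspace a zero
	 * page in place of content that has already been reaped.
	 */
	ret = check_stable_address_space(vmf->vma->vm_mm);
	if (ret)
		return ret;	/* VM_FAULT_SIGBUS */

	/* ... set_pte_at(), update_mmu_cache(), etc. ... */
	return 0;
}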
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index acae781f7359..3288c2b36731 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -114,10 +114,10 @@ /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ -#define ELF_ET_DYN_BASE 0x100000000UL +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca7..81b0031f909f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -199,7 +199,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..29a1bf85e507 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,7 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1c18d83d3f09..9aeb91935ce0 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -247,11 +247,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. 
This could be done in user space, diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77d427974f57..bae11c7e7bf3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,7 @@ extern int memblock_debug; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata +void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock @@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); -phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); @@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3914e3dd6168..9b15a4bcfa77 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -void lock_page_memcg(struct page *page); +struct mem_cgroup *lock_page_memcg(struct page *page); +void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, @@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void lock_page_memcg(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) +{ + return NULL; +} + +static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/include/linux/oom.h b/include/linux/oom.h index 8a266e2be5a6..76aac4ce39bc 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -6,6 +6,8 @@ #include <linux/types.h> #include <linux/nodemask.h> #include <uapi/linux/oom.h> +#include <linux/sched/coredump.h> /* MMF_* */ +#include <linux/mm.h> /* VM_FAULT* */ struct zonelist; struct notifier_block; @@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Checks whether a page fault on the given mm is still reliable. + * This is no longer true if the oom reaper started to reap the + * address space which is reflected by MMF_UNSTABLE flag set in + * the mm. At that moment any !shared mapping would lose the content + * and could cause a memory corruption (zero pages instead of the + * original content). + * + * User should call this before establishing a page table entry for + * a !shared mapping and under the proper page table lock. + * + * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. 
+ */ +static inline int check_stable_address_space(struct mm_struct *mm) +{ + if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + return VM_FAULT_SIGBUS; + return 0; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/include/linux/wait.h b/include/linux/wait.h index 5b74e36c0ca8..dc19880c02f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_killable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_KILLABLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_KILLABLE) until the + * @condition evaluates to true or a kill signal is received. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a kill signal. + * + * Only kill signals interrupt this process. + */ +#define wait_event_killable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_killable_timeout(wq_head, \ + condition, timeout); \ + __ret; \ +}) + #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + +/* modprobe_path is set via /proc/sys. */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) 
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 595b757bef72..c03ccbc405a0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%s", cma->name); + scnprintf(name, sizeof(name), "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f6ef0b..90731e3b7e58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> +#include <linux/oom.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } clear_huge_page(page, haddr, HPAGE_PMD_NR); @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* 
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c..bf14aea6ab70 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions == memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } } - #endif /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..e09741af816f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1611,9 +1611,13 @@ cleanup: * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. + * + * It ensures lifetime of the returned memcg. Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. 
+ */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1659,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). */ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* diff --git a/mm/memory.c b/mm/memory.c index e158f7ac6730..fe2fba27ded2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> +#include <linux/oom.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2978,7 +2987,7 @@ setpte: update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3900,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. - */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) - ret = VM_FAULT_SIGBUS; - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d911fa5cb2a7..618ab125228b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy |= (pol->flags & MPOL_MODE_FLAGS); } - if (vma) { - up_read(¤t->mm->mmap_sem); - vma = NULL; - } - err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee..3637809a18d0 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d31..bf050ab025b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. 
+ */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f746c2fd..1bad301820c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); diff --git a/mm/slub.c b/mm/slub.c index 1d3f9835f4ea..e8b4e31162ca 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) * A cache is never shut down before deactivation is * complete, so no need to worry about synchronization. */ - return; + goto out; #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); +out: kobject_put(&s->kobj); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8698c1c86c4d..a47e3894c775 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? + 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 8cecae9a8bca..7956ea3be667 100755 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -473,8 +473,8 @@ usage() echo " all Runs all tests (default)" echo " -t Run test ID the number amount of times is recommended" echo " -w Watch test ID run until it runs into an error" - echo " -c Run test ID once" - echo " -s Run test ID x test-count number of times" + echo " -s Run test ID once" + echo " -c Run test ID x test-count number of times" echo " -l List all test ID list" echo " -h|--help Help" echo |
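The memcontrol and page-writeback hunks above change lock_page_memcg() to return the pinned memcg so that a caller which may drop the last reference to the page inside the critical section can still unlock via __unlock_page_memcg(), which never dereferences the page. A condensed sketch of that calling pattern is below; the stat update is only an example of work done while the page may already be gone.

#include <linux/memcontrol.h>
#include <linux/mm.h>

static void demo_clear_writeback_stat(struct page *page)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	memcg = lock_page_memcg(page);		/* pins the memcg via RCU */
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));

	if (TestClearPageWriteback(page)) {
		/* The page may be freed from here on; use the cached lruvec. */
		dec_lruvec_state(lruvec, NR_WRITEBACK);
	}

	__unlock_page_memcg(memcg);		/* does not touch the page */
}

Looking up the lruvec before clearing PG_writeback is the key design point: writeback holds no page reference of its own, so nothing page-derived may be touched once the flag is cleared.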