49 files changed, 439 insertions, 468 deletions
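The series below adds a LOCK_MM_AND_FIND_VMA Kconfig option and a shared lock_mm_and_find_vma() helper (mm/memory.c), and converts the per-architecture page-fault handlers to it: the open-coded mmap_read_lock() + find_vma() + VM_GROWSDOWN/expand_stack() sequence becomes a single helper call, and the error paths grow a bad_area_nosemaphore label because the helper returns NULL without mmap_lock held. The sketch below shows the converted shape in the abstract; it is not taken from any one architecture, the function name is made up, and perf accounting, signal handling and OOM handling are deliberately omitted.

#include <linux/mm.h>
#include <linux/ptrace.h>
#include <linux/sched.h>

/*
 * Illustrative sketch of a post-conversion fault handler, assuming the
 * in-tree APIs shown in this diff (lock_mm_and_find_vma(), handle_mm_fault()).
 * Real handlers also do perf accounting, fault_signal_pending() checks and
 * mm_fault_error()/OOM handling, which are left out here.
 */
static void sketch_do_page_fault(struct pt_regs *regs, unsigned long address,
				 unsigned long vm_flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	vm_fault_t fault;

retry:
	/*
	 * Takes mmap_lock for reading and expands the stack if needed;
	 * on failure the lock is *not* held, hence the _nosemaphore label.
	 */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (unlikely(!vma))
		goto bad_area_nosemaphore;

	/* Access check: this path still holds mmap_lock for reading. */
	if (unlikely(!(vma->vm_flags & vm_flags)))
		goto bad_area;

	fault = handle_mm_fault(vma, address, flags, regs);

	/* The fault may be fully completed, including releasing mmap_lock. */
	if (fault & VM_FAULT_COMPLETED)
		return;

	if (fault & VM_FAULT_RETRY) {
		/* handle_mm_fault() already dropped mmap_lock for us. */
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(mm);
	return;

bad_area:
	mmap_read_unlock(mm);
bad_area_nosemaphore:
	/* deliver SIGSEGV, or do kernel exception-table fixup */
	;
}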
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a5c2b1aa46b0..d6968d090d49 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -30,6 +30,7 @@ config ALPHA select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL select HAVE_MOD_ARCH_SPECIFIC + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 7b01ae4f3bc6..8c9850437e67 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -119,20 +119,12 @@ do_page_fault(unsigned long address, unsigned long mmcsr, flags |= FAULT_FLAG_USER; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* Ok, we have a good vm_area for this memory access, so we can handle it. */ - good_area: si_code = SEGV_ACCERR; if (cause < 0) { if (!(vma->vm_flags & VM_EXEC)) @@ -192,6 +184,7 @@ retry: bad_area: mmap_read_unlock(mm); + bad_area_nosemaphore: if (user_mode(regs)) goto do_sigsegv; diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ab6d701365bb..96cf8720bb93 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -41,6 +41,7 @@ config ARC select HAVE_PERF_EVENTS select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 5ca59a482632..f59e722d147f 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -113,15 +113,9 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (unlikely(address < vma->vm_start)) { - if (!(vma->vm_flags & VM_GROWSDOWN) || expand_stack(vma, address)) - goto bad_area; - } + goto bad_area_nosemaphore; /* * vm_area is good, now check permissions for this memory access @@ -161,6 +155,7 @@ retry: bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: /* * Major/minor page fault accounting * (in case of retry we only land here once) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index cef741bb03b3..d60b73d93e03 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -127,6 +127,7 @@ config ARM select HAVE_VIRT_CPU_ACCOUNTING_GEN select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE select OF_EARLY_FLATTREE if OF diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 83598649a094..fef62e4a9edd 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -235,37 +235,11 @@ static inline bool is_permission_fault(unsigned int fsr) return false; } -static vm_fault_t __kprobes -__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, - unsigned long vma_flags, struct pt_regs *regs) -{ - struct vm_area_struct *vma = find_vma(mm, addr); - if (unlikely(!vma)) - return VM_FAULT_BADMAP; - - if (unlikely(vma->vm_start > addr)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - return VM_FAULT_BADMAP; - if (addr < FIRST_USER_ADDRESS) - return VM_FAULT_BADMAP; - if (expand_stack(vma, addr)) - return VM_FAULT_BADMAP; - } 
- - /* - * ok, we have a good vm_area for this memory access, check the - * permissions on the VMA allow for the fault which occurred. - */ - if (!(vma->vm_flags & vma_flags)) - return VM_FAULT_BADACCESS; - - return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); -} - static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; int sig, code; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; @@ -304,31 +278,21 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) - goto no_context; retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->ARM_pc)) - goto no_context; -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto bad_area; } - fault = __do_page_fault(mm, addr, flags, vm_flags, regs); + /* + * ok, we have a good vm_area for this memory access, check the + * permissions on the VMA allow for the fault which occurred. + */ + if (!(vma->vm_flags & vm_flags)) + fault = VM_FAULT_BADACCESS; + else + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_lock because @@ -359,6 +323,7 @@ retry: if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; +bad_area: /* * If we are in kernel mode at this point, we * have no context to handle this fault with. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3ae5c03ded0e..595028bd9160 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -231,6 +231,7 @@ config ARM64 select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c60100791bcc..935f0a8911f9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -497,27 +497,14 @@ static void do_bad_area(unsigned long far, unsigned long esr, #define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000) #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000) -static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr, +static vm_fault_t __do_page_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct pt_regs *regs) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (unlikely(!vma)) - return VM_FAULT_BADMAP; - /* * Ok, we have a good vm_area for this memory access, so we can handle * it. - */ - if (unlikely(vma->vm_start > addr)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - return VM_FAULT_BADMAP; - if (expand_stack(vma, addr)) - return VM_FAULT_BADMAP; - } - - /* * Check that the permissions on the VMA allow for the fault which * occurred. 
*/ @@ -631,31 +618,15 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, } lock_mmap: #endif /* CONFIG_PER_VMA_LOCK */ - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->pc)) - goto no_context; + retry: - mmap_read_lock(mm); - } else { - /* - * The above mmap_read_trylock() might have succeeded in which - * case, we'll have missed the might_sleep() from down_read(). - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && !search_exception_tables(regs->pc)) { - mmap_read_unlock(mm); - goto no_context; - } -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto done; } - fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs); + fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -674,9 +645,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif /* * Handle the "normal" (no error) case first. */ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 95f1e9bfd1c7..cf2a6fd7dff8 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -97,6 +97,7 @@ config CSKY select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU + select LOCK_MM_AND_FIND_VMA select MAY_HAVE_SPARSE_IRQ select MODULES_USE_ELF_RELA if MODULES select OF diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index e15f736cca4b..ae9781b7d92e 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -97,13 +97,12 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f BUG(); } -static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +static inline void bad_area_nosemaphore(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) { /* * Something tried to access memory that isn't in our memory map. * Fix it, but check if it's kernel or user first. */ - mmap_read_unlock(mm); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { do_trap(regs, SIGSEGV, code, addr); @@ -238,20 +237,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs) if (is_write(regs)) flags |= FAULT_FLAG_WRITE; retry: - mmap_read_lock(mm); - vma = find_vma(mm, addr); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, mm, code, addr); - return; - } - if (likely(vma->vm_start <= addr)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, mm, code, addr); - return; - } - if (unlikely(expand_stack(vma, addr))) { - bad_area(regs, mm, code, addr); + bad_area_nosemaphore(regs, mm, code, addr); return; } @@ -259,11 +247,11 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it. 
*/ -good_area: code = SEGV_ACCERR; if (unlikely(access_error(regs, vma))) { - bad_area(regs, mm, code, addr); + mmap_read_unlock(mm); + bad_area_nosemaphore(regs, mm, code, addr); return; } diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 54eadf265178..6726f4941015 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -28,6 +28,7 @@ config HEXAGON select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES select ARCH_WANT_LD_ORPHAN_WARN diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 4b578d02fd01..7295ea3f8cc8 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -57,21 +57,10 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto bad_area; + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) + goto bad_area_nosemaphore; - if (vma->vm_start <= address) - goto good_area; - - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - - if (expand_stack(vma, address)) - goto bad_area; - -good_area: /* Address space is OK. Now check access rights. */ si_code = SEGV_ACCERR; @@ -143,6 +132,7 @@ good_area: bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 85c4d9ac8686..5458b52b4009 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -110,10 +110,12 @@ retry: * register backing store that needs to expand upwards, in * this case vma will be null, but prev_vma will ne non-null */ - if (( !vma && prev_vma ) || (address < vma->vm_start) ) - goto check_expansion; + if (( !vma && prev_vma ) || (address < vma->vm_start) ) { + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } - good_area: code = SEGV_ACCERR; /* OK, we've got a good vm_area for this memory area. Check the access permissions: */ @@ -177,35 +179,9 @@ retry: mmap_read_unlock(mm); return; - check_expansion: - if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { - if (!vma) - goto bad_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; - } else { - vma = prev_vma; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - /* - * Since the register backing store is accessed sequentially, - * we disallow growing it by more than a page at a time. 
- */ - if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) - goto bad_area; - if (expand_upwards(vma, address)) - goto bad_area; - } - goto good_area; - bad_area: mmap_read_unlock(mm); + bad_area_nosemaphore: if ((isr & IA64_ISR_SP) || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index cbab4f9ca15c..32b9da4622ce 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -131,6 +131,7 @@ config LOONGARCH select HAVE_VIRT_CPU_ACCOUNTING_GEN if !SMP select IRQ_FORCED_THREADING select IRQ_LOONGARCH_CPU + select LOCK_MM_AND_FIND_VMA select MMU_GATHER_MERGE_VMAS if MMU select MODULES_USE_ELF_RELA if MODULES select NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index 449087bd589d..da5b6d518cdb 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -169,22 +169,18 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (!expand_stack(vma, address)) - goto good_area; + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) + goto bad_area_nosemaphore; + goto good_area; + /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: do_sigsegv(regs, write, address, si_code); return; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 228128e45c67..c290c5c0cfb9 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -105,8 +105,9 @@ retry: if (address + 256 < rdusp()) goto map_err; } - if (expand_stack(vma, address)) - goto map_err; + vma = expand_stack(mm, address); + if (!vma) + goto map_err_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so @@ -196,10 +197,12 @@ bus_err: goto send_sig; map_err: + mmap_read_unlock(mm); +map_err_nosemaphore: current->thread.signo = SIGSEGV; current->thread.code = SEGV_MAPERR; current->thread.faddr = address; - goto send_sig; + return send_fault_sig(regs); acc_err: current->thread.signo = SIGSEGV; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 687714db6f4d..d3c3c33b73a6 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -192,8 +192,9 @@ retry: && (kernel_mode(regs) || !store_updates_sp(regs))) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; good_area: code = SEGV_ACCERR; diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ada18f3be229..d49d5fc40021 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -92,6 +92,7 @@ config MIPS select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP select IRQ_FORCED_THREADING select ISA if EISA + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT select PERF_USE_VMALLOC diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index a27045f5a556..d7878208bd3f 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -99,21 +99,13 @@ static void __do_page_fault(struct pt_regs *regs, unsigned long write, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, 
address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: si_code = SEGV_ACCERR; if (write) { diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index e5936417d3cd..d54464021a61 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,6 +16,7 @@ config NIOS2 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_KGDB select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index ca64eccea551..e3fa9c15181d 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -86,27 +86,14 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->ea)) - goto bad_area_nosemaphore; retry: - mmap_read_lock(mm); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: code = SEGV_ACCERR; switch (cause) { diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 6734fee3134f..a9dcd4381d1a 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -127,8 +127,9 @@ retry: if (address + PAGE_SIZE < regs->sp) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 6941fdbf2517..6e894afa4249 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, retry: mmap_read_lock(mm); vma = find_vma_prev(mm, address, &prev_vma); - if (!vma || address < vma->vm_start) - goto check_expansion; + if (!vma || address < vma->vm_start) { + if (!prev || !(prev->vm_flags & VM_GROWSUP)) + goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } + /* * Ok, we have a good vm_area for this memory access. We still need to * check the access permissions. */ -good_area: - if ((vma->vm_flags & acc_type) != acc_type) goto bad_area; @@ -347,17 +351,13 @@ good_area: mmap_read_unlock(mm); return; -check_expansion: - vma = prev_vma; - if (vma && (expand_stack(vma, address) == 0)) - goto good_area; - /* * Something tried to access memory that isn't in our memory map.. 
*/ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { int signo, si_code; @@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs) { unsigned long insn = regs->iir; int breg, treg, xreg, val = 0; - struct vm_area_struct *vma, *prev_vma; + struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; unsigned long address; @@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs) /* Search for VMA */ address = regs->ior; mmap_read_lock(mm); - vma = find_vma_prev(mm, address, &prev_vma); + vma = vma_lookup(mm, address); mmap_read_unlock(mm); /* @@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs) */ acc_type = (insn & 0x40) ? VM_WRITE : VM_READ; if (vma - && address >= vma->vm_start && (vma->vm_flags & acc_type) == acc_type) val = 1; } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8eb6c5e9e4f8..395044b705a1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -277,6 +277,7 @@ config PPC select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN && MODULES + select LOCK_MM_AND_FIND_VMA select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 7c507fb48182..f49fd873df8d 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -33,19 +33,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (mm->pgd == NULL) return -EFAULT; - mmap_read_lock(mm); - ret = -EFAULT; - vma = find_vma(mm, ea); + vma = lock_mm_and_find_vma(mm, ea, NULL); if (!vma) - goto out_unlock; - - if (ea < vma->vm_start) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_unlock; - if (expand_stack(vma, ea)) - goto out_unlock; - } + return -EFAULT; + ret = -EFAULT; is_write = dsisr & DSISR_ISSTORE; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 531177a4ee08..5bfdf6ecfa96 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -84,11 +84,6 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) return __bad_area_nosemaphore(regs, address, si_code); } -static noinline int bad_area(struct pt_regs *regs, unsigned long address) -{ - return __bad_area(regs, address, SEGV_MAPERR); -} - static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma) { @@ -515,40 +510,12 @@ lock_mmap: * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * exceptions table. lock_mm_and_find_vma() handles that logic. 
*/ - if (unlikely(!mmap_read_trylock(mm))) { - if (!is_user && !search_exception_tables(regs->nip)) - return bad_area_nosemaphore(regs, address); - retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) - return bad_area(regs, address); - - if (unlikely(vma->vm_start > address)) { - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - return bad_area(regs, address); - - if (unlikely(expand_stack(vma, address))) - return bad_area(regs, address); - } + return bad_area_nosemaphore(regs, address); if (unlikely(access_pkey_error(is_write, is_exec, (error_code & DSISR_KEYFAULT), vma))) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c69572fbe613..a08917f681af 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -127,6 +127,7 @@ config RISCV select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA if MODULES select MODULE_SECTIONS if MODULES select OF diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8685f85a7474..35a84ec69a9f 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -84,13 +84,13 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f BUG(); } -static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, unsigned long addr) +static inline void +bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) { /* * Something tried to access memory that isn't in our memory map. * Fix it, but check if it's kernel or user first. */ - mmap_read_unlock(mm); /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { do_trap(regs, SIGSEGV, code, addr); @@ -100,6 +100,15 @@ static inline void bad_area(struct pt_regs *regs, struct mm_struct *mm, int code no_context(regs, addr); } +static inline void +bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, + unsigned long addr) +{ + mmap_read_unlock(mm); + + bad_area_nosemaphore(regs, code, addr); +} + static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) { pgd_t *pgd, *pgd_k; @@ -287,23 +296,10 @@ void handle_page_fault(struct pt_regs *regs) else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; retry: - mmap_read_lock(mm); - vma = find_vma(mm, addr); + vma = lock_mm_and_find_vma(mm, addr, regs); if (unlikely(!vma)) { tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); - return; - } - if (likely(vma->vm_start <= addr)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); - return; - } - if (unlikely(expand_stack(vma, addr))) { - tsk->thread.bad_cause = cause; - bad_area(regs, mm, code, addr); + bad_area_nosemaphore(regs, code, addr); return; } @@ -311,7 +307,6 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it. 
*/ -good_area: code = SEGV_ACCERR; if (unlikely(access_error(cause, vma))) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index b65144c392b0..dbe8394234e2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -457,8 +457,9 @@ retry: if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto out_up; - if (expand_stack(vma, address)) - goto out_up; + vma = expand_stack(mm, address); + if (!vma) + goto out; } /* diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e339745f62a1..2b3ce4fd3956 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -60,6 +60,7 @@ config SUPERH select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select NEED_SG_DMA_LENGTH select NO_DMA if !MMU && !DMA_COHERENT diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index acd2f5e50bfc..06e6b4952924 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -439,21 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, } retry: - mmap_read_lock(mm); - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; - } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); - return; - } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -461,7 +449,6 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 6197b87f2b3b..49849790e66d 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -58,6 +58,7 @@ config SPARC32 select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 select HAVE_UID16 + select LOCK_MM_AND_FIND_VMA select OLD_SIGACTION select ZONE_DMA diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 179295b14664..a3ccc0267bc2 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -143,28 +143,19 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, if (pagefault_disabled() || !mm) goto no_context; + if (!from_user && address >= PAGE_OFFSET) + goto no_context; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - - if (!from_user && address >= PAGE_OFFSET) - goto bad_area; - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/ -good_area: code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) @@ -321,17 +312,9 @@ static void force_user_fault(unsigned long address, int write) code = SEGV_MAPERR; - mmap_read_lock(mm); - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; -good_area: + goto bad_area_nosemaphore; code = SEGV_ACCERR; if (write) { if (!(vma->vm_flags & VM_WRITE)) @@ -350,6 +333,7 @@ good_area: return; bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: __do_fault_siginfo(code, SIGSEGV, tsk->thread.kregs, address); return; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index d8a407fbe350..e326caf708c6 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -386,8 +386,9 @@ continue_fault: goto bad_area; } } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. @@ -490,8 +491,9 @@ exit_exception: * Fix it, but check if it's kernel or user first.. */ bad_area: - insn = get_fault_insn(regs, insn); mmap_read_unlock(mm); +bad_area_nosemaphore: + insn = get_fault_insn(regs, insn); handle_kernel_fault: do_kernel_fault(regs, si_code, fault_code, insn, address); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index d3ce21c4ca32..6d8ae86ae978 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -47,14 +47,15 @@ retry: vma = find_vma(mm, address); if (!vma) goto out; - else if (vma->vm_start <= address) + if (vma->vm_start <= address) goto good_area; - else if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - else if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - else if (expand_stack(vma, address)) + if (is_user && !ARCH_IS_STACKGROW(address)) goto out; + vma = expand_stack(mm, address); + if (!vma) + goto out_nosemaphore; good_area: *code_out = SEGV_ACCERR; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d5c69145a73c..b3ebf58cf77e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,6 +279,7 @@ config X86 select HOTPLUG_SMT if SMP select HOTPLUG_SPLIT_STARTUP if SMP && X86_32 select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4399983c50c..e8711b2cafaf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } -static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) -{ - __bad_area(regs, error_code, address, 0, SEGV_MAPERR); -} - static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { @@ -1366,51 +1360,10 @@ void do_user_addr_fault(struct pt_regs *regs, lock_mmap: #endif /* CONFIG_PER_VMA_LOCK */ - /* - * Kernel-mode access to the user address space should only occur - * on well-defined single instructions listed in the exception - * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_lock might deadlock attempting - * to validate the fault against the address space. 
- * - * Only do the expensive exception table search when we might be at - * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_lock, and - * 2. The access did not originate in userspace. - */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!user_mode(regs) && !search_exception_tables(regs->ip)) { - /* - * Fault from code in kernel from - * which we do not expect faults. - */ - bad_area_nosemaphore(regs, error_code, address); - return; - } retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); - } - - vma = find_vma(mm, address); + vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; - } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); - return; - } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1418,7 +1371,6 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index c1bcfc2a8581..2a51a466779f 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -49,6 +49,7 @@ config XTENSA select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_DOMAIN + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC select TRACE_IRQFLAGS_SUPPORT diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index faf7cf35a0ee..d1eb8d6c5b82 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -130,23 +130,14 @@ void do_page_fault(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - + vma = lock_mm_and_find_vma(mm, address, regs); if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; + goto bad_area_nosemaphore; /* Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/ -good_area: code = SEGV_ACCERR; if (is_write) { @@ -205,6 +196,7 @@ good_area: */ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { force_sig_fault(SIGSEGV, code, (void *) address); return; diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 864e4ffb6aa9..261352a23271 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -485,8 +485,8 @@ static void do_fault(struct work_struct *work) flags |= FAULT_FLAG_REMOTE; mmap_read_lock(mm); - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) + vma = vma_lookup(mm, address); + if (!vma) /* failed to get a vma in the right range */ goto out; diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 9821bc44f5ac..3ebd4b6586b3 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) mmap_read_lock(mm); - vma = find_extend_vma(mm, prm->addr); + vma = vma_lookup(mm, prm->addr); if (!vma) /* Unmapped area */ goto out_put_mm; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 983ce34115d5..7b3d2d491407 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - if (mmap_read_lock_killable(mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_extend_vma(mm, bprm->p); - mmap_read_unlock(mm); + vma = find_extend_vma_locked(mm, bprm->p); + mmap_write_unlock(mm); if (!vma) return -EFAULT; diff --git a/fs/exec.c b/fs/exec.c index 25c65b64544b..80b8611c416d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -200,33 +200,39 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; + struct vm_area_struct *vma = bprm->vma; + struct mm_struct *mm = bprm->mm; int ret; - unsigned int gup_flags = 0; -#ifdef CONFIG_STACK_GROWSUP - if (write) { - ret = expand_downwards(bprm->vma, pos); - if (ret < 0) + /* + * Avoid relying on expanding the stack down in GUP (which + * does not work for STACK_GROWSUP anyway), and just do it + * by hand ahead of time. + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); + ret = expand_downwards(vma, pos); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); return NULL; - } -#endif - - if (write) - gup_flags |= FOLL_WRITE; + } + mmap_write_downgrade(mm); + } else + mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process - * doing the exec and bprm->mm is the new process's mm. + * doing the exec and 'mm' is the new process's mm. */ - mmap_read_lock(bprm->mm); - ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, + ret = get_user_pages_remote(mm, pos, 1, + write ? 
FOLL_WRITE : 0, &page, NULL); - mmap_read_unlock(bprm->mm); + mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) - acct_arg_size(bprm, vma_pages(bprm->vma)); + acct_arg_size(bprm, vma_pages(vma)); return page; } @@ -853,7 +859,7 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; - ret = expand_stack(vma, stack_base); + ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/include/linux/mm.h b/include/linux/mm.h index eef34f6a0351..39aa409e84d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2334,6 +2334,8 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long address, struct pt_regs *regs); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, @@ -3228,16 +3230,11 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); +struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -extern int expand_downwards(struct vm_area_struct *vma, - unsigned long address); -#if VM_GROWSUP -extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); -#else - #define expand_upwards(vma, address) (0) -#endif +int expand_downwards(struct vm_area_struct *vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); @@ -3332,7 +3329,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, + unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/Kconfig b/mm/Kconfig index 12f32f8d26bf..e00df648c91d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1222,6 +1222,10 @@ config PER_VMA_LOCK This feature allows locking each virtual memory area separately when handling page faults instead of taking mmap_lock. 
+config LOCK_MM_AND_FIND_VMA + bool + depends on !STACK_GROWSUP + source "mm/damon/Kconfig" endmenu @@ -1168,7 +1168,11 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { - vma = find_extend_vma(mm, start); + vma = find_vma(mm, start); + if (vma && (start < vma->vm_start)) { + WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); + vma = NULL; + } if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, @@ -1333,9 +1337,13 @@ int fixup_user_fault(struct mm_struct *mm, fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) + vma = find_vma(mm, address); + if (!vma) + return -EFAULT; + if (address < vma->vm_start ) { + WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN); return -EFAULT; + } if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; diff --git a/mm/memory.c b/mm/memory.c index 58029fd5adda..d8a9a770b1f1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5245,6 +5245,125 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA +#include <linux/extable.h> + +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + /* Even if this succeeds, make it clear we *might* have slept */ + if (likely(mmap_read_trylock(mm))) { + might_sleep(); + return true; + } + + if (regs && !user_mode(regs)) { + unsigned long ip = instruction_pointer(regs); + if (!search_exception_tables(ip)) + return false; + } + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = instruction_pointer(regs); + if (!search_exception_tables(ip)) + return false; + } + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. 
+ */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif + #ifdef CONFIG_PER_VMA_LOCK /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be @@ -5584,25 +5703,32 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, gup_flags, &vma); if (IS_ERR_OR_NULL(page)) { -#ifndef CONFIG_HAVE_IOREMAP_PROT - break; -#else - int res = 0; + /* We might need to expand the stack to access it */ + vma = vma_lookup(mm, addr); + if (!vma) { + vma = expand_stack(mm, addr); + + /* mmap_lock was dropped on failure */ + if (!vma) + return buf - old_buf; + + /* Try again if stack expansion worked */ + continue; + } + /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ - vma = vma_lookup(mm, addr); - if (!vma) - break; + bytes = 0; +#ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) - res = vma->vm_ops->access(vma, addr, buf, - len, write); - if (res <= 0) - break; - bytes = res; + bytes = vma->vm_ops->access(vma, addr, buf, + len, write); #endif + if (bytes <= 0) + break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); diff --git a/mm/mmap.c b/mm/mmap.c index 8f1000bc35df..9b5188b65800 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1948,7 +1948,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -int expand_upwards(struct vm_area_struct *vma, unsigned long address) +static int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; @@ -2040,6 +2040,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { @@ -2048,16 +2049,20 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *prev; int error = 0; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + address &= PAGE_MASK; - if (address < mmap_min_addr) + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? 
*/ - if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev)) { - if (address - prev->vm_end < stack_guard_gap) + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } @@ -2137,13 +2142,12 @@ static int __init cmdline_parse_stack_guard_gap(char *p) __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; @@ -2151,20 +2155,23 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (!prev) + return NULL; + if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; return expand_downwards(vma, address); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; @@ -2175,10 +2182,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return NULL; if (vma->vm_start <= addr) return vma; - if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; start = vma->vm_start; - if (expand_stack(vma, addr)) + if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); @@ -2186,7 +2191,91 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) } #endif -EXPORT_SYMBOL_GPL(find_extend_vma); +/* + * IA64 has some horrid mapping rules: it can expand both up and down, + * but with various special rules. + * + * We'll get rid of this architecture eventually, so the ugliness is + * temporary. + */ +#ifdef CONFIG_IA64 +static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) +{ + return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && + REGION_OFFSET(addr) < RGN_MAP_LIMIT; +} + +/* + * IA64 stacks grow down, but there's a special register backing store + * that can grow up. Only sequentially, though, so the new address must + * match vm_end. 
+ */ +static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) +{ + if (!vma_expand_ok(vma, addr)) + return -EFAULT; + if (vma->vm_end != (addr & PAGE_MASK)) + return -EFAULT; + return expand_upwards(vma, addr); +} + +static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) +{ + if (!vma_expand_ok(vma, addr)) + return -EFAULT; + return expand_downwards(vma, addr); +} + +#elif defined(CONFIG_STACK_GROWSUP) + +#define vma_expand_up(vma,addr) expand_upwards(vma, addr) +#define vma_expand_down(vma, addr) (-EFAULT) + +#else + +#define vma_expand_up(vma,addr) (-EFAULT) +#define vma_expand_down(vma, addr) expand_downwards(vma, addr) + +#endif + +/* + * expand_stack(): legacy interface for page faulting. Don't use unless + * you have to. + * + * This is called with the mm locked for reading, drops the lock, takes + * the lock for writing, tries to look up a vma again, expands it if + * necessary, and downgrades the lock to reading again. + * + * If no vma is found or it can't be expanded, it returns NULL and has + * dropped the lock. + */ +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) + return NULL; + + vma = find_vma_prev(mm, addr, &prev); + if (vma && vma->vm_start <= addr) + goto success; + + if (prev && !vma_expand_up(prev, addr)) { + vma = prev; + goto success; + } + + if (vma && !vma_expand_down(vma, addr)) + goto success; + + mmap_write_unlock(mm); + return NULL; + +success: + mmap_write_downgrade(mm); + return vma; +} /* * Ok - we have the memory areas we should free on a maple tree so release them, diff --git a/mm/nommu.c b/mm/nommu.c index f670d9979a26..37d0b03143f1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -631,23 +631,20 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions - */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - return find_vma(mm, addr); -} - -/* * expand a stack to a given address * - not supported under NOMMU conditions */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) { return -ENOMEM; } +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_unlock(mm); + return NULL; +} + /* * look up the first VMA exactly that exactly matches addr * - should be called with mm->mmap_lock at least held readlocked |
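The callers that keep an explicit expand_stack() call (m68k, microblaze, openrisc, sparc64, s390, um, and __access_remote_vm() above) do so because they want extra, architecture-specific checks before growing the stack. The function's contract changes, though: it now takes the mm rather than a vma, performs the expansion with mmap_lock held for writing internally, and either returns the vma with the lock downgraded back to reading or returns NULL with the lock dropped. A hedged sketch of that calling convention follows; the wrapper name and the placement of the arch checks are illustrative, not from the patch.

#include <linux/mm.h>

/*
 * Sketch only: how a fault path that still needs its own pre-checks uses
 * the new expand_stack(mm, addr).  On success the returned vma comes with
 * mmap_lock held for reading; on failure NULL is returned and the lock
 * has already been dropped, so the caller must not unlock it again.
 */
static struct vm_area_struct *fault_vma_lookup(struct mm_struct *mm,
					       unsigned long address)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, address);
	if (!vma) {
		mmap_read_unlock(mm);
		return NULL;
	}
	if (vma->vm_start <= address)
		return vma;	/* already inside an existing mapping */

	/*
	 * Architecture-specific "does this look like a stack access?"
	 * checks would sit here, as in the m68k and openrisc hunks above.
	 *
	 * expand_stack() drops the read lock, retakes mmap_lock for
	 * writing, re-looks the vma up, expands it if permitted, and
	 * downgrades back to a read lock on success.
	 */
	return expand_stack(mm, address);
}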