From 89f499f35c11af61ba7075ddc23209d10805a25a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:14 -0400 Subject: maple_tree: add format option to mt_dump() Allow different formatting strings to be used when dumping the tree. Currently supports hex and decimal. Link: https://lkml.kernel.org/r/20230518145544.1722059-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..04bcf3b3c720 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -301,7 +301,7 @@ out: #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt, enum mt_dump_format fmt); /* Validate the maple tree */ static void validate_mm_mt(struct mm_struct *mm) @@ -323,18 +323,18 @@ static void validate_mm_mt(struct mm_struct *mm) pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); if (vma_mt->vm_end != mas.last + 1) { pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", mm, vma_mt->vm_start, vma_mt->vm_end, mas.index, mas.last); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); } VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); if (vma_mt->vm_start != mas.index) { pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); } VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } -- cgit v1.2.3 From b50e195ff436625b26dcc9839bc52cc7c5bf1a54 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:26 -0400 Subject: mm: update validate_mm() to use vma iterator Use the vma iterator in the validation code and combine the code to check the maple tree into the main validate_mm() function. Introduce a new function vma_iter_dump_tree() to dump the maple tree in hex layout. Replace all calls to validate_mm_mt() with validate_mm(). [Liam.Howlett@oracle.com: update validate_mm() to use vma iterator CONFIG flag] Link: https://lkml.kernel.org/r/20230606183538.588190-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230518145544.1722059-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 14 ++++++++ mm/debug.c | 9 +++++ mm/internal.h | 3 +- mm/mmap.c | 94 +++++++++++++++++++------------------------------ 4 files changed, 61 insertions(+), 59 deletions(-) (limited to 'mm/mmap.c') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index b8728d11c949..7c3e7b0b0e8f 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -8,10 +8,12 @@ struct page; struct vm_area_struct; struct mm_struct; +struct vma_iterator; void dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) @@ -74,6 +76,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_MM(cond, mm) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_mm(mm); \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -90,6 +103,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif diff --git a/mm/debug.c b/mm/debug.c index c7b228097bd9..ee533a5ceb79 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } + +void vma_iter_dump_tree(const struct vma_iterator *vmi) +{ +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mas_dump(&vmi->mas); + mt_dump(vmi->mas.tree, mt_dump_hex); +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +} + #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/internal.h b/mm/internal.h index 692498a84fde..41cc5e6225fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1064,13 +1064,14 @@ static inline void vma_iter_store(struct vma_iterator *vmi, printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree, mt_dump_hex); + vma_iter_dump_tree(vmi); } if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); mt_dump(vmi->mas.tree, mt_dump_hex); + vma_iter_dump_tree(vmi); } #endif diff --git a/mm/mmap.c b/mm/mmap.c index 04bcf3b3c720..8f67d80c6dde 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -300,61 +300,40 @@ out: } #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt, enum mt_dump_format fmt); - -/* Validate the maple tree */ -static void validate_mm_mt(struct mm_struct *mm) -{ - struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt; - - MA_STATE(mas, mt, 0, 
0); - - mt_validate(&mm->mm_mt); - mas_for_each(&mas, vma_mt, ULONG_MAX) { - if ((vma_mt->vm_start != mas.index) || - (vma_mt->vm_end - 1 != mas.last)) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - dump_vma(vma_mt); - pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, - mas.index, mas.last); - pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, - vma_mt->vm_start, vma_mt->vm_end); - - mt_dump(mas.tree, mt_dump_hex); - if (vma_mt->vm_end != mas.last + 1) { - pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", - mm, vma_mt->vm_start, vma_mt->vm_end, - mas.index, mas.last); - mt_dump(mas.tree, mt_dump_hex); - } - VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); - if (vma_mt->vm_start != mas.index) { - pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", - mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree, mt_dump_hex); - } - VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); - } - } -} - static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, 0, 0); - - validate_mm_mt(mm); + VMA_ITERATOR(vmi, mm, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + mt_validate(&mm->mm_mt); + for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + + if (warn) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } + +#ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) @@ -365,14 +344,13 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ @@ -2234,7 +2212,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2292,7 +2270,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2301,7 +2279,7 @@ out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return err; } @@ -2936,7 +2914,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, arch_unmap(mm, start, end); ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); - validate_mm_mt(mm); + validate_mm(mm); return ret; } @@ -2958,7 +2936,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm_mt(mm); + validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. 
@@ -3199,7 +3177,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm_mt(mm); + validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3258,7 +3236,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm_mt(mm); + validate_mm(mm); return new_vma; out_vma_link: @@ -3274,7 +3252,7 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: - validate_mm_mt(mm); + validate_mm(mm); return NULL; } @@ -3411,7 +3389,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm_mt(mm); + validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3434,12 +3412,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm_mt(mm); + validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm_mt(mm); + validate_mm(mm); return ERR_PTR(ret); } -- cgit v1.2.3 From 15c0c60b8cee4a6db263585520f994654429fac3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:31 -0400 Subject: mm/mmap: change do_vmi_align_munmap() for maple tree iterator changes The maple tree iterator clean up is incompatible with the way do_vmi_align_munmap() expects it to behave. Update the expected behaviour to map now since the change will work currently. Link: https://lkml.kernel.org/r/20230518145544.1722059-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 8f67d80c6dde..19d3c843be0c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2388,7 +2388,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } - next = vma_next(vmi); + if (vma_iter_end(vmi) > end) + next = vma_iter_load(vmi); + + if (!next) + next = vma_next(vmi); + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas -- cgit v1.2.3 From 5c1c03de1b1636f041a275e777171307cac8d958 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:44 -0400 Subject: mm: avoid rewalk in mmap_region If the iterator has moved to the previous entry, then step forward one range, back to the gap. Link: https://lkml.kernel.org/r/20230518145544.1722059-36-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 19d3c843be0c..44be7fdfaac9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2606,6 +2606,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } cannot_expand: + if (prev) + vma_iter_next_range(&vmi); + /* * Determine the object being mapped and call the appropriate * specific mapper. 
the address has already been validated, but -- cgit v1.2.3 From 3c54a298db4c6b38bbd5f86216ce0f5ad4596ccf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 22 May 2023 09:24:12 +0100 Subject: mm/mmap: refactor mlock_future_check() In all but one instance, mlock_future_check() is treated as a boolean function despite returning an error code. In one instance, this error code is ignored and replaced with -ENOMEM. This is confusing, and the inversion of true -> failure, false -> success is not warranted. Convert the function to a bool, lightly refactor and return true if the check passes, false if not. Link: https://lkml.kernel.org/r/20230522082412.56685-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Liam Howlett Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/internal.h | 4 ++-- mm/mmap.c | 33 +++++++++++++++++---------------- mm/mremap.c | 2 +- mm/secretmem.c | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/internal.h b/mm/internal.h index bb6542279599..66dd214b302a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -576,8 +576,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len); +extern bool mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long bytes); /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, diff --git a/mm/mmap.c b/mm/mmap.c index 44be7fdfaac9..28d2c489a7e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len) if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; - return mlock_future_check(current->mm, current->mm->def_flags, len); + return mlock_future_check(current->mm, current->mm->def_flags, len) + ? 0 : -EAGAIN; } static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); @@ -1145,21 +1146,21 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len) +bool mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long bytes) { - unsigned long locked, lock_limit; + unsigned long locked_pages, limit_pages; - /* mlock MCL_FUTURE? 
*/ - if (flags & VM_LOCKED) { - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } - return 0; + if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + return true; + + locked_pages = bytes >> PAGE_SHIFT; + locked_pages += mm->locked_vm; + + limit_pages = rlimit(RLIMIT_MEMLOCK); + limit_pages >>= PAGE_SHIFT; + + return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) @@ -1271,7 +1272,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - if (mlock_future_check(mm, vm_flags, len)) + if (!mlock_future_check(mm, vm_flags, len)) return -EAGAIN; if (file) { @@ -1889,7 +1890,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) + if (!mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ diff --git a/mm/mremap.c b/mm/mremap.c index b11ce6c92099..bcfcb8df5875 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -775,7 +775,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return ERR_PTR(-EFAULT); - if (mlock_future_check(mm, vma->vm_flags, new_len - old_len)) + if (!mlock_future_check(mm, vma->vm_flags, new_len - old_len)) return ERR_PTR(-EAGAIN); if (!may_expand_vm(mm, vma->vm_flags, diff --git a/mm/secretmem.c b/mm/secretmem.c index 974b32ba8b9d..58d2af12df4f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -125,7 +125,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); -- cgit v1.2.3 From b0cc5e89caadc21c2d8f7dc4c97947761b358876 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 22 May 2023 13:52:10 -0700 Subject: mm/mlock: rename mlock_future_check() to mlock_future_ok() It is felt that the name mlock_future_check() is vague - it doesn't particularly convey the function's operation. mlock_future_ok() is a clearer name for a predicate function. 
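To make the new calling convention concrete, here is a minimal, self-contained userspace sketch of the predicate shape the two patches above give this check; it is an illustration, not kernel code. It mirrors the refactored logic: return true immediately when VM_LOCKED is not requested (or the caller is allowed to bypass the limit), otherwise compare the would-be locked page count against RLIMIT_MEMLOCK. The VM_LOCKED bit value, the capable() stub and the use of sysconf()/getrlimit() are assumptions made for the example only.

/*
 * Userspace illustration of the mlock_future_ok() predicate shape.
 * Assumptions for the example only: the VM_LOCKED bit value, a stubbed
 * capable(CAP_IPC_LOCK), and sysconf()/getrlimit() standing in for the
 * kernel's PAGE_SHIFT and rlimit() helpers.
 */
#include <stdbool.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

#define VM_LOCKED 0x00002000UL  /* illustrative flag bit, not the kernel header value */

static bool capable_ipc_lock(void)
{
        return false;           /* stub: pretend the caller is unprivileged */
}

/*
 * Return true if locking @bytes more, on top of @already_locked_pages,
 * stays within RLIMIT_MEMLOCK; false otherwise.
 */
static bool mlock_future_ok(unsigned long flags, unsigned long bytes,
                            unsigned long already_locked_pages)
{
        unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);
        unsigned long locked_pages, limit_pages;
        struct rlimit rlim;

        /* Nothing to check: not mlocked, or the caller may exceed the limit. */
        if (!(flags & VM_LOCKED) || capable_ipc_lock())
                return true;

        locked_pages = bytes / page_size + already_locked_pages;

        if (getrlimit(RLIMIT_MEMLOCK, &rlim) != 0)
                return false;
        if (rlim.rlim_cur == RLIM_INFINITY)
                return true;
        limit_pages = (unsigned long)(rlim.rlim_cur / page_size);

        return locked_pages <= limit_pages;
}

int main(void)
{
        unsigned long len = 1UL << 20;  /* ask to "lock" 1 MiB */

        if (!mlock_future_ok(VM_LOCKED, len, 0))
                printf("would fail with -EAGAIN\n");
        else
                printf("within RLIMIT_MEMLOCK\n");
        return 0;
}

Call sites then read as a plain predicate, e.g. "if (!mlock_future_ok(flags, len, locked)) return -EAGAIN;", which is the inversion applied in do_mmap(), acct_stack_growth(), mremap and secretmem in the diffs above.
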
Acked-by: Vlastimil Babka Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/mmap.c | 8 ++++---- mm/mremap.c | 2 +- mm/secretmem.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/internal.h b/mm/internal.h index 66dd214b302a..f45f5eb4514f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -576,7 +576,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_check(struct mm_struct *mm, unsigned long flags, +extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * mlock_vma_folio() and munlock_vma_folio(): diff --git a/mm/mmap.c b/mm/mmap.c index 28d2c489a7e5..e1624cb2c04e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; - return mlock_future_check(current->mm, current->mm->def_flags, len) + return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, @@ -1146,7 +1146,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_check(struct mm_struct *mm, unsigned long flags, +bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; @@ -1272,7 +1272,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - if (!mlock_future_check(mm, vm_flags, len)) + if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { @@ -1890,7 +1890,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ diff --git a/mm/mremap.c b/mm/mremap.c index bcfcb8df5875..da107f2c71bf 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -775,7 +775,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return ERR_PTR(-EFAULT); - if (!mlock_future_check(mm, vma->vm_flags, new_len - old_len)) + if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len)) return ERR_PTR(-EAGAIN); if (!may_expand_vm(mm, vma->vm_flags, diff --git a/mm/secretmem.c b/mm/secretmem.c index 58d2af12df4f..86442a15d12f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -125,7 +125,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (!mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); -- cgit v1.2.3 From 54cbbbf3faf610fb4eba6f8d39d933bcbfc6f4de Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 4 May 2023 22:27:51 +0100 Subject: mm/mmap: separate writenotify and dirty tracking logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: disallow GUP writing to file-backed mappings by default", v9. 
Writing to file-backed mappings which require folio dirty tracking using GUP is a fundamentally broken operation, as kernel write access to GUP mappings does not adhere to the semantics expected by a file system. A GUP caller uses the direct mapping to access the folio, which does not cause write notify to trigger, nor does it enforce that the caller marks the folio dirty. The problem arises when, after an initial write to the folio, writeback results in the folio being cleaned and then the caller, via the GUP interface, writes to the folio again. As a result of the use of this secondary, direct, mapping to the folio no write notify will occur, and if the caller does mark the folio dirty, this will be done unexpectedly. For example, consider the following scenario:- 1. A folio is written to via GUP which write-faults the memory, notifying the file system and dirtying the folio. 2. Later, writeback is triggered, resulting in the folio being cleaned and the PTE being marked read-only. 3. The GUP caller writes to the folio, as it is mapped read/write via the direct mapping. 4. The GUP caller, now done with the page, unpins it and sets it dirty (though it does not have to). This change updates both the PUP FOLL_LONGTERM slow and fast APIs. As pin_user_pages_fast_only() does not exist, we can rely on a slightly imperfect whitelisting in the PUP-fast case and fall back to the slow case should this fail. This patch (of 3): vma_wants_writenotify() is specifically intended for setting PTE page table flags, accounting for existing page table flag state and whether the underlying filesystem performs dirty tracking for a file-backed mapping. Everything is predicated firstly on whether the mapping is shared writable, as this is the only instance where dirty tracking is pertinent - MAP_PRIVATE mappings will always be CoW'd and unshared, and read-only file-backed shared mappings cannot be written to, even with FOLL_FORCE. All other checks are in line with existing logic, though now separated into checks explicitly for dirty tracking and those for determining how to set page table flags. We make this change so we can perform checks in the GUP logic to determine which mappings might be problematic when written to. Link: https://lkml.kernel.org/r/cover.1683235180.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/0f218370bd49b4e6bbfbb499f7c7b92c26ba1ceb.1683235180.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: John Hubbard Reviewed-by: Mika Penttilä Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Cc: Kirill A.
Shutemov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/mmap.c | 58 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 12 deletions(-) (limited to 'mm/mmap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 62bb3272e531..66032f0d515c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2461,6 +2461,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { diff --git a/mm/mmap.c b/mm/mmap.c index e1624cb2c04e..f084b7940431 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1454,6 +1454,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct *vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1462,21 +1504,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { - vm_flags_t vm_flags = vma->vm_flags; - const struct vm_operations_struct *vm_ops = vma->vm_ops; - /* If it was private or non-writable, the write bit is already clear */ - if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + if (!vma_is_shared_writable(vma)) return 0; /* The backer wishes to know when pages are first written to? */ - if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + if (vm_ops_needs_writenotify(vma->vm_ops)) return 1; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return 0; /* @@ -1490,13 +1529,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (userfaultfd_wp(vma)) return 1; - /* Specialty mapping? */ - if (vm_flags & VM_PFNMAP) - return 0; - /* Can the mapping track the dirty pages? 
*/ - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); + return vma_fs_can_writeback(vma); } /* -- cgit v1.2.3 From 606c812eb1d5b5fb0dd9e330ca94b52d7c227830 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Jun 2023 20:47:08 -0400 Subject: mm/mmap: Fix error path in do_vmi_align_munmap() The error unrolling was leaving the VMAs detached in many cases and leaving the locked_vm statistic altered, and skipping the unrolling entirely in the case of the vma tree write failing. Fix the error path by re-attaching the detached VMAs and adding the necessary goto for the failed vma tree write, and fix the locked_vm statistic by only updating after the vma tree write succeeds. Fixes: 763ecb035029 ("mm: remove the vma linked list") Reported-by: Vegard Nossum Signed-off-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- mm/mmap.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..d600404580b2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -static inline int munmap_sidetree(struct vm_area_struct *vma, - struct ma_state *mas_detach) -{ - vma_start_write(vma); - mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); - if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) - return -ENOMEM; - - vma_mark_detached(vma, true); - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm -= vma_pages(vma); - - return 0; -} - /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct maple_tree mt_detach; int count = 0; int error = -ENOMEM; + unsigned long locked_vm = 0; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); @@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; } - error = munmap_sidetree(next, &mas_detach); - if (error) - goto munmap_sidetree_failed; + vma_start_write(next); + mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + if (mas_store_gfp(&mas_detach, next, GFP_KERNEL)) + goto munmap_gather_failed; + vma_mark_detached(next, true); + if (next->vm_flags & VM_LOCKED) + locked_vm += vma_pages(next); count++; #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } #endif /* Point of no return */ + error = -ENOMEM; vma_iter_set(vmi, start); if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) - return -ENOMEM; + goto clear_tree_failed; + mm->locked_vm -= locked_vm; mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, validate_mm(mm); return downgrade ? 1 : 0; +clear_tree_failed: userfaultfd_error: -munmap_sidetree_failed: +munmap_gather_failed: end_split_failed: + mas_set(&mas_detach, 0); + mas_for_each(&mas_detach, next, end) + vma_mark_detached(next, false); + __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: -- cgit v1.2.3 From 65ac132027a884c411b8f9f96d240ba2dde34dec Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 31 May 2023 21:54:02 -0400 Subject: userfaultfd: fix regression in userfaultfd_unmap_prep() Android reported a performance regression in the userfaultfd unmap path. A closer inspection on the userfaultfd_unmap_prep() change showed that a second tree walk would be necessary in the reworked code. Fix the regression by passing each VMA that will be unmapped through to the userfaultfd_unmap_prep() function as they are added to the unmap list, instead of re-walking the tree for the VMA. Link: https://lkml.kernel.org/r/20230601015402.2819343-1-Liam.Howlett@oracle.com Fixes: 69dbe6daf104 ("userfaultfd: use maple tree iterator to iterate VMAs") Signed-off-by: Liam R. Howlett Reported-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 35 +++++++++++++++-------------------- include/linux/userfaultfd_k.h | 6 +++--- mm/mmap.c | 31 +++++++++++++++---------------- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'mm/mmap.c') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 478e2b169c13..0aa5caac5164 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -852,31 +852,26 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *vma; - - for_each_vma_range(vmi, vma, end) { - struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || - has_unmap_ctx(ctx, unmaps, start, end)) - continue; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; - unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); - if (!unmap_ctx) - return -ENOMEM; + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; - userfaultfd_ctx_get(ctx); - atomic_inc(&ctx->mmap_changing); - unmap_ctx->ctx = ctx; - unmap_ctx->start = start; - unmap_ctx->end = end; - list_add_tail(&unmap_ctx->list, unmaps); - } + userfaultfd_ctx_get(ctx); + atomic_inc(&ctx->mmap_changing); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); return 0; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d78b01524349..ac7b0c96d351 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -188,8 +188,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, - unsigned long end, struct list_head *uf); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); @@ -271,7 +271,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct mm_struct *mm, +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long 
end, struct list_head *uf) { diff --git a/mm/mmap.c b/mm/mmap.c index f084b7940431..4fc496bc5b95 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2417,6 +2417,21 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, goto munmap_sidetree_failed; count++; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + error = userfaultfd_unmap_prep(next, start, end, uf); + + if (error) + goto userfaultfd_error; + } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end); @@ -2429,22 +2444,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!next) next = vma_next(vmi); - if (unlikely(uf)) { - /* - * If userfaultfd_unmap_prep returns an error the vmas - * will remain split, but userland will get a - * highly unexpected error anyway. This is no - * different than the case where the first of the two - * __split_vma fails, but we don't undo the first - * split, despite we could. This is unlikely enough - * failure that it's not worth optimizing it for. - */ - error = userfaultfd_unmap_prep(mm, start, end, uf); - - if (error) - goto userfaultfd_error; - } - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { -- cgit v1.2.3 From 8b35ca3e45e35a26a21427f35d4093606e93ad0a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 22 Jun 2023 21:24:30 +0200 Subject: arm/mm: Convert to using lock_mm_and_find_vma() arm has an additional check for address < FIRST_USER_ADDRESS before expanding the stack. Since FIRST_USER_ADDRESS is defined everywhere (generally as 0), move that check to the generic expand_downwards(). Signed-off-by: Ben Hutchings Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 63 ++++++++++++----------------------------------------- mm/mmap.c | 2 +- 3 files changed, 16 insertions(+), 50 deletions(-) (limited to 'mm/mmap.c') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0fb4b218f665..9ed7f03ba15a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -125,6 +125,7 @@ config ARM select HAVE_UID16 select HAVE_VIRT_CPU_ACCOUNTING_GEN select IRQ_FORCED_THREADING + select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE select OF_EARLY_FLATTREE if OF diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 2418f1efabd8..0860eeba8bd3 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -232,37 +232,11 @@ static inline bool is_permission_fault(unsigned int fsr) return false; } -static vm_fault_t __kprobes -__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int flags, - unsigned long vma_flags, struct pt_regs *regs) -{ - struct vm_area_struct *vma = find_vma(mm, addr); - if (unlikely(!vma)) - return VM_FAULT_BADMAP; - - if (unlikely(vma->vm_start > addr)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - return VM_FAULT_BADMAP; - if (addr < FIRST_USER_ADDRESS) - return VM_FAULT_BADMAP; - if (expand_stack(vma, addr)) - return VM_FAULT_BADMAP; - } - - /* - * ok, we have a good vm_area for this memory access, check the - * permissions on the VMA allow for the fault which occurred. 
- */ - if (!(vma->vm_flags & vma_flags)) - return VM_FAULT_BADACCESS; - - return handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); -} - static int __kprobes do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; int sig, code; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; @@ -301,31 +275,21 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc)) - goto no_context; retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->ARM_pc)) - goto no_context; -#endif + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + fault = VM_FAULT_BADMAP; + goto bad_area; } - fault = __do_page_fault(mm, addr, flags, vm_flags, regs); + /* + * ok, we have a good vm_area for this memory access, check the + * permissions on the VMA allow for the fault which occurred. + */ + if (!(vma->vm_flags & vm_flags)) + fault = VM_FAULT_BADACCESS; + else + fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_lock because @@ -356,6 +320,7 @@ retry: if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; +bad_area: /* * If we are in kernel mode at this point, we * have no context to handle this fault with. diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b2..6d120bf1d0bc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2036,7 +2036,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; address &= PAGE_MASK; - if (address < mmap_min_addr) + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ -- cgit v1.2.3 From f440fa1ac955e2898893f9301568435eb5cdfc4b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 16 Jun 2023 15:58:54 -0700 Subject: mm: make find_extend_vma() fail if write lock not held Make calls to extend_vma() and find_extend_vma() fail if the write lock is required. To avoid making this a flag-day event, this still allows the old read-locking case for the trivial situations, and passes in a flag to say "is it write-locked". That way write-lockers can say "yes, I'm being careful", and legacy users will continue to work in all the common cases until they have been fully converted to the new world order. Co-Developed-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 6 +++--- fs/exec.c | 5 +++-- include/linux/mm.h | 10 +++++++--- mm/memory.c | 2 +- mm/mmap.c | 50 +++++++++++++++++++++++++++++++++----------------- mm/nommu.c | 3 ++- 6 files changed, 49 insertions(+), 27 deletions(-) (limited to 'mm/mmap.c') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1033fbdfdbec..869c3aa0e455 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - if (mmap_read_lock_killable(mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_extend_vma(mm, bprm->p); - mmap_read_unlock(mm); + vma = find_extend_vma_locked(mm, bprm->p, true); + mmap_write_unlock(mm); if (!vma) return -EFAULT; diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2..a61eb256e5e4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,7 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_STACK_GROWSUP if (write) { - ret = expand_downwards(bprm->vma, pos); + /* We claim to hold the lock - nobody to race with */ + ret = expand_downwards(bprm->vma, pos, true); if (ret < 0) return NULL; } @@ -853,7 +854,7 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; - ret = expand_stack(vma, stack_base); + ret = expand_stack_locked(vma, stack_base, true); if (ret) ret = -EFAULT; diff --git a/include/linux/mm.h b/include/linux/mm.h index 570cf906fbcc..01a016521b60 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3192,11 +3192,13 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked); +#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -extern int expand_downwards(struct vm_area_struct *vma, - unsigned long address); +int expand_downwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked); #if VM_GROWSUP extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #else @@ -3297,6 +3299,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, #endif struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, + unsigned long addr, bool write_locked); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 1dff248805bf..a81f5d0997ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, goto fail; } - if (expand_stack(vma, addr)) + if (expand_stack_locked(vma, addr, true)) goto fail; success: diff --git a/mm/mmap.c b/mm/mmap.c index 6d120bf1d0bc..2c44ac108a3c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1935,7 +1935,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, * PA-RISC uses this for its stack; IA64 for its 
Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -int expand_upwards(struct vm_area_struct *vma, unsigned long address) +int expand_upwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; @@ -1959,6 +1960,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; + if (!write_locked) + return -EAGAIN; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) @@ -2028,7 +2031,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { struct mm_struct *mm = vma->vm_mm; MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); @@ -2042,10 +2046,13 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) /* Enforce stack_guard_gap */ prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ - if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev)) { - if (address - prev->vm_end < stack_guard_gap) + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; + if (!write_locked && (prev->vm_end == address)) + return -EAGAIN; } if (mas_preallocate(&mas, GFP_KERNEL)) @@ -2124,13 +2131,14 @@ static int __init cmdline_parse_stack_guard_gap(char *p) __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { - return expand_upwards(vma, address); + return expand_upwards(vma, address, write_locked); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, + unsigned long addr, bool write_locked) { struct vm_area_struct *vma, *prev; @@ -2138,20 +2146,25 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (!prev) + return NULL; + if (expand_stack_locked(prev, addr, write_locked)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { - return expand_downwards(vma, address); + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) + return -EINVAL; + return expand_downwards(vma, address, write_locked); } -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, + unsigned long addr, bool write_locked) { struct vm_area_struct *vma; unsigned long start; @@ -2162,10 +2175,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return NULL; if (vma->vm_start <= addr) return vma; - if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; start = vma->vm_start; - if 
(expand_stack(vma, addr)) + if (expand_stack_locked(vma, addr, write_locked)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); @@ -2173,6 +2184,11 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) } #endif +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, + unsigned long addr) +{ + return find_extend_vma_locked(mm, addr, false); +} EXPORT_SYMBOL_GPL(find_extend_vma); /* diff --git a/mm/nommu.c b/mm/nommu.c index f670d9979a26..f476c9ed36b3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -643,7 +643,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) * expand a stack to a given address * - not supported under NOMMU conditions */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, + bool write_locked) { return -ENOMEM; } -- cgit v1.2.3 From 8d7071af890768438c14db6172cc8f9f4d04e184 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2023 13:45:51 -0700 Subject: mm: always expand the stack with the mmap write lock held This finishes the job of always holding the mmap write lock when extending the user stack vma, and removes the 'write_locked' argument from the vm helper functions again. For some cases, we just avoid expanding the stack at all: drivers and page pinning really shouldn't be extending any stacks. Let's see if any strange users really wanted that. It's worth noting that architectures that weren't converted to the new lock_mm_and_find_vma() helper function are left using the legacy "expand_stack()" function, but it has been changed to drop the mmap_lock and take it for writing while expanding the vma. This makes it fairly straightforward to convert the remaining architectures. As a result of dropping and re-taking the lock, the calling conventions for this function have also changed, since the old vma may no longer be valid. So it will now return the new vma if successful, and NULL - and the lock dropped - if the area could not be extended. Tested-by: Vegard Nossum Tested-by: John Paul Adrian Glaubitz # ia64 Tested-by: Frank Scheiner # ia64 Signed-off-by: Linus Torvalds --- arch/ia64/mm/fault.c | 36 +++---------- arch/m68k/mm/fault.c | 9 ++-- arch/microblaze/mm/fault.c | 5 +- arch/openrisc/mm/fault.c | 5 +- arch/parisc/mm/fault.c | 23 ++++---- arch/s390/mm/fault.c | 5 +- arch/sparc/mm/fault_64.c | 8 +-- arch/um/kernel/trap.c | 11 ++-- drivers/iommu/amd/iommu_v2.c | 4 +- drivers/iommu/iommu-sva.c | 2 +- fs/binfmt_elf.c | 2 +- fs/exec.c | 4 +- include/linux/mm.h | 16 ++---- mm/gup.c | 6 +-- mm/memory.c | 10 +++- mm/mmap.c | 121 ++++++++++++++++++++++++++++++++++--------- mm/nommu.c | 18 +++---- 17 files changed, 169 insertions(+), 116 deletions(-) (limited to 'mm/mmap.c') diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 85c4d9ac8686..5458b52b4009 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -110,10 +110,12 @@ retry: * register backing store that needs to expand upwards, in * this case vma will be null, but prev_vma will ne non-null */ - if (( !vma && prev_vma ) || (address < vma->vm_start) ) - goto check_expansion; + if (( !vma && prev_vma ) || (address < vma->vm_start) ) { + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } - good_area: code = SEGV_ACCERR; /* OK, we've got a good vm_area for this memory area. 
Check the access permissions: */ @@ -177,35 +179,9 @@ retry: mmap_read_unlock(mm); return; - check_expansion: - if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) { - if (!vma) - goto bad_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; - } else { - vma = prev_vma; - if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) - || REGION_OFFSET(address) >= RGN_MAP_LIMIT) - goto bad_area; - /* - * Since the register backing store is accessed sequentially, - * we disallow growing it by more than a page at a time. - */ - if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) - goto bad_area; - if (expand_upwards(vma, address)) - goto bad_area; - } - goto good_area; - bad_area: mmap_read_unlock(mm); + bad_area_nosemaphore: if ((isr & IA64_ISR_SP) || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 228128e45c67..c290c5c0cfb9 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -105,8 +105,9 @@ retry: if (address + 256 < rdusp()) goto map_err; } - if (expand_stack(vma, address)) - goto map_err; + vma = expand_stack(mm, address); + if (!vma) + goto map_err_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so @@ -196,10 +197,12 @@ bus_err: goto send_sig; map_err: + mmap_read_unlock(mm); +map_err_nosemaphore: current->thread.signo = SIGSEGV; current->thread.code = SEGV_MAPERR; current->thread.faddr = address; - goto send_sig; + return send_fault_sig(regs); acc_err: current->thread.signo = SIGSEGV; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 687714db6f4d..d3c3c33b73a6 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -192,8 +192,9 @@ retry: && (kernel_mode(regs) || !store_updates_sp(regs))) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; good_area: code = SEGV_ACCERR; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 6734fee3134f..a9dcd4381d1a 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -127,8 +127,9 @@ retry: if (address + PAGE_SIZE < regs->sp) goto bad_area; } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 6941fdbf2517..6e894afa4249 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -288,15 +288,19 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, retry: mmap_read_lock(mm); vma = find_vma_prev(mm, address, &prev_vma); - if (!vma || address < vma->vm_start) - goto check_expansion; + if (!vma || address < vma->vm_start) { + if (!prev || !(prev->vm_flags & VM_GROWSUP)) + goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; + } + /* * Ok, we have a good vm_area for this memory access. We still need to * check the access permissions. 
*/ -good_area: - if ((vma->vm_flags & acc_type) != acc_type) goto bad_area; @@ -347,17 +351,13 @@ good_area: mmap_read_unlock(mm); return; -check_expansion: - vma = prev_vma; - if (vma && (expand_stack(vma, address) == 0)) - goto good_area; - /* * Something tried to access memory that isn't in our memory map.. */ bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: if (user_mode(regs)) { int signo, si_code; @@ -449,7 +449,7 @@ handle_nadtlb_fault(struct pt_regs *regs) { unsigned long insn = regs->iir; int breg, treg, xreg, val = 0; - struct vm_area_struct *vma, *prev_vma; + struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; unsigned long address; @@ -485,7 +485,7 @@ handle_nadtlb_fault(struct pt_regs *regs) /* Search for VMA */ address = regs->ior; mmap_read_lock(mm); - vma = find_vma_prev(mm, address, &prev_vma); + vma = vma_lookup(mm, address); mmap_read_unlock(mm); /* @@ -494,7 +494,6 @@ handle_nadtlb_fault(struct pt_regs *regs) */ acc_type = (insn & 0x40) ? VM_WRITE : VM_READ; if (vma - && address >= vma->vm_start && (vma->vm_flags & acc_type) == acc_type) val = 1; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index b65144c392b0..dbe8394234e2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -457,8 +457,9 @@ retry: if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto out_up; - if (expand_stack(vma, address)) - goto out_up; + vma = expand_stack(mm, address); + if (!vma) + goto out; } /* diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index d91305de694c..69ff07bc6c07 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -383,8 +383,9 @@ continue_fault: goto bad_area; } } - if (expand_stack(vma, address)) - goto bad_area; + vma = expand_stack(mm, address); + if (!vma) + goto bad_area_nosemaphore; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. @@ -487,8 +488,9 @@ exit_exception: * Fix it, but check if it's kernel or user first.. 
*/ bad_area: - insn = get_fault_insn(regs, insn); mmap_read_unlock(mm); +bad_area_nosemaphore: + insn = get_fault_insn(regs, insn); handle_kernel_fault: do_kernel_fault(regs, si_code, fault_code, insn, address); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index d3ce21c4ca32..6d8ae86ae978 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -47,14 +47,15 @@ retry: vma = find_vma(mm, address); if (!vma) goto out; - else if (vma->vm_start <= address) + if (vma->vm_start <= address) goto good_area; - else if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - else if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - else if (expand_stack(vma, address)) + if (is_user && !ARCH_IS_STACKGROW(address)) goto out; + vma = expand_stack(mm, address); + if (!vma) + goto out_nosemaphore; good_area: *code_out = SEGV_ACCERR; diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 864e4ffb6aa9..261352a23271 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -485,8 +485,8 @@ static void do_fault(struct work_struct *work) flags |= FAULT_FLAG_REMOTE; mmap_read_lock(mm); - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) + vma = vma_lookup(mm, address); + if (!vma) /* failed to get a vma in the right range */ goto out; diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 9821bc44f5ac..3ebd4b6586b3 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -175,7 +175,7 @@ iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) mmap_read_lock(mm); - vma = find_extend_vma(mm, prm->addr); + vma = vma_lookup(mm, prm->addr); if (!vma) /* Unmapped area */ goto out_put_mm; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 869c3aa0e455..befa93582ed7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -322,7 +322,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, */ if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_extend_vma_locked(mm, bprm->p, true); + vma = find_extend_vma_locked(mm, bprm->p); mmap_write_unlock(mm); if (!vma) return -EFAULT; diff --git a/fs/exec.c b/fs/exec.c index 66e3e22ffb8a..b84b4fee0f82 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -211,7 +211,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, */ if (write && pos < vma->vm_start) { mmap_write_lock(mm); - ret = expand_downwards(vma, pos, true); + ret = expand_downwards(vma, pos); if (unlikely(ret < 0)) { mmap_write_unlock(mm); return NULL; @@ -859,7 +859,7 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; - ret = expand_stack_locked(vma, stack_base, true); + ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/include/linux/mm.h b/include/linux/mm.h index 01a016521b60..4a9533efbd5d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3192,18 +3192,11 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, - bool write_locked); -#define expand_stack(vma,addr) expand_stack_locked(vma,addr,false) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); +struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* CONFIG_STACK_GROWSUP still 
needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address, - bool write_locked); -#if VM_GROWSUP -extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); -#else - #define expand_upwards(vma, address) (0) -#endif +int expand_downwards(struct vm_area_struct *vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); @@ -3298,9 +3291,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, - unsigned long addr, bool write_locked); + unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/gup.c b/mm/gup.c index bbe416236593..e6cdfee4451f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1096,7 +1096,7 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { - vma = find_extend_vma(mm, start); + vma = vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, @@ -1265,8 +1265,8 @@ int fixup_user_fault(struct mm_struct *mm, fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) + vma = vma_lookup(mm, address); + if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) diff --git a/mm/memory.c b/mm/memory.c index a81f5d0997ad..5ce82a76201d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5368,7 +5368,7 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, goto fail; } - if (expand_stack_locked(vma, addr, true)) + if (expand_stack_locked(vma, addr)) goto fail; success: @@ -5713,6 +5713,14 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, if (mmap_read_lock_killable(mm)) return 0; + /* We might need to expand the stack to access it */ + vma = vma_lookup(mm, addr); + if (!vma) { + vma = expand_stack(mm, addr); + if (!vma) + return 0; + } + /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, ret, offset; diff --git a/mm/mmap.c b/mm/mmap.c index 2c44ac108a3c..bc510361acec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1935,8 +1935,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. 
*/ -int expand_upwards(struct vm_area_struct *vma, unsigned long address, - bool write_locked) +static int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; @@ -1960,8 +1959,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address, if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - if (!write_locked) - return -EAGAIN; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) @@ -2030,15 +2027,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address, /* * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address, - bool write_locked) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; @@ -2051,8 +2051,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address, vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; - if (!write_locked && (prev->vm_end == address)) - return -EAGAIN; } if (mas_preallocate(&mas, GFP_KERNEL)) @@ -2131,14 +2129,12 @@ static int __init cmdline_parse_stack_guard_gap(char *p) __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, - bool write_locked) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { - return expand_upwards(vma, address, write_locked); + return expand_upwards(vma, address); } -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, - unsigned long addr, bool write_locked) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; @@ -2148,23 +2144,21 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, return vma; if (!prev) return NULL; - if (expand_stack_locked(prev, addr, write_locked)) + if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, - bool write_locked) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) return -EINVAL; - return expand_downwards(vma, address, write_locked); + return expand_downwards(vma, address); } -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, - unsigned long addr, bool write_locked) +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; @@ -2176,7 +2170,7 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, if (vma->vm_start <= addr) return vma; start = vma->vm_start; - if (expand_stack_locked(vma, addr, write_locked)) + if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); @@ -2184,12 +2178,91 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct 
*mm, } #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, - unsigned long addr) +/* + * IA64 has some horrid mapping rules: it can expand both up and down, + * but with various special rules. + * + * We'll get rid of this architecture eventually, so the ugliness is + * temporary. + */ +#ifdef CONFIG_IA64 +static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr) +{ + return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) && + REGION_OFFSET(addr) < RGN_MAP_LIMIT; +} + +/* + * IA64 stacks grow down, but there's a special register backing store + * that can grow up. Only sequentially, though, so the new address must + * match vm_end. + */ +static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr) +{ + if (!vma_expand_ok(vma, addr)) + return -EFAULT; + if (vma->vm_end != (addr & PAGE_MASK)) + return -EFAULT; + return expand_upwards(vma, addr); +} + +static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr) +{ + if (!vma_expand_ok(vma, addr)) + return -EFAULT; + return expand_downwards(vma, addr); +} + +#elif defined(CONFIG_STACK_GROWSUP) + +#define vma_expand_up(vma,addr) expand_upwards(vma, addr) +#define vma_expand_down(vma, addr) (-EFAULT) + +#else + +#define vma_expand_up(vma,addr) (-EFAULT) +#define vma_expand_down(vma, addr) expand_downwards(vma, addr) + +#endif + +/* + * expand_stack(): legacy interface for page faulting. Don't use unless + * you have to. + * + * This is called with the mm locked for reading, drops the lock, takes + * the lock for writing, tries to look up a vma again, expands it if + * necessary, and downgrades the lock to reading again. + * + * If no vma is found or it can't be expanded, it returns NULL and has + * dropped the lock. + */ +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) { - return find_extend_vma_locked(mm, addr, false); + struct vm_area_struct *vma, *prev; + + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) + return NULL; + + vma = find_vma_prev(mm, addr, &prev); + if (vma && vma->vm_start <= addr) + goto success; + + if (prev && !vma_expand_up(prev, addr)) { + vma = prev; + goto success; + } + + if (vma && !vma_expand_down(vma, addr)) + goto success; + + mmap_write_unlock(mm); + return NULL; + +success: + mmap_write_downgrade(mm); + return vma; } -EXPORT_SYMBOL_GPL(find_extend_vma); /* * Ok - we have the memory areas we should free on a maple tree so release them, diff --git a/mm/nommu.c b/mm/nommu.c index f476c9ed36b3..37d0b03143f1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -630,25 +630,21 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) } EXPORT_SYMBOL(find_vma); -/* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions - */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - return find_vma(mm, addr); -} - /* * expand a stack to a given address * - not supported under NOMMU conditions */ -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address, - bool write_locked) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) { return -ENOMEM; } +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_unlock(mm); + return NULL; +} + /* * look up the first VMA exactly that exactly matches addr * - should be called with mm->mmap_lock at least held readlocked -- cgit v1.2.3 From 6c26bd4384da24841bac4f067741bbca18b0fb74 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 
28 Jun 2023 10:55:03 +0100 Subject: mm/mmap: Fix error return in do_vmi_align_munmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If mas_store_gfp() in the gather loop failed, the 'error' variable that ultimately gets returned was not being set. In many cases, its original value of -ENOMEM was still in place, and that was fine. But if VMAs had been split at the start or end of the range, then 'error' could be zero. Change to the 'error = foo(); if (error) goto …' idiom to fix the bug. Also clean up a later case which avoided the same bug by *explicitly* setting error = -ENOMEM right before calling the function that might return -ENOMEM. In a final cosmetic change, move the 'Point of no return' comment to *after* the goto. That's been in the wrong place since the preallocation was removed, and this new error path was added. Fixes: 606c812eb1d5 ("mm/mmap: Fix error path in do_vmi_align_munmap()") Signed-off-by: David Woodhouse Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Reviewed-by: Liam R. Howlett --- mm/mmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index d600404580b2..13128e908470 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2387,7 +2387,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } vma_start_write(next); mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); - if (mas_store_gfp(&mas_detach, next, GFP_KERNEL)) + error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); + if (error) goto munmap_gather_failed; vma_mark_detached(next, true); if (next->vm_flags & VM_LOCKED) @@ -2436,12 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(count != test_count); } #endif - /* Point of no return */ - error = -ENOMEM; vma_iter_set(vmi, start); - if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); + if (error) goto clear_tree_failed; + /* Point of no return */ mm->locked_vm -= locked_vm; mm->map_count -= count; /* -- cgit v1.2.3 From e4bd84c069f212c01258e405f86e91f327888e41 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Jun 2023 20:14:14 +0100 Subject: mm: Always downgrade mmap_lock if requested Now that stack growth must always hold the mmap_lock for write, we can always downgrade the mmap_lock to read and safely unmap pages from the page table, even if we're next to a stack. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Linus Torvalds --- mm/mmap.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 3e5793ebbaae..141c618847ac 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2551,19 +2551,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Point of no return */ mm->locked_vm -= locked_vm; mm->map_count -= count; - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. 
- */ - if (downgrade) { - if (next && (next->vm_flags & VM_GROWSDOWN)) - downgrade = false; - else if (prev && (prev->vm_flags & VM_GROWSUP)) - downgrade = false; - else - mmap_write_downgrade(mm); - } + if (downgrade) + mmap_write_downgrade(mm); /* * We can free page tables without write-locking mmap_lock because VMAs -- cgit v1.2.3 From 408579cd627a15bd703fe3eeb8485fd02726e9d3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 29 Jun 2023 22:28:16 -0400 Subject: mm: Update do_vmi_align_munmap() return semantics Since do_vmi_align_munmap() will always honor the downgrade request on the success, the callers no longer have to deal with confusing return codes. Since all callers that request downgrade actually want the lock to be dropped, change the downgrade to an unlock request. Note that the lock still needs to be held in read mode during the page table clean up to avoid races with a map request. Update do_vmi_align_munmap() to return 0 for success. Clean up the callers and comments to always expect the unlock to be honored on the success path. The error path will always leave the lock untouched. As part of the cleanup, the wrapper function do_vmi_munmap() and callers to the wrapper are also updated. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-mm/20230629191414.1215929-1-willy@infradead.org/ Signed-off-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +-- mm/mmap.c | 94 +++++++++++++++++++++++++----------------------------- mm/mremap.c | 28 +++++++--------- 3 files changed, 57 insertions(+), 69 deletions(-) (limited to 'mm/mmap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f2c33c273eb..703ba8203da3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3177,7 +3177,7 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, - bool downgrade); + bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); @@ -3185,7 +3185,7 @@ extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, #ifdef CONFIG_MMU extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct list_head *uf, bool downgrade); + struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) diff --git a/mm/mmap.c b/mm/mmap.c index 141c618847ac..51e70fa98450 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -193,8 +193,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; - bool populate; - bool downgraded = false; + bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; @@ -236,13 +235,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto success; } - /* - * Always allow shrinking brk. - * do_vma_munmap() may downgrade mmap_lock to read. - */ + /* Always allow shrinking brk. 
*/ if (brk <= mm->brk) { - int ret; - /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); @@ -250,19 +244,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_vma_munmap() may downgrade the lock, so update it + * do_vma_munmap() will drop the lock on success, so update it * before calling do_vma_munmap(). */ mm->brk = brk; - ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true); - if (ret == 1) { - downgraded = true; - goto success; - } else if (!ret) - goto success; - - mm->brk = origbrk; - goto out; + if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true)) + goto out; + + goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) @@ -283,19 +272,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; mm->brk = brk; + if (mm->def_flags & VM_LOCKED) + populate = true; success: - populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - if (downgraded) - mmap_read_unlock(mm); - else - mmap_write_unlock(mm); + mmap_write_unlock(mm); +success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: + mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } @@ -2428,14 +2417,16 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head - * @downgrade: Set to true to attempt a write downgrade of the mmap_lock + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on + * success. * - * If @downgrade is true, check return code for potential release of the lock. + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. */ static int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, - unsigned long end, struct list_head *uf, bool downgrade) + unsigned long end, struct list_head *uf, bool unlock) { struct vm_area_struct *prev, *next = NULL; struct maple_tree mt_detach; @@ -2551,22 +2542,24 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Point of no return */ mm->locked_vm -= locked_vm; mm->map_count -= count; - if (downgrade) + if (unlock) mmap_write_downgrade(mm); /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ - unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade); + unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock); /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); __mt_destroy(&mt_detach); + if (unlock) + mmap_read_unlock(mm); validate_mm(mm); - return downgrade ? 1 : 0; + return 0; clear_tree_failed: userfaultfd_error: @@ -2589,18 +2582,18 @@ map_count_exceeded: * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head - * @downgrade: set to true if the user wants to attempt to write_downgrade the - * mmap_lock + * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned and any arch_unmap work will be preformed. 
* - * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, - bool downgrade) + bool unlock) { unsigned long end; struct vm_area_struct *vma; @@ -2617,10 +2610,13 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, /* Find the first overlapping VMA */ vma = vma_find(vmi, end); - if (!vma) + if (!vma) { + if (unlock) + mmap_write_unlock(mm); return 0; + } - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. @@ -2628,6 +2624,8 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, * @start: The start address to munmap * @len: The length to be munmapped. * @uf: The userfaultfd list_head + * + * Return: 0 on success, error otherwise. */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) @@ -2888,7 +2886,7 @@ unacct_error: return error; } -static int __vm_munmap(unsigned long start, size_t len, bool downgrade) +static int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; @@ -2898,16 +2896,8 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade); - /* - * Returning 1 indicates mmap_lock is downgraded. - * But 1 is not legal return value of vm_munmap() and munmap(), reset - * it to 0 before return. - */ - if (ret == 1) { - mmap_read_unlock(mm); - ret = 0; - } else + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); + if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -3017,21 +3007,23 @@ out: * @start: the start of the address to unmap * @end: The end of the address to unmap * @uf: The userfaultfd list_head - * @downgrade: Attempt to downgrade or not + * @unlock: Drop the lock on success * - * Returns: 0 on success and not downgraded, 1 on success and downgraded. * unmaps a VMA mapping when the vma iterator is already in position. * Does not handle alignment. + * + * Return: 0 on success drops the lock of so directed, error on failure and will + * still hold the lock. 
*/ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf, bool downgrade) + unsigned long start, unsigned long end, struct list_head *uf, + bool unlock) { struct mm_struct *mm = vma->vm_mm; int ret; arch_unmap(mm, start, end); - ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); + ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); validate_mm(mm); return ret; } diff --git a/mm/mremap.c b/mm/mremap.c index fe6b722ae633..11e06e4ab33b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -715,7 +715,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } vma_iter_init(&vmi, mm, old_addr); - if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { + if (!do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false)) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); @@ -913,7 +913,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct vm_area_struct *vma; unsigned long ret = -EINVAL; bool locked = false; - bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -999,24 +998,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. * do_vmi_munmap does all the needed commit accounting, and - * downgrades mmap_lock to read if so directed. + * unlocks the mmap_lock if so directed. */ if (old_len >= new_len) { - int retval; VMA_ITERATOR(vmi, mm, addr + new_len); - retval = do_vmi_munmap(&vmi, mm, addr + new_len, - old_len - new_len, &uf_unmap, true); - /* Returning 1 indicates mmap_lock is downgraded to read. */ - if (retval == 1) { - downgraded = true; - } else if (retval < 0 && old_len != new_len) { - ret = retval; + if (old_len == new_len) { + ret = addr; goto out; } + ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, + &uf_unmap, true); + if (ret) + goto out; + ret = addr; - goto out; + goto out_unlocked; } /* @@ -1101,12 +1099,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, out: if (offset_in_page(ret)) locked = false; - if (downgraded) - mmap_read_unlock(current->mm); - else - mmap_write_unlock(current->mm); + mmap_write_unlock(current->mm); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); +out_unlocked: userfaultfd_unmap_complete(mm, &uf_unmap_early); mremap_userfaultfd_complete(&uf, addr, ret, old_len); userfaultfd_unmap_complete(mm, &uf_unmap); -- cgit v1.2.3 From ae80b404198434e49e903dc3b1ba83e2c7bb3ee2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Jul 2023 10:08:50 -0700 Subject: mm: validate the mm before dropping the mmap lock Commit 408579cd627a ("mm: Update do_vmi_align_munmap() return semantics") made the return value and locking semantics of do_vmi_align_munmap() more straightforward, but in the process it ended up unlocking the mmap lock just a tad too early: the debug code doing the mmap layout validation still needs to run with the lock held, or things might change under it while it's trying to validate things. So just move the unlocking to after the validate_mm() call. 
Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/ZKIsoMOT71uwCIZX@xsang-OptiPlex-9020/ Fixes: 408579cd627a ("mm: Update do_vmi_align_munmap() return semantics") Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 51e70fa98450..547b40531791 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2554,11 +2554,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); __mt_destroy(&mt_detach); + validate_mm(mm); if (unlock) mmap_read_unlock(mm); - - validate_mm(mm); return 0; clear_tree_failed: -- cgit v1.2.3 From b5641a5d8b8b14643bfe3d017d64da90a5c55479 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Jul 2023 19:29:48 -0700 Subject: mm: don't do validate_mm() unnecessarily and without mmap locking This is an addition to commit ae80b4041984 ("mm: validate the mm before dropping the mmap lock"), because it turns out there were two problems, but lockdep just stopped complaining after finding the first one. The do_vmi_align_munmap() function now drops the mmap lock after doing the validate_mm() call, but it turns out that one of the callers then immediately calls validate_mm() again. That's both a bit silly, and now (again) happens without the mmap lock held. So just remove that validate_mm() call from the caller, but make sure to not lose any coverage by doing that mm sanity checking in the error path of do_vmi_align_munmap() too. Reported-and-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/ZKN6CdkKyxBShPHi@xsang-OptiPlex-9020/ Fixes: 408579cd627a ("mm: Update do_vmi_align_munmap() return semantics") Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 547b40531791..204ddcd52625 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2571,6 +2571,7 @@ end_split_failed: __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: + validate_mm(mm); return error; } @@ -3019,12 +3020,9 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, bool unlock) { struct mm_struct *mm = vma->vm_mm; - int ret; arch_unmap(mm, start, end); - ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); - validate_mm(mm); - return ret; + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* -- cgit v1.2.3 From c137381f71aec755fbf47cd4e9bd4dce752c054c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 8 Jul 2023 12:12:10 -0700 Subject: mm: lock a vma before stack expansion With recent changes necessitating mmap_lock to be held for write while expanding a stack, per-VMA locks should follow the same rules and be write-locked to prevent page faults into the VMA being expanded. Add the necessary locking. Cc: stable@vger.kernel.org Signed-off-by: Suren Baghdasaryan Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 204ddcd52625..c66e4622a557 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1977,6 +1977,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_lock in read mode. 
We need the @@ -2064,6 +2066,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_lock in read mode. We need the -- cgit v1.2.3 From 33313a747e81af9f31d0d45de78c9397fa3655eb Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 8 Jul 2023 12:12:11 -0700 Subject: mm: lock newly mapped VMA which can be modified after it becomes visible mmap_region adds a newly created VMA into VMA tree and might modify it afterwards before dropping the mmap_lock. This poses a problem for page faults handled under per-VMA locks because they don't take the mmap_lock and can stumble on this VMA while it's still being modified. Currently this does not pose a problem since post-addition modifications are done only for file-backed VMAs, which are not handled under per-VMA lock. However, once support for handling file-backed page faults with per-VMA locks is added, this will become a race. Fix this by write-locking the VMA before inserting it into the VMA tree. Other places where a new VMA is added into VMA tree do not modify it after the insertion, so do not need the same locking. Cc: stable@vger.kernel.org Signed-off-by: Suren Baghdasaryan Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index c66e4622a557..84c71431a527 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2812,6 +2812,8 @@ cannot_expand: if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); + /* Lock the VMA since it is modified after insertion into VMA tree */ + vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { -- cgit v1.2.3 From 1c7873e3364570ec89343ff4877e0f27a7b21a61 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 8 Jul 2023 16:04:00 -0700 Subject: mm: lock newly mapped VMA with corrected ordering Lockdep is certainly right to complain about (&vma->vm_lock->lock){++++}-{3:3}, at: vma_start_write+0x2d/0x3f but task is already holding lock: (&mapping->i_mmap_rwsem){+.+.}-{3:3}, at: mmap_region+0x4dc/0x6db Invert those to the usual ordering. Fixes: 33313a747e81 ("mm: lock newly mapped VMA which can be modified after it becomes visible") Cc: stable@vger.kernel.org Signed-off-by: Hugh Dickins Tested-by: Suren Baghdasaryan Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 84c71431a527..3eda23c9ebe7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2809,11 +2809,11 @@ cannot_expand: if (vma_iter_prealloc(&vmi)) goto close_and_free_vma; + /* Lock the VMA since it is modified after insertion into VMA tree */ + vma_start_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); - /* Lock the VMA since it is modified after insertion into VMA tree */ - vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { -- cgit v1.2.3
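
For orientation, the following minimal sketch (not taken from any patch above) shows the calling convention that the expand_stack() rework in this series converts the architecture fault handlers to; compare the parisc, s390, sparc and um hunks earlier in this log. do_arch_page_fault(), acc_type and the bad_area labels are placeholder names invented for the example; only vma_lookup(), expand_stack() and the mmap_lock helpers are the real interfaces touched by these patches.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Sketch only: do_arch_page_fault(), acc_type and the labels below are
 * placeholders for illustration, not symbols introduced by the series.
 */
static void do_arch_page_fault(unsigned long address, unsigned long acc_type)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, address);
	if (!vma) {
		/*
		 * expand_stack() drops the read lock, retakes the lock
		 * for write, expands the stack VMA if it may, and
		 * downgrades back to read.  On failure it returns NULL
		 * with the mmap_lock already released, so this error
		 * path must not unlock again.
		 */
		vma = expand_stack(mm, address);
		if (!vma)
			goto bad_area_nosemaphore;
	}

	if ((vma->vm_flags & acc_type) != acc_type)
		goto bad_area;

	/* handle_mm_fault(vma, address, ...) would go here */

	mmap_read_unlock(mm);
	return;

bad_area:
	mmap_read_unlock(mm);
bad_area_nosemaphore:
	/* deliver SIGSEGV or run the kernel-mode fixup; lock not held */
	return;
}

The property to note, stated in the expand_stack() comment added by the series, is that a NULL return means the mmap_lock has already been dropped, which is why the converted architecture handlers grow the bad_area_nosemaphore style labels seen in the hunks above.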