From 49b0638502da097c15d46cd4e871dbaa022caf7c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:19 -0700 Subject: mm: enable page walking API to lock vmas during the walk walk_page_range() and friends often operate under write-locked mmap_lock. With introduction of vma locks, the vmas have to be locked as well during such walks to prevent concurrent page faults in these areas. Add an additional member to mm_walk_ops to indicate locking requirements for the walk. The change ensures that page walks which prevent concurrent page faults by write-locking mmap_lock, operate correctly after introduction of per-vma locks. With per-vma locks page faults can be handled under vma lock without taking mmap_lock at all, so write locking mmap_lock would not stop them. The change ensures vmas are properly locked during such walks. A sample issue this solves is do_mbind() performing queue_pages_range() to queue pages for migration. Without this change a concurrent page can be faulted into the area and be left out of migration. Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Linus Torvalds Suggested-by: Jann Horn Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Johannes Weiner Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Michel Lespinasse Cc: Peter Xu Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mempolicy.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c53f8beeb507..ec2eaceffd74 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -718,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { + .hugetlb_entry = queue_folios_hugetlb, + .pmd_entry = queue_folios_pte_range, + .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -738,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = { static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + struct list_head *pagelist, bool lock_vma) { int err; struct queue_pages qp = { @@ -749,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .end = end, .first = NULL, }; + const struct mm_walk_ops *ops = lock_vma ? + &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; - err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ @@ -1078,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1321,12 +1331,8 @@ static long do_mbind(unsigned long start, unsigned long len, * Lock the VMAs before scanning for pages to migrate, to ensure we don't * miss a concurrently inserted page. */ - vma_iter_init(&vmi, mm, start); - for_each_vma_range(vmi, vma, end) - vma_start_write(vma); - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); + flags | MPOL_MF_INVERT, &pagelist, true); if (ret < 0) { err = ret; -- cgit v1.2.3