diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 341 |
1 file changed, 161 insertions, 180 deletions
diff --git a/mm/memory.c b/mm/memory.c index f69fbc251198..f758f59f3704 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +746,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. 
*/ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, 
*dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1012,13 +1016,25 @@ again: progress = 0; init_rss_vec(rss); + /* + * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the + * error handling here, assume that exclusive mmap_lock on dst and src + * protects anon from unexpected THP transitions; with shmem and file + * protected by mmap_lock-less collapse skipping areas with anon_vma + * (whereas vma_needs_copy() skips areas without anon_vma). A rework + * can remove such assumptions later, but this is good enough for now. + */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } - src_pte = pte_offset_map(src_pmd, addr); - src_ptl = pte_lockptr(src_mm, src_pmd); + src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl); + if (!src_pte) { + pte_unmap_unlock(dst_pte, dst_ptl); + /* ret == 0 */ + goto out; + } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; @@ -1035,17 +1051,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1083,8 +1100,7 @@ again: } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - spin_unlock(src_ptl); - pte_unmap(orig_src_pte); + pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -1388,14 +1404,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, swp_entry_t entry; tlb_change_page_size(tlb, PAGE_SIZE); -again: init_rss_vec(rss); - start_pte = 
pte_offset_map_lock(mm, pmd, addr, &ptl); - pte = start_pte; + start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return addr; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1507,17 +1524,10 @@ again: * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched - * memory too. Restart if we didn't do everything. + * memory too. Come back again if we didn't do everything. */ - if (force_flush) { - force_flush = 0; + if (force_flush) tlb_flush_mmu(tlb); - } - - if (addr != end) { - cond_resched(); - goto again; - } return addr; } @@ -1536,8 +1546,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) - goto next; + else if (zap_huge_pmd(tlb, vma, pmd, addr)) { + addr = next; + continue; + } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1550,20 +1562,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - - /* - * Here there can be other concurrent MADV_DONTNEED or - * trans huge page faults running, and if the pmd is - * none or trans huge it can change under us. This is - * because MADV_DONTNEED holds the mmap_lock in read - * mode. 
- */ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - goto next; - next = zap_pte_range(tlb, vma, pmd, addr, next, details); -next: - cond_resched(); - } while (pmd++, addr = next, addr != end); + if (pmd_none(*pmd)) { + addr = next; + continue; + } + addr = zap_pte_range(tlb, vma, pmd, addr, next, details); + if (addr != next) + pmd--; + } while (pmd++, cond_resched(), addr != end); return addr; } @@ -1821,7 +1827,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -1905,6 +1911,10 @@ more: const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + if (!start_pte) { + ret = -EFAULT; + goto out; + } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); @@ -2111,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2123,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. 
*/ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2339,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2572,15 +2583,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -EINVAL; } - BUG_ON(pmd_huge(*pmd)); - arch_enter_lazy_mmu_mode(); if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2781,10 +2792,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf) int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - spin_lock(ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); - spin_unlock(ptl); + spin_lock(vmf->ptl); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); + spin_unlock(vmf->ptl); } #endif pte_unmap(vmf->pte); @@ -2804,7 +2814,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, int ret; void *kaddr; void __user *uaddr; - bool locked = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; @@ -2830,17 +2839,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, 
so mark it accessed here. */ + vmf->pte = NULL; if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - locked = true; - if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ - update_mmu_tlb(vma, addr, vmf->pte); + if (vmf->pte) + update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } @@ -2857,15 +2867,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { - if (locked) + if (vmf->pte) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - locked = true; - if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ - update_mmu_tlb(vma, addr, vmf->pte); + if (vmf->pte) + update_mmu_tlb(vma, addr, vmf->pte); ret = -EAGAIN; goto pte_unlock; } @@ -2888,7 +2898,7 @@ warn: ret = 0; pte_unlock: - if (locked) + if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); kunmap_atomic(kaddr); flush_dcache_page(dst); @@ -3110,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3178,19 +3188,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) /* Free the old page.. 
*/ new_folio = old_folio; page_copied = 1; - } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } - if (new_folio) - folio_put(new_folio); - - pte_unmap_unlock(vmf->pte, vmf->ptl); /* * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); + + if (new_folio) + folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(&old_folio->page); @@ -3230,11 +3241,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return VM_FAULT_NOPAGE; /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3329,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3388,8 +3401,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) goto copy; if (!folio_test_lru(folio)) /* - * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to LRU folios. + * We cannot easily detect+handle references from + * remote LRU caches or references to LRU folios. 
*/ lru_add_drain(); if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) @@ -3591,10 +3604,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); folio_put(folio); @@ -3625,6 +3639,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return 0; /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. @@ -3633,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. 
*/ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3728,10 +3744,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { - spin_unlock(vmf->ptl); - goto out; - } + if (unlikely(!vmf->pte || + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) + goto unlock; /* * Get a page reference while we know the page can't be @@ -3807,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3863,7 +3880,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU - * pagevecs if required. + * caches if required. 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) @@ -3877,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4003,13 +4020,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); out: if (si) put_swap_device(si); return ret; out_nomap: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: @@ -4041,22 +4060,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Use pte_alloc() instead of pte_alloc_map(). We can't run - * pte_offset_map() on pmds where a huge pmd might be created - * from a different thread. - * - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when - * parallel threads are excluded by other means. - * - * Here we only have mmap_read_lock(mm). + * Use pte_alloc() instead of pte_alloc_map(), so that OOM can + * be distinguished from a transient failure of pte_offset_map(). 
*/ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See comment in handle_pte_fault() */ - if (unlikely(pmd_trans_unstable(vmf->pmd))) - return 0; - /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { @@ -4064,6 +4073,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; @@ -4104,6 +4115,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + goto release; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; @@ -4131,7 +4144,8 @@ setpte: /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); @@ -4325,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4380,15 +4394,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - /* - * See comment in handle_pte_fault() for how this scenario happens, we - * need to return NOPAGE so that we drop this page. 
- */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); + if (!vmf->pte) + return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { @@ -4630,17 +4639,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { - /* - * If we find a migration pmd entry or a none pmd entry, which - * should never happen, return SIGBUS - */ - if (unlikely(!pmd_present(*vmf->pmd))) + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, - vmf->pmd, - vmf->address, - &vmf->ptl); /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update @@ -4648,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4703,9 +4706,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. 
*/ - vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4774,9 +4776,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATED; } else { flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + goto out; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4905,38 +4909,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead - * of pmd_trans_huge() to ensure the pmd didn't become - * pmd_trans_huge under us and then back to pmd_none, as a - * result of MADV_DONTNEED running immediately after a huge pmd - * fault in a different thread of this mm, in turn leading to a - * misleading pmd_trans_huge() retval. All we have to ensure is - * that it is a regular pmd that we can walk with - * pte_offset_map() and we can do that through an atomic read - * in C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return 0; - /* * A regular pmd is established and it can't morph into a huge - * pmd from under us anymore at this point because we hold the - * mmap_lock read mode and khugepaged takes it in write mode. - * So now it's safe to run pte_offset_map(). + * pmd by anon khugepaged, since that takes mmap_lock in write + * mode; but shmem or file collapse to THP could still morph + * it into a huge pmd: just retry later if so. 
*/ - vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - vmf->orig_pte = *vmf->pte; + vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic - * accesses. The code below just needs a consistent view - * for the ifs and we later double check anyway with the - * ptl lock held. So here a barrier will do. - */ - barrier(); if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; @@ -4952,10 +4936,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5060,9 +5043,8 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - vmf.orig_pmd = *vmf.pmd; + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); - barrier(); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); @@ -5439,11 +5421,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!pte_present(*ptep)) + if (!ptep) + goto out; + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5480,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return 
ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5500,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5544,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5560,7 +5541,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); iounmap(maddr); @@ -5587,7 +5568,6 @@ EXPORT_SYMBOL_GPL(generic_access_phys); int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; @@ -5596,29 +5576,30 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, /* ignore errors, just check how much was successfully transferred */ while (len) { - int bytes, ret, offset; + int bytes, offset; void *maddr; - struct page *page = NULL; + struct vm_area_struct *vma = NULL; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); - ret = get_user_pages_remote(mm, addr, 1, - gup_flags, &page, &vma, NULL); - if (ret <= 0) { + if (IS_ERR_OR_NULL(page)) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; #else + int res = 0; + /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. 
*/ - vma = vma_lookup(mm, addr); if (!vma) break; if (vma->vm_ops && vma->vm_ops->access) - ret = vma->vm_ops->access(vma, addr, buf, + res = vma->vm_ops->access(vma, addr, buf, len, write); - if (ret <= 0) + if (res <= 0) break; - bytes = ret; + bytes = res; #endif } else { bytes = len; |