diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 295 |
1 files changed, 186 insertions, 109 deletions
diff --git a/mm/memory.c b/mm/memory.c index b6015e230822..21dea111ffb2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1362,12 +1362,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next, pfn; bool is_cow; int ret; @@ -1378,11 +1378,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); + ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } @@ -1419,7 +1415,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1429,6 +1424,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } + if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) + untrack_pfn_copy(dst_vma, pfn); return ret; } @@ -1436,7 +1433,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1463,42 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. + * + * Returns true if uffd-wp ptes was installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant. */ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } +#endif + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1524,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,7 +1549,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; @@ -1559,15 +1564,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible @@ -1579,14 +1586,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + swp_entry_t entry; + int nr = 1; + + *any_skipped = true; + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + if (is_device_private_entry(entry)) + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (is_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1712,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = true; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1728,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; + bool any_skipped = false; - nr = 1; - if (pte_none(ptent)) - continue; - - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; - - if (pte_present(ptent)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; } - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. - */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + direct_reclaim = false; + break; } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. + */ + if (can_reclaim_pt && direct_reclaim && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1776,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1935,7 +2012,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_notifier_range range; struct mmu_gather tlb; - lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); @@ -2828,11 +2904,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -3015,7 +3091,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -4191,8 +4266,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -5632,7 +5709,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -6076,7 +6153,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6192,7 +6270,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r /* * Helper for page fault handling. * - * This is kind of equivalend to "mmap_read_lock()" followed + * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * @@ -6753,10 +6831,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif @@ -6968,7 +7044,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif |