Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--	mm/khugepaged.c	148
1 file changed, 81 insertions(+), 67 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6f8d46d107b4..a55fb1dcd224 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
+#include <linux/dax.h>
 #include <linux/ksm.h>
 
 #include <asm/tlb.h>
@@ -346,7 +347,7 @@ struct attribute_group khugepaged_attr_group = {
 #endif /* CONFIG_SYSFS */
 
 int hugepage_madvise(struct vm_area_struct *vma,
-		     unsigned long *vm_flags, int advice)
+		     vm_flags_t *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
@@ -469,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm)
 }
 
 void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  unsigned long vm_flags)
+			  vm_flags_t vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_pmd_enabled()) {
@@ -547,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
 	}
 }
 
-static bool is_refcount_suitable(struct folio *folio)
-{
-	int expected_refcount = folio_mapcount(folio);
-
-	if (!folio_test_anon(folio) || folio_test_swapcache(folio))
-		expected_refcount += folio_nr_pages(folio);
-
-	if (folio_test_private(folio))
-		expected_refcount++;
-
-	return folio_ref_count(folio) == expected_refcount;
-}
-
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
@@ -606,7 +594,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
 
 		/* See hpage_collapse_scan_pmd(). */
-		if (folio_likely_mapped_shared(folio)) {
+		if (folio_maybe_mapped_shared(folio)) {
 			++shared;
 			if (cc->is_khugepaged &&
 			    shared > khugepaged_max_ptes_shared) {
@@ -651,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 * but not from this process. The other process cannot write to
 		 * the page, only trigger CoW.
 		 */
-		if (!is_refcount_suitable(folio)) {
+		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
 			folio_unlock(folio);
 			result = SCAN_PAGE_COUNT;
 			goto out;
@@ -695,13 +683,13 @@ next:
 		result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
 		result = SCAN_SUCCEED;
-		trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+		trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
 						    referenced, writable, result);
 		return result;
 	}
 out:
 	release_pte_pages(pte, _pte, compound_pagelist);
-	trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+	trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
 					    referenced, writable, result);
 	return result;
 }
@@ -745,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 			ptep_clear(vma->vm_mm, address, _pte);
 			folio_remove_rmap_pte(src, src_page, vma);
 			spin_unlock(ptl);
-			free_page_and_swap_cache(src_page);
+			free_folio_and_swap_cache(src);
 		}
 	}
 
@@ -947,30 +935,40 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	return SCAN_SUCCEED;
 }
 
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
-				   unsigned long address,
-				   pmd_t **pmd)
+static inline int check_pmd_state(pmd_t *pmd)
 {
-	pmd_t pmde;
-
-	*pmd = mm_find_pmd(mm, address);
-	if (!*pmd)
-		return SCAN_PMD_NULL;
+	pmd_t pmde = pmdp_get_lockless(pmd);
 
-	pmde = pmdp_get_lockless(*pmd);
 	if (pmd_none(pmde))
 		return SCAN_PMD_NONE;
+
+	/*
+	 * The folio may be under migration when khugepaged is trying to
+	 * collapse it. Migration success or failure will eventually end
+	 * up with a present PMD mapping a folio again.
+	 */
+	if (is_pmd_migration_entry(pmde))
+		return SCAN_PMD_MAPPED;
 	if (!pmd_present(pmde))
 		return SCAN_PMD_NULL;
 	if (pmd_trans_huge(pmde))
 		return SCAN_PMD_MAPPED;
-	if (pmd_devmap(pmde))
-		return SCAN_PMD_NULL;
 	if (pmd_bad(pmde))
 		return SCAN_PMD_NULL;
 	return SCAN_SUCCEED;
 }
 
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+				   unsigned long address,
+				   pmd_t **pmd)
+{
+	*pmd = mm_find_pmd(mm, address);
+	if (!*pmd)
+		return SCAN_PMD_NULL;
+
+	return check_pmd_state(*pmd);
+}
+
 static int check_pmd_still_valid(struct mm_struct *mm,
 				 unsigned long address,
 				 pmd_t *pmd)
@@ -1234,7 +1232,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	__folio_mark_uptodate(folio);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
 	spin_lock(pmd_ptl);
@@ -1354,11 +1352,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 
 		/*
 		 * We treat a single page as shared if any part of the THP
-		 * is shared. "False negatives" from
-		 * folio_likely_mapped_shared() are not expected to matter
-		 * much in practice.
+		 * is shared.
 		 */
-		if (folio_likely_mapped_shared(folio)) {
+		if (folio_maybe_mapped_shared(folio)) {
 			++shared;
 			if (cc->is_khugepaged &&
 			    shared > khugepaged_max_ptes_shared) {
@@ -1399,7 +1395,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 * has excessive GUP pins (i.e. 512). Anyway the same check
 		 * will be done again later the risk seems low.
 		 */
-		if (!is_refcount_suitable(folio)) {
+		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
@@ -1432,7 +1428,7 @@ out_unmap:
 		*mmap_locked = false;
 	}
 out:
-	trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+	trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
 				     none_or_zero, result, unmapped);
 	return result;
 }
@@ -1461,10 +1457,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 	}
 }
 
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
-			pmd_t *pmdp, struct page *hpage)
+			pmd_t *pmdp, struct folio *folio, struct page *page)
 {
 	struct vm_fault vmf = {
 		.vma = vma,
@@ -1473,13 +1468,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 		.pmd = pmdp,
 	};
 
-	VM_BUG_ON(!PageTransHuge(hpage));
 	mmap_assert_locked(vma->vm_mm);
 
-	if (do_set_pmd(&vmf, hpage))
+	if (do_set_pmd(&vmf, folio, page))
 		return SCAN_FAIL;
 
-	get_page(hpage);
+	folio_get(folio);
 	return SCAN_SUCCEED;
 }
 
@@ -1686,7 +1680,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 maybe_install_pmd:
 	/* step 5: install pmd entry */
 	result = install_pmd
-			? set_huge_pmd(vma, haddr, pmd, &folio->page)
+			? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
 			: SCAN_SUCCEED;
 	goto drop_folio;
 abort:
@@ -1720,7 +1714,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		pmd_t *pmd, pgt_pmd;
 		spinlock_t *pml;
 		spinlock_t *ptl;
-		bool skipped_uffd = false;
+		bool success = false;
 
 		/*
 		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1757,6 +1751,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		mmu_notifier_invalidate_range_start(&range);
 
 		pml = pmd_lock(mm, pmd);
+		/*
+		 * The lock of new_folio is still held, we will be blocked in
+		 * the page fault path, which prevents the pte entries from
+		 * being set again. So even though the old empty PTE page may be
+		 * concurrently freed and a new PTE page is filled into the pmd
+		 * entry, it is still empty and can be removed.
+		 *
+		 * So here we only need to recheck if the state of pmd entry
+		 * still meets our requirements, rather than checking pmd_same()
+		 * like elsewhere.
+		 */
+		if (check_pmd_state(pmd) != SCAN_SUCCEED)
+			goto drop_pml;
 		ptl = pte_lockptr(mm, pmd);
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
@@ -1770,20 +1777,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * repeating the anon_vma check protects from one category,
 		 * and repeating the userfaultfd_wp() check from another.
 		 */
-		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
-			skipped_uffd = true;
-		} else {
+		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
 			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			pmdp_get_lockless_sync();
+			success = true;
 		}
 
 		if (ptl != pml)
 			spin_unlock(ptl);
+drop_pml:
 		spin_unlock(pml);
 
 		mmu_notifier_invalidate_range_end(&range);
 
-		if (!skipped_uffd) {
+		if (success) {
 			mm_dec_nr_ptes(mm);
 			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
 			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
@@ -1837,6 +1844,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	if (result != SCAN_SUCCEED)
 		goto out;
 
+	mapping_set_update(&xas, mapping);
+
 	__folio_set_locked(new_folio);
 	if (is_shmem)
 		__folio_set_swapbacked(new_folio);
@@ -2277,6 +2286,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 			continue;
 		}
 
+		if (!folio_try_get(folio)) {
+			xas_reset(&xas);
+			continue;
+		}
+
+		if (unlikely(folio != xas_reload(&xas))) {
+			folio_put(folio);
+			xas_reset(&xas);
+			continue;
+		}
+
 		if (folio_order(folio) == HPAGE_PMD_ORDER &&
 		    folio->index == start) {
 			/* Maybe PMD-mapped */
@@ -2287,23 +2307,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 			 * it's safe to skip LRU and refcount checks before
 			 * returning.
 			 */
+			folio_put(folio);
 			break;
 		}
 
 		node = folio_nid(folio);
 		if (hpage_collapse_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
+			folio_put(folio);
 			break;
 		}
 		cc->node_load[node]++;
 
 		if (!folio_test_lru(folio)) {
 			result = SCAN_PAGE_LRU;
+			folio_put(folio);
 			break;
 		}
 
-		if (!is_refcount_suitable(folio)) {
+		if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
 			result = SCAN_PAGE_COUNT;
+			folio_put(folio);
 			break;
 		}
 
@@ -2315,6 +2339,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 		 */
 		present += folio_nr_pages(folio);
+		folio_put(folio);
 
 		if (need_resched()) {
 			xas_pause(&xas);
@@ -2336,14 +2361,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
 	return result;
 }
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
-				    struct file *file, pgoff_t start,
-				    struct collapse_control *cc)
-{
-	BUILD_BUG();
-}
-#endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 					    struct collapse_control *cc)
@@ -2419,7 +2436,7 @@ skip:
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
 				  hend);
-			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+			if (!vma_is_anonymous(vma)) {
 				struct file *file = get_file(vma->vm_file);
 				pgoff_t pgoff = linear_page_index(vma,
						khugepaged_scan.address);
@@ -2718,8 +2735,8 @@ static int madvise_collapse_errno(enum scan_result r)
 	}
 }
 
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end, bool *lock_dropped)
 {
 	struct collapse_control *cc;
 	struct mm_struct *mm = vma->vm_mm;
@@ -2730,8 +2747,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	BUG_ON(vma->vm_start > start);
 	BUG_ON(vma->vm_end < end);
 
-	*prev = vma;
-
 	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return -EINVAL;
 
@@ -2765,7 +2780,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		mmap_assert_locked(mm);
 		memset(cc->node_load, 0, sizeof(cc->node_load));
 		nodes_clear(cc->alloc_nmask);
-		if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+		if (!vma_is_anonymous(vma)) {
 			struct file *file = get_file(vma->vm_file);
 			pgoff_t pgoff = linear_page_index(vma, addr);
 
@@ -2779,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 						  &mmap_locked, cc);
 		}
 		if (!mmap_locked)
-			*prev = NULL;	/* Tell caller we dropped mmap_lock */
+			*lock_dropped = true;
 
 handle_result:
 		switch (result) {
@@ -2789,7 +2804,6 @@ handle_result:
 			break;
 		case SCAN_PTE_MAPPED_HUGEPAGE:
 			BUG_ON(mmap_locked);
-			BUG_ON(*prev);
 			mmap_read_lock(mm);
 			result = collapse_pte_mapped_thp(mm, addr, true);
 			mmap_read_unlock(mm);
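The hpage_collapse_scan_file() hunks above switch the RCU scan to the usual speculative-get pattern: pin the folio with folio_try_get(), confirm the XArray slot still points at it with xas_reload(), and drop the pin with folio_put() on every exit path. The sketch below only illustrates that pattern; the helper name and return values are hypothetical and not part of this patch. It also shows why the file path compares against folio_expected_ref_count(folio) + 1: the scanner itself now holds one extra reference, while the anon path takes no pin and compares without it.

/*
 * Illustrative sketch only -- hypothetical helper, not code from this
 * patch. It mirrors the folio_try_get()/xas_reload()/folio_put()
 * sequence added to hpage_collapse_scan_file() above.
 */
static int scan_one_folio_sketch(struct xa_state *xas, struct folio *folio)
{
	/* Speculative pin: fails if the folio is already being freed. */
	if (!folio_try_get(folio))
		goto retry;

	/* The slot may have been replaced between lookup and the pin. */
	if (unlikely(folio != xas_reload(xas))) {
		folio_put(folio);
		goto retry;
	}

	/*
	 * We hold one reference of our own, hence the "+ 1" in the
	 * comparison against the expected reference count.
	 */
	if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
		folio_put(folio);
		return SCAN_PAGE_COUNT;
	}

	folio_put(folio);
	return SCAN_SUCCEED;

retry:
	xas_reset(xas);
	return -EAGAIN;		/* hypothetical: caller rewalks this index */
}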