diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 156 |
1 files changed, 76 insertions, 80 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -171,12 +171,7 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static struct page *huge_zero_page __read_mostly; - -static inline bool is_huge_zero_page(struct page *page) -{ - return ACCESS_ONCE(huge_zero_page) == page; -} +struct page *huge_zero_page __read_mostly; static inline bool is_huge_zero_pmd(pmd_t pmd) { @@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - /* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { + gfp_t gfp; struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return 0; } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - else + !transparent_hugepage_debug_cow()) { + gfp_t gfp; + + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + } else new_page = NULL; if (unlikely(!new_page)) { @@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; page = pmd_page(*pmd); @@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * check_same as the page may no longer be mapped. */ if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); spin_unlock(ptl); - wait_migrate_huge_page(vma->anon_vma, pmdp); + wait_on_page_locked(page); goto out; } @@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Migrate the THP to the requested node, returns with page unlocked - * and pmd_numa cleared. + * and access rights restored. */ spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - pmd = pmd_mknonnuma(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } -int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned char *vec) -{ - spinlock_t *ptl; - int ret = 0; - - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - /* - * All logical pages in the range are present - * if backed by a huge page. - */ - spin_unlock(ptl); - memset(vec, 1, (end - addr) >> PAGE_SHIFT); - ret = 1; - } - - return ret; -} - int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, @@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; - ret = 1; - if (!prot_numa) { + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!prot_numa || !pmd_protnone(*pmd)) { + ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_numa(entry)) - entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); - } else { - struct page *page = pmd_page(*pmd); - - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. - */ - if (!is_huge_zero_page(page) && - !pmd_numa(*pmd)) { - pmdp_set_numa(mm, addr, pmd); - ret = HPAGE_PMD_NR; - } } spin_unlock(ptl); } @@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, pte_t *pte, entry; BUG_ON(PageCompound(page+i)); /* - * Note that pmd_numa is not transferred deliberately - * to avoid any possibility that pte_numa leaks to - * a PROT_NONE VMA by accident. + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int referenced = 0, none = 0; + int none = 0; + bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; @@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, else goto out; } - if (!pte_present(pteval) || !pte_write(pteval)) + if (!pte_present(pteval)) goto out; page = vm_normal_page(vma, address, pteval); if (unlikely(!page)) @@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) - goto out; /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock @@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (!trylock_page(page)) goto out; + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + unlock_page(page); + goto out; + } + if (pte_write(pteval)) { + writable = true; + } else { + if (PageSwapCache(page) && !reuse_swap_page(page)) { + unlock_page(page); + goto out; + } + /* + * Page is not in the swap cache. It can be collapsed + * into a THP. + */ + } + /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. @@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, /* If there is no mapped pte young don't collapse the page */ if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) - referenced = 1; + referenced = true; } - if (likely(referenced)) + if (likely(referenced && writable)) return 1; out: release_pte_pages(pte, _pte); @@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, referenced = 0, none = 0; + int ret = 0, none = 0; struct page *page; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE; + bool writable = false, referenced = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, else goto out_unmap; } - if (!pte_present(pteval) || !pte_write(pteval)) + if (!pte_present(pteval)) goto out_unmap; + if (pte_write(pteval)) + writable = true; + page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; @@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; - /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) - referenced = 1; + referenced = true; } - if (referenced) + if (referenced && writable) ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); |