diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 240 |
1 files changed, 162 insertions, 78 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097c7a4bfbd9..4b06b8db9df2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> @@ -23,6 +24,8 @@ #include <linux/pagemap.h> #include <linux/migrate.h> #include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> +#include <linux/page_idle.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -104,7 +107,7 @@ static struct khugepaged_scan khugepaged_scan = { }; -static int set_recommended_min_free_kbytes(void) +static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; @@ -139,7 +142,6 @@ static int set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } setup_per_zone_wmarks(); - return 0; } static int start_stop_khugepaged(void) @@ -171,12 +173,7 @@ fail: static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -static inline bool is_huge_zero_pmd(pmd_t pmd) -{ - return is_huge_zero_page(pmd_page(pmd)); -} - -static struct page *get_huge_zero_page(void) +struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -716,21 +713,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + unsigned long address, pmd_t *pmd, + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +753,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +778,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; + } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +865,51 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); +} + +static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t entry; + spinlock_t *ptl; + + ptl = pmd_lock(mm, pmd); + if (pmd_none(*pmd)) { + entry = pmd_mkhuge(pfn_pmd(pfn, prot)); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); } + spin_unlock(ptl); +} - count_vm_event(THP_FAULT_ALLOC); - return 0; +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pmd_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + if (track_pfn_insert(vma, &pgprot, pfn)) + return VM_FAULT_SIGBUS; + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + return VM_FAULT_NOPAGE; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +944,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -1387,41 +1456,41 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + pmd_t orig_pmd; spinlock_t *ptl; - int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - struct page *page; - pgtable_t pgtable; - pmd_t orig_pmd; - /* - * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_huge_get_and_clear. So do the - * pgtable_trans_huge_withdraw after finishing pmdp related - * operations. - */ - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); - tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - if (is_huge_zero_pmd(orig_pmd)) { - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); + if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + if (is_huge_zero_pmd(orig_pmd)) put_huge_zero_page(); - } else { - page = pmd_page(orig_pmd); - page_remove_rmap(page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON_PAGE(!PageHead(page), page); - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); - tlb_remove_page(tlb, page); - } - pte_free(tlb->mm, pgtable); - ret = 1; + } else if (is_huge_zero_pmd(orig_pmd)) { + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); + } else { + struct page *page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON_PAGE(!PageHead(page), page); + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + tlb_remove_page(tlb, page); } - return ret; + return 1; } int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, @@ -1689,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page, /* clear PageTail before overwriting first_page */ smp_wmb(); + if (page_is_young(page)) + set_page_young(page_tail); + if (page_is_idle(page)) + set_page_idle(page_tail); + /* * __split_huge_page_splitting() already set the * splitting bit in all pmd that could map this @@ -2133,7 +2207,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2193,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2257,8 +2333,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void khugepaged_alloc_sleep(void) { - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + freezable_schedule_timeout_interruptible( + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); } static int khugepaged_node_load[MAX_NUMNODES]; @@ -2345,7 +2425,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); + *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2586,7 +2666,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; @@ -2619,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, */ if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2882,7 +2964,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2895,25 +2977,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; - } - if (is_huge_zero_pmd(*pmd)) { + if (unlikely(!pmd_trans_huge(*pmd))) + goto unlock; + if (vma_is_dax(vma)) { + pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + if (is_huge_zero_pmd(_pmd)) + put_huge_zero_page(); + } else if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; + } else { + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + get_page(page); } - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + unlock: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - split_huge_page(page); + if (!page) + return; + split_huge_page(page); put_page(page); /* @@ -2962,7 +3046,7 @@ static void split_huge_page_address(struct mm_struct *mm, split_huge_page_pmd_mm(mm, address, pmd); } -void __vma_adjust_trans_huge(struct vm_area_struct *vma, +void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) |