diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-15 04:25:18 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-15 04:25:18 +0300 |
commit | a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch) | |
tree | 5a42ee9a668f171143464bc86013954c1bbe94ad /mm | |
parent | cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff) | |
parent | e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff) | |
download | linux-a57cb1c1d7974c62a5c80f7869e35b492ace12cd.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- a few misc things
- kexec updates
- DMA-mapping updates to better support networking DMA operations
- IPC updates
- various MM changes to improve DAX fault handling
- lots of radix-tree changes, mainly to the test suite. All leading up to reimplementing the IDA/IDR code to be a wrapper layer over the radix-tree. However the final trigger-pulling patch is held off for 4.11.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
radix tree test suite: delete unused rcupdate.c
radix tree test suite: add new tag check
radix-tree: ensure counts are initialised
radix tree test suite: cache recently freed objects
radix tree test suite: add some more functionality
idr: reduce the number of bits per level from 8 to 6
rxrpc: abstract away knowledge of IDR internals
tpm: use idr_find(), not idr_find_slowpath()
idr: add ida_is_empty
radix tree test suite: check multiorder iteration
radix-tree: fix replacement for multiorder entries
radix-tree: add radix_tree_split_preload()
radix-tree: add radix_tree_split
radix-tree: add radix_tree_join
radix-tree: delete radix_tree_range_tag_if_tagged()
radix-tree: delete radix_tree_locate_item()
radix-tree: improve multiorder iterators
btrfs: fix race in btrfs_free_dummy_fs_info()
radix-tree: improve dump output
radix-tree: make radix_tree_find_next_bit more useful
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 17 |
-rw-r--r-- | mm/filemap.c | 14 |
-rw-r--r-- | mm/gup.c | 20 |
-rw-r--r-- | mm/huge_memory.c | 173 |
-rw-r--r-- | mm/internal.h | 2 |
-rw-r--r-- | mm/khugepaged.c | 31 |
-rw-r--r-- | mm/memory.c | 859 |
-rw-r--r-- | mm/nommu.c | 10 |
-rw-r--r-- | mm/page-writeback.c | 28 |
-rw-r--r-- | mm/page_alloc.c | 14 |
-rw-r--r-- | mm/process_vm_access.c | 12 |
-rw-r--r-- | mm/shmem.c | 32 |
12 files changed, 643 insertions, 569 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 223464227299..949198d01260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page_count(page) > page_mapcount(page)) goto isolate_fail; + /* + * Only allow to migrate anonymous pages in GFP_NOFS context + * because those do not depend on fs locks. + */ + if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) + goto isolate_fail; + /* If we already hold the lock, we can skip some rechecking */ if (!locked) { locked = compact_trylock_irqsave(zone_lru_lock(zone), @@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio) { - int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* Check if the GFP flags allow compaction */ - if (!may_enter_fs || !may_perform_io) + /* + * Check if the GFP flags allow compaction - GFP_NOIO is really + * tricky context because the migration might require IO + */ + if (!may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); @@ -1751,6 +1760,7 @@ static void compact_node(int nid) .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, + .gfp_mask = GFP_KERNEL, }; @@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, + .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, diff --git a/mm/filemap.c b/mm/filemap.c index b06517b7f97f..32be3c8f3a11 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2164,12 +2164,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault 
*vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2225,11 +2225,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2239,7 +2239,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. 
+ * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..10eedbf14421 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - 
pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - 
spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -879,30 +879,30 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, + vmf->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + vmf->address, 
page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -993,7 
+993,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -1005,23 +1005,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1053,12 +1053,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - 
ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1070,7 +1070,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1090,11 +1090,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1102,12 +1102,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1117,13 +1117,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } @@ -1196,12 +1196,12 @@ out: } /* 
NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. 
*/ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. 
*/ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1304,18 +1304,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..44d68895a9b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..e32389a97030 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, 
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; - if (!is_swap_pte(pteval)) + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1546,7 +1546,6 @@ tree_unlocked: /* Put holes back where they were */ radix_tree_delete(&mapping->page_tree, iter.index); - slot = radix_tree_iter_next(&iter); continue; } @@ -1557,11 +1556,11 @@ tree_unlocked: page_ref_unfreeze(page, 2); radix_tree_replace_slot(&mapping->page_tree, slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); 
unlock_page(page); spin_lock_irq(&mapping->tree_lock); - slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); @@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/memory.c b/mm/memory.c index 08d8da39de28..455c3e628d52 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. 
+ */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. 
*/ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(page); - - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); - put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return VM_FAULT_WRITE; + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } /* @@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. 
*/ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2198,8 @@ static int wp_page_copy(struct 
fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2263,79 +2269,91 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). 
+ */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return VM_FAULT_NOPAGE; + } + wp_page_reuse(vmf); + return 0; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, fe->address), - .virtual_address = - (void __user *)(fe->address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. 
- */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) - __releases(fe->ptl) +static int wp_page_shared(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - int page_mkwrite = 0; + struct vm_area_struct *vma = vmf->vma; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); - return 0; + tmp = finish_mkwrite_fault(vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + unlock_page(vmf->page); + put_page(vmf->page); + return tmp; } - page_mkwrite = 1; + } else { + wp_page_reuse(vmf); + lock_page(vmf->page); } + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return VM_FAULT_WRITE; } /* @@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - struct page *old_page; + struct vm_area_struct *vma = vmf->vma; - old_page = vm_normal_page(vma, fe->address, orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + lock_page(vmf->page); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + unlock_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. 
Move it to @@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + unlock_page(vmf->page); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ - get_page(old_page); + get_page(vmf->page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). 
*/ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. 
*/ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. 
*/ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + 
update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. 
*/ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2863,50 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). 
*/ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + ret = vma->vm_ops->fault(vma, vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) return ret; - } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf->page); + put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else 
if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2921,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. 
*/ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; @@ -3005,13 +3013,13 @@ out: * withdraw with pmd lock held. */ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + vmf->pmd); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. 
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) goto fault_handled; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { + if (unlikely(!pte_none(*vmf->pte))) { ret = VM_FAULT_NOPAGE; goto fault_handled; } @@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); ret = 0; fault_handled: /* preallocated pagetable is unused: free it */ - if 
(fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; + if (vmf->prealloc_pte) { + pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; } return ret; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. 
*/ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? 
*/ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; + struct vm_area_struct *vma = vmf->vma; int ret = 0; /* @@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - unlock_page(fault_page); + ret |= finish_fault(vmf); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page, *new_page; - void *fault_entry; - struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) 
return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - put_page(new_page); + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + &vmf->memcg, false)) { + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); - } + ret |= finish_fault(vmf); + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + put_page(vmf->cow_page); return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; - struct address_space *mapping; - int dirtied = 0; + struct vm_area_struct *vma = vmf->vma; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - 
tmp = do_page_mkwrite(vma, fault_page, fe->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } @@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). 
*/ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return do_read_fault(vmf); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + return do_cow_fault(vmf); + return do_shared_fault(vmf); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. 
*/ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3476,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int 
wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). 
*/ - fe->pte = pte_offset_map(fe->pmd, fe->address); - - entry = *fe->pte; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + if (pte_none(vmf->orig_pte)) { + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(fe, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids 
useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,10 +3613,12 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* @@ -3808,8 +3817,8 
@@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp) { int res; @@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/mm/nommu.c b/mm/nommu.c index 27bc543128e5..210d7ec2843c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e3b472..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define 
WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bcb43b7..2c6d5f64feca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } +void __page_frag_drain(struct page *page, unsigned int order, + unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) { + if (order == 0) + free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +} +EXPORT_SYMBOL(__page_frag_drain); + void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - 
unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..54287d443806 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. 
*/ @@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); |