summary refs log tree commit diff
path: root/mm/memory.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2022-05-26 22:32:41 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2022-05-26 22:32:41 +0300
commit 98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree 44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/memory.c
parent df202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parent f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
download linux-98931dd95fd489fcbfa97da563505a6f071d7c77.tar.xz
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Almost all of MM here. A few things are still getting finished off, reviewed, etc. - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages. - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis. - Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature. - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation. - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization. - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv. - David Vernet has done some fixup work on the memcg selftests. - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files. - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available. - Nadav Amit has done some optimization of TLB flushing during mprotect(). - Neil Brown continues to labor away at improving our swap-over-NFS support. - David Hildenbrand has some fixes to anon page COWing versus get_user_pages(). - Peng Liu fixed some errors in the core hugetlb code. - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps. - Some cleanups of the arch-specific pagemap code from Anshuman Khandual. - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages. - Roman Gushchin has done more work on the memcg selftests. ... and, of course, many smaller fixes and cleanups. 
Notably, the customary million cleanup serieses from Miaohe Lin" * tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits) mm: kfence: use PAGE_ALIGNED helper selftests: vm: add the "settings" file with timeout variable selftests: vm: add "test_hmm.sh" to TEST_FILES selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests selftests: vm: add migration to the .gitignore selftests/vm/pkeys: fix typo in comment ksm: fix typo in comment selftests: vm: add process_mrelease tests Revert "mm/vmscan: never demote for memcg reclaim" mm/kfence: print disabling or re-enabling message include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" mm: fix a potential infinite loop in start_isolate_page_range() MAINTAINERS: add Muchun as co-maintainer for HugeTLB zram: fix Kconfig dependency warning mm/shmem: fix shmem folio swapoff hang cgroup: fix an error handling path in alloc_pagecache_max_30M() mm: damon: use HPAGE_PMD_SIZE tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate nodemask.h: fix compilation error with GCC12 ...
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c 573
1 files changed, 403 insertions, 170 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2a12028a3749..54bcd5327b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -86,6 +86,7 @@
#include "pgalloc-track.h"
#include "internal.h"
+#include "swap.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
@@ -99,6 +100,8 @@ struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
+static vm_fault_t do_fault(struct vm_fault *vmf);
+
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -720,12 +723,14 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
+
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
*/
if (PageAnon(page))
- page_add_anon_rmap(page, vma, address, false);
+ page_add_anon_rmap(page, vma, address, RMAP_NONE);
else
/*
* Currently device exclusive access only supports anonymous
@@ -790,17 +795,23 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
+ /* Mark the swap entry as shared. */
+ if (pte_swp_exclusive(*src_pte)) {
+ pte = pte_swp_clear_exclusive(*src_pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
- if (is_writable_migration_entry(entry) &&
+ if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
- * COW mappings require pages in both
- * parent and child to be set to read.
+ * COW mappings require pages in both parent and child
+ * to be set to read. A previously exclusive entry is
+ * now shared.
*/
entry = make_readable_migration_entry(
swp_offset(entry));
@@ -825,7 +836,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
get_page(page);
rss[mm_counter(page)]++;
- page_dup_rmap(page, false);
+ /* Cannot fail as these pages cannot get pinned. */
+ BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
/*
* We do not preserve soft-dirty information, because so
@@ -854,6 +866,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (try_restore_exclusive_pte(src_pte, src_vma, addr))
return -EBUSY;
return -ENOENT;
+ } else if (is_pte_marker_entry(entry)) {
+ /*
+ * We should only be copying the pgtable because dst_vma has
+ * uffd-wp enabled; do a sanity check.
+ */
+ WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
@@ -862,19 +882,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/*
- * Copy a present and normal page if necessary.
+ * Copy a present and normal page.
*
- * NOTE! The usual case is that this doesn't need to do
- * anything, and can just return a positive value. That
- * will let the caller know that it can just increase
- * the page refcount and re-use the pte the traditional
- * way.
- *
- * But _if_ we need to copy it because it needs to be
- * pinned in the parent (and the child should get its own
- * copy rather than just a reference to the same page),
- * we'll do that here and return zero to let the caller
- * know we're done.
+ * NOTE! The usual case is that this isn't required;
+ * instead, the caller can just increase the page refcount
+ * and re-use the pte the traditional way.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
@@ -884,25 +896,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc, pte_t pte, struct page *page)
+ struct page **prealloc, struct page *page)
{
struct page *new_page;
-
- /*
- * What we want to do is to check whether this page may
- * have been pinned by the parent process. If so,
- * instead of wrprotect the pte on both sides, we copy
- * the page immediately so that we'll always guarantee
- * the pinned page won't be randomly replaced in the
- * future.
- *
- * The page pinning checks are just "has this mm ever
- * seen pinning", along with the (inexact) check of
- * the page count. That might give false positives for
- * for pinning, but it will work correctly.
- */
- if (likely(!page_needs_cow_for_dma(src_vma, page)))
- return 1;
+ pte_t pte;
new_page = *prealloc;
if (!new_page)
@@ -915,7 +912,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
*prealloc = NULL;
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ page_add_new_anon_rmap(new_page, dst_vma, addr);
lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
@@ -944,16 +941,24 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct page *page;
page = vm_normal_page(src_vma, addr, pte);
- if (page) {
- int retval;
-
- retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, pte, page);
- if (retval <= 0)
- return retval;
-
+ if (page && PageAnon(page)) {
+ /*
+ * If this page may have been pinned by the parent process,
+ * copy the page immediately for the child so that we'll always
+ * guarantee the pinned page won't be randomly replaced in the
+ * future.
+ */
get_page(page);
- page_dup_rmap(page, false);
+ if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
+ /* Page may be pinned, we have to copy. */
+ put_page(page);
+ return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ }
+ rss[mm_counter(page)]++;
+ } else if (page) {
+ get_page(page);
+ page_dup_file_rmap(page, false);
rss[mm_counter(page)]++;
}
@@ -965,6 +970,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
+ VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
/*
* If it's a shared mapping, mark it clean in
@@ -1222,6 +1228,38 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
return 0;
}
+/*
+ * Return true if the vma needs to copy the pgtable during this fork(). Return
+ * false when we can speed up fork() by allowing lazy page faults later until
+ * when the child accesses the memory range.
+ */
+static bool
+vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+ /*
+ * Always copy pgtables when dst_vma has uffd-wp enabled, even if it's
+ * file-backed (e.g. shmem). When uffd-wp is enabled, the pgtable
+ * contains uffd-wp protection information that we can't retrieve from
+ * the page cache, so skipping the copy would lose that information.
+ */
+ if (userfaultfd_wp(dst_vma))
+ return true;
+
+ if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP))
+ return true;
+
+ if (src_vma->anon_vma)
+ return true;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly. Fork
+ * becomes much lighter when there are big shared or private readonly
+ * mappings. The tradeoff is that copy_page_range is more efficient
+ * than faulting.
+ */
+ return false;
+}
+
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
@@ -1235,18 +1273,11 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
bool is_cow;
int ret;
- /*
- * Don't copy ptes where a page fault will fill them correctly.
- * Fork becomes much lighter when there are big shared or private
- * readonly mappings. The tradeoff is that copy_page_range is more
- * efficient than faulting.
- */
- if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !src_vma->anon_vma)
+ if (!vma_needs_copy(dst_vma, src_vma))
return 0;
if (is_vm_hugetlb_page(src_vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+ return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
@@ -1308,6 +1339,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ zap_flags_t zap_flags; /* Extra flags for zapping */
};
/* Whether we should zap all COWed (private) pages too */
@@ -1336,6 +1368,29 @@ static inline bool should_zap_page(struct zap_details *details, struct page *pag
return !PageAnon(page);
}
+static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+{
+ if (!details)
+ return false;
+
+ return details->zap_flags & ZAP_FLAG_DROP_MARKER;
+}
+
+/*
+ * This function makes sure that we'll replace the none pte with an uffd-wp
+ * swap special pte marker when necessary. Must be with the pgtable lock held.
+ */
+static inline void
+zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte,
+ struct zap_details *details, pte_t pteval)
+{
+ if (zap_drop_file_uffd_wp(details))
+ return;
+
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1373,6 +1428,8 @@ again:
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details,
+ ptent);
if (unlikely(!page))
continue;
@@ -1403,6 +1460,13 @@ again:
page = pfn_swap_entry_to_page(entry);
if (unlikely(!should_zap_page(details, page)))
continue;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous pages so far, so we don't need to
+ * consider the uffd-wp bit when zapping. For more
+ * information, see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
rss[mm_counter(page)]--;
if (is_device_private_entry(entry))
page_remove_rmap(page, vma, false);
@@ -1419,6 +1483,10 @@ again:
if (!should_zap_page(details, page))
continue;
rss[mm_counter(page)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /* Only drop the uffd-wp marker if explicitly requested */
+ if (!zap_drop_file_uffd_wp(details))
+ continue;
} else if (is_hwpoison_entry(entry)) {
if (!should_zap_cows(details))
continue;
@@ -1427,6 +1495,7 @@ again:
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
@@ -1605,8 +1674,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* safe to do nothing in this case.
*/
if (vma->vm_file) {
+ zap_flags_t zap_flags = details ?
+ details->zap_flags : 0;
i_mmap_lock_write(vma->vm_file->f_mapping);
- __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+ __unmap_hugepage_range_final(tlb, vma, start, end,
+ NULL, zap_flags);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
@@ -1637,12 +1709,17 @@ void unmap_vmas(struct mmu_gather *tlb,
unsigned long end_addr)
{
struct mmu_notifier_range range;
+ struct zap_details details = {
+ .zap_flags = ZAP_FLAG_DROP_MARKER,
+ /* Careful - we need to zap private pages too! */
+ .even_cows = true,
+ };
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
- unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+ unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2755,8 +2832,8 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
return same;
}
-static inline bool cow_user_page(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
bool ret;
void *kaddr;
@@ -2963,6 +3040,10 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
+
+ VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
+
/*
* Clear the pages cpupid information as the existing
* information potentially belongs to a now completely
@@ -2981,7 +3062,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
}
/*
- * Handle the case of a page which we actually need to copy to a new page.
+ * Handle the case of a page which we actually need to copy to a new page,
+ * either due to COW or unsharing.
*
* Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
@@ -2998,6 +3080,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
*/
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
@@ -3020,7 +3103,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (!new_page)
goto oom;
- if (!cow_user_page(new_page, old_page, vmf)) {
+ if (!__wp_page_copy_user(new_page, old_page, vmf)) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
@@ -3062,7 +3145,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(unshare)) {
+ if (pte_soft_dirty(vmf->orig_pte))
+ entry = pte_mksoft_dirty(entry);
+ if (pte_uffd_wp(vmf->orig_pte))
+ entry = pte_mkuffd_wp(entry);
+ } else {
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
/*
* Clear the pte entry and flush it first, before updating the
@@ -3072,13 +3162,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* some TLBs while the old PTE remains in others.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+ page_add_new_anon_rmap(new_page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
+ BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
@@ -3128,7 +3219,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
free_swap_cache(old_page);
put_page(old_page);
}
- return page_copied ? VM_FAULT_WRITE : 0;
+ return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
oom_free_new:
put_page(new_page);
oom:
@@ -3228,18 +3319,22 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * This routine handles present pages, when
+ * * users try to write to a shared page (FAULT_FLAG_WRITE)
+ * * GUP wants to take a R/O pin on a possibly shared anonymous page
+ * (FAULT_FLAG_UNSHARE)
+ *
+ * It is done by copying the page to a new address and decrementing the
+ * shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
+ * done any necessary COW.
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
+ * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
+ * though the page will change only once the write actually happens. This
+ * avoids a few races, and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
@@ -3248,23 +3343,35 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_WP);
- }
+ VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
- /*
- * Userfaultfd write-protect can defer flushes. Ensure the TLB
- * is flushed in this case before copying.
- */
- if (unlikely(userfaultfd_wp(vmf->vma) &&
- mm_tlb_flush_pending(vmf->vma->vm_mm)))
- flush_tlb_page(vmf->vma, vmf->address);
+ if (likely(!unshare)) {
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
+ }
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
+ if (unlikely(unshare)) {
+ /* No anonymous page -> nothing to do. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -3288,6 +3395,13 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct page *page = vmf->page;
/*
+ * If the page is exclusive to this process we must reuse the
+ * page without further checks.
+ */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
+ /*
* We have to verify under page lock: these early checks are
* just an optimization to avoid locking the page and freeing
* the swapcache if there is little hope that we can reuse.
@@ -3317,9 +3431,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* and the page is locked, it's dark out, and we're wearing
* sunglasses. Hit it.
*/
+ page_move_anon_rmap(page, vma);
unlock_page(page);
+reuse:
+ if (unlikely(unshare)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
+ } else if (unshare) {
+ /* No anonymous page -> nothing to do. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
@@ -3331,6 +3455,10 @@ copy:
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+#ifdef CONFIG_KSM
+ if (PageKsm(vmf->page))
+ count_vm_event(COW_KSM);
+#endif
return wp_page_copy(vmf);
}
@@ -3387,6 +3515,7 @@ void unmap_mapping_folio(struct folio *folio)
details.even_cows = false;
details.single_folio = folio;
+ details.zap_flags = ZAP_FLAG_DROP_MARKER;
i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
@@ -3508,6 +3637,59 @@ static inline bool should_try_to_free_swap(struct page *page,
page_count(page) == 2;
}
+static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
+{
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ /*
+ * Be careful so that we will only recover a special uffd-wp pte into a
+ * none pte. Otherwise it means the pte could have changed, so retry.
+ */
+ if (is_pte_marker(*vmf->pte))
+ pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+}
+
+/*
+ * This is actually a page-missing access, but with uffd-wp special pte
+ * installed. It means this pte was wr-protected before being unmapped.
+ */
+static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
+{
+ /*
+ * Just in case there are leftover special ptes even after the region
+ * got unregistered - we can simply clear them. We could also do that
+ * proactively, e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
+ * ranges, but it should be more efficient to do it lazily here.
+ */
+ if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+ return pte_marker_clear(vmf);
+
+ /* do_fault() can handle pte markers too like none pte */
+ return do_fault(vmf);
+}
+
+static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
+{
+ swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
+ unsigned long marker = pte_marker_get(entry);
+
+ /*
+ * PTE markers should always be with file-backed memories, and the
+ * marker should never be empty. If anything weird happened, the best
+ * thing to do is to kill the process along with its mm.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+ return VM_FAULT_SIGBUS;
+
+ if (pte_marker_entry_uffd_wp(entry))
+ return pte_marker_handle_uffd_wp(vmf);
+
+ /* This is an unknown pte marker */
+ return VM_FAULT_SIGBUS;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3521,10 +3703,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
struct swap_info_struct *si = NULL;
+ rmap_t rmap_flags = RMAP_NONE;
+ bool exclusive = false;
swp_entry_t entry;
pte_t pte;
int locked;
- int exclusive = 0;
vm_fault_t ret = 0;
void *shadow = NULL;
@@ -3544,6 +3727,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_pte_marker_entry(entry)) {
+ ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
@@ -3585,7 +3770,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
- swap_readpage(page, true);
+ swap_readpage(page, true, NULL);
set_page_private(page, 0);
}
} else {
@@ -3677,6 +3862,57 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
+ * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
+ * must never point at an anonymous page in the swapcache that is
+ * PG_anon_exclusive. Sanity check that this holds and especially, that
+ * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
+ * check after taking the PT lock and making sure that nobody
+ * concurrently faulted in this page and set PG_anon_exclusive.
+ */
+ BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+ BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
+ /*
+ * Check under PT lock (to protect against concurrent fork() sharing
+ * the swap entry concurrently) for certainly exclusive pages.
+ */
+ if (!PageKsm(page)) {
+ /*
+ * Note that pte_swp_exclusive() == false for architectures
+ * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
+ */
+ exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (page != swapcache) {
+ /*
+ * We have a fresh page that is not exposed to the
+ * swapcache -> certainly exclusive.
+ */
+ exclusive = true;
+ } else if (exclusive && PageWriteback(page) &&
+ data_race(si->flags & SWP_STABLE_WRITES)) {
+ /*
+ * This is tricky: not all swap backends support
+ * concurrent page modifications while under writeback.
+ *
+ * So if we stumble over such a page in the swapcache
+ * we must not set the page exclusive, otherwise we can
+ * map it writable without further checks and modify it
+ * while still under writeback.
+ *
+ * For these problematic swap backends, simply drop the
+ * exclusive marker: this is perfectly fine as we start
+ * writeback only if we fully unmapped the page and
+ * there are no unexpected references on the page after
+ * unmapping succeeded. After fully unmapped, no
+ * further GUP references (FOLL_GET and FOLL_PIN) can
+ * appear, so dropping the exclusive marker and mapping
+ * it only R/O is fine.
+ */
+ exclusive = false;
+ }
+ }
+
+ /*
* Remove the swap entry and conditionally try to free up the swapcache.
* We're already holding a reference on the page but haven't mapped it
* yet.
@@ -3690,16 +3926,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = mk_pte(page, vma->vm_page_prot);
/*
- * Same logic as in do_wp_page(); however, optimize for fresh pages
- * that are certainly not shared because we just allocated them without
- * exposing them to the swapcache.
+ * Same logic as in do_wp_page(); however, optimize for pages that are
+ * certainly not shared either because we just allocated them without
+ * exposing them to the swapcache or because the swap entry indicates
+ * exclusivity.
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
- (page != swapcache || page_count(page) == 1)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
- exclusive = RMAP_EXCLUSIVE;
+ if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ vmf->flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ }
+ rmap_flags |= RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
@@ -3712,12 +3950,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
+ VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
@@ -3862,7 +4101,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -4032,6 +4271,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
+ bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
@@ -4046,10 +4286,12 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(uffd_wp))
+ entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
@@ -4058,6 +4300,14 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
+static bool vmf_pte_changed(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
+ return !pte_same(*vmf->pte, vmf->orig_pte);
+
+ return !pte_none(*vmf->pte);
+}
+
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
@@ -4116,7 +4366,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
ret = 0;
/* Re-check under ptl */
- if (likely(pte_none(*vmf->pte)))
+ if (likely(!vmf_pte_changed(vmf)))
do_set_pte(vmf, page, vmf->address);
else
ret = VM_FAULT_NOPAGE;
@@ -4219,9 +4469,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}
+/*
+ * Return true if we should do read fault-around, false otherwise.
+ *
+ * All three conditions must hold: the VMA's vm_ops provide ->map_pages,
+ * uffd_disable_fault_around() does not veto it for this VMA, and
+ * fault_around_bytes covers more than one page.
+ */
+static inline bool should_fault_around(struct vm_fault *vmf)
+{
+ /* No ->map_pages? No way to fault around... */
+ if (!vmf->vma->vm_ops->map_pages)
+ return false;
+
+ if (uffd_disable_fault_around(vmf->vma)) /* NOTE(review): presumably a userfaultfd-registered VMA; confirm in the helper */
+ return false;
+
+ return fault_around_bytes >> PAGE_SHIFT > 1; /* parsed as (bytes >> PAGE_SHIFT) > 1, i.e. more than one page */
+}
+
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
@@ -4229,12 +4491,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
- if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- if (likely(!userfaultfd_minor(vmf->vma))) {
- ret = do_fault_around(vmf);
- if (ret)
- return ret;
- }
+ if (should_fault_around(vmf)) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
}
ret = __do_fault(vmf);
@@ -4504,8 +4764,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+
if (vma_is_anonymous(vmf->vma)) {
- if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+ if (likely(!unshare) &&
+ userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
@@ -4581,6 +4844,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;
+ vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
/*
* If a huge pmd materialized under us just retry later. Use
@@ -4604,6 +4868,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
+ vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
/*
* some architectures can have larger ptes than wordsize,
@@ -4640,10 +4905,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
- if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!pte_write(entry))
return do_wp_page(vmf);
- entry = pte_mkdirty(entry);
+ else if (likely(vmf->flags & FAULT_FLAG_WRITE))
+ entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
@@ -4684,7 +4950,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
@@ -4709,9 +4974,11 @@ retry_pud:
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- /* NUMA case for anonymous PUDs would go here */
-
- if (dirty && !pud_write(orig_pud)) {
+ /*
+ * TODO once we support anonymous PUDs: NUMA case and
+ * FAULT_FLAG_UNSHARE handling.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4749,7 +5016,8 @@ retry_pud:
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
- if (dirty && !pmd_write(vmf.orig_pmd)) {
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !pmd_write(vmf.orig_pmd)) {
ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4949,9 +5217,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4974,35 +5262,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_huge(*pmd)) {
- if (!pmdpp)
- goto out;
-
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
- NULL, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
- *ptlp = pmd_lock(mm, pmd);
- if (pmd_huge(*pmd)) {
- *pmdpp = pmd;
- return 0;
- }
- spin_unlock(*ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
- }
-
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5010,38 +5272,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
-
-/**
- * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
- *
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
- *
- * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
- * should be taken for read.
- *
- * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
- *
- * Return: zero on success, -ve otherwise.
- */
-int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
-{
- return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
-}
EXPORT_SYMBOL_GPL(follow_pte);
/**