summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c315
1 files changed, 209 insertions, 106 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 75c2dfd04f72..b4d3d4893267 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,6 +76,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -1436,7 +1437,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
@@ -1466,34 +1467,42 @@ static inline bool zap_drop_markers(struct zap_details *details)
/*
* This function makes sure that we'll replace the none pte with an uffd-wp
* swap special pte marker when necessary. Must be with the pgtable lock held.
+ *
+ * Returns true if uffd-wp ptes was installed, false otherwise.
*/
-static inline void
+static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
+ bool was_installed = false;
+
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
- return;
+ return false;
if (zap_drop_markers(details))
- return;
+ return false;
for (;;) {
/* the PFN in the PTE is irrelevant. */
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
+ was_installed = true;
if (--nr == 0)
break;
pte++;
addr += PAGE_SIZE;
}
+#endif
+ return was_installed;
}
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
unsigned long addr, struct zap_details *details, int *rss,
- bool *force_flush, bool *force_break)
+ bool *force_flush, bool *force_break, bool *any_skipped)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
@@ -1519,8 +1528,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
- ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
+ nr, details, ptent);
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
@@ -1544,7 +1553,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ bool *force_break, bool *any_skipped)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
@@ -1559,15 +1568,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
if (userfaultfd_pte_wp(vma, ptent))
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
- details, ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
+ pte, 1, details, ptent);
ksm_might_unmap_zero_page(mm, ptent);
return 1;
}
folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
+ if (unlikely(!should_zap_folio(details, folio))) {
+ *any_skipped = true;
return 1;
+ }
/*
* Make sure that the common "small folio" case is as fast as possible
@@ -1579,14 +1590,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
- force_break);
+ force_break, any_skipped);
return nr;
}
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
- details, rss, force_flush, force_break);
+ details, rss, force_flush, force_break, any_skipped);
return 1;
}
+static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *any_skipped)
+{
+ swp_entry_t entry;
+ int nr = 1;
+
+ *any_skipped = true;
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(page);
+
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous page so far, so we don't need to
+ * consider uffd-wp bit when zap. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ rss[mm_counter(folio)]--;
+ if (is_device_private_entry(entry))
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entries, hence a private anon pages */
+ if (!should_zap_cows(details))
+ return 1;
+
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ struct folio *folio = pfn_swap_entry_folio(entry);
+
+ if (!should_zap_folio(details, folio))
+ return 1;
+ rss[mm_counter(folio)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
+ return 1;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
+ return 1;
+ } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ if (!should_zap_cows(details))
+ return 1;
+ } else {
+ /* We should have covered all the swap entry types */
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+ clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+
+ return nr;
+}
+
+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break,
+ bool *any_skipped)
+{
+ pte_t ptent = ptep_get(pte);
+ int max_nr = (end - addr) / PAGE_SIZE;
+ int nr = 0;
+
+ /* Skip all consecutive none ptes */
+ if (pte_none(ptent)) {
+ for (nr = 1; nr < max_nr; nr++) {
+ ptent = ptep_get(pte + nr);
+ if (!pte_none(ptent))
+ break;
+ }
+ max_nr -= nr;
+ if (!max_nr)
+ return nr;
+ pte += nr;
+ addr += nr * PAGE_SIZE;
+ }
+
+ if (pte_present(ptent))
+ nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, force_flush, force_break,
+ any_skipped);
+ else
+ nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, any_skipped);
+
+ return nr;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1598,9 +1716,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ pmd_t pmdval;
+ unsigned long start = addr;
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true;
int nr;
+retry:
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1610,90 +1732,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = ptep_get(pte);
- struct folio *folio;
- struct page *page;
- int max_nr;
-
- nr = 1;
- if (pte_none(ptent))
- continue;
+ bool any_skipped = false;
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
-
- if (pte_present(ptent)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
- addr, details, rss, &force_flush,
- &force_break);
- if (unlikely(force_break)) {
- addr += nr * PAGE_SIZE;
- break;
- }
- continue;
}
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
- folio_put(folio);
- } else if (!non_swap_entry(entry)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = swap_pte_batch(pte, max_nr, ptent);
- /* Genuine swap entries, hence a private anon pages */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
- if (!should_zap_folio(details, folio))
- continue;
- rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /*
- * For anon: always drop the marker; for file: only
- * drop the marker if explicitly requested.
- */
- if (!vma_is_anonymous(vma) &&
- !zap_drop_markers(details))
- continue;
- } else if (is_guard_swp_entry(entry)) {
- /*
- * Ordinary zapping should not remove guard PTE
- * markers. Only do so if we should remove PTE markers
- * in general.
- */
- if (!zap_drop_markers(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
- WARN_ON_ONCE(1);
+ nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
+ &force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false;
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
+ break;
}
- clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1713,6 +1780,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush)
tlb_flush_mmu(tlb);
+ if (addr != end) {
+ cond_resched();
+ force_flush = false;
+ force_break = false;
+ goto retry;
+ }
+
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval);
+ else
+ try_to_free_pte(mm, pmd, start, tlb);
+ }
+
return addr;
}
@@ -1935,7 +2016,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
@@ -3013,7 +3093,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
@@ -4189,8 +4268,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
return folio;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
}
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -4733,12 +4814,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
folio_throttle_swaprate(folio, gfp);
/*
* When a folio is not zeroed during allocation
- * (__GFP_ZERO not used), folio_zero_user() is used
- * to make sure that the page corresponding to the
- * faulting address will be hot in the cache after
- * zeroing.
+ * (__GFP_ZERO not used) or user folios require special
+ * handling, folio_zero_user() is used to make sure
+ * that the page corresponding to the faulting address
+ * will be hot in the cache after zeroing.
*/
- if (!alloc_zeroed())
+ if (user_alloc_needs_zeroing())
folio_zero_user(folio, vmf->address);
return folio;
}
@@ -5625,7 +5706,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ignore_writable = true;
/* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
task_numa_fault(last_cpupid, nid, nr_pages, flags);
@@ -5662,8 +5743,17 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
+ /*
+ * Currently we just emit PAGE_SIZE for our fault events, so don't allow
+ * a huge fault if we have a pre content watch on this file. This would
+ * be trivial to support, but there would need to be tests to ensure
+ * this works properly and those don't exist currently.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5687,6 +5777,9 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5709,6 +5802,9 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5726,6 +5822,9 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ /* See comment in create_huge_pmd. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
+ goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -6069,7 +6168,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * By the time we get here, we already hold either the VMA lock or the
+ * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
@@ -6185,7 +6285,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r
/*
* Helper for page fault handling.
*
- * This is kind of equivalend to "mmap_read_lock()" followed
+ * This is kind of equivalent to "mmap_read_lock()" followed
* by "find_extend_vma()", except it's a lot more careful about
* the locking (and will drop the lock on failure).
*
@@ -6815,9 +6915,10 @@ static inline int process_huge_page(
return 0;
}
-static void clear_gigantic_page(struct folio *folio, unsigned long addr,
+static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
unsigned int nr_pages)
{
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
int i;
might_sleep();
@@ -6851,13 +6952,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
}
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
- unsigned long addr,
+ unsigned long addr_hint,
struct vm_area_struct *vma,
unsigned int nr_pages)
{
- int i;
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
struct page *dst_page;
struct page *src_page;
+ int i;
for (i = 0; i < nr_pages; i++) {
dst_page = folio_page(dst, i);
@@ -6959,7 +7061,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
+ if (ptdesc->ptl)
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif