summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- include/linux/mm.h | 1
-rw-r--r-- mm/Kconfig | 15
-rw-r--r-- mm/Makefile | 1
-rw-r--r-- mm/internal.h | 19
-rw-r--r-- mm/madvise.c | 7
-rw-r--r-- mm/memory.c | 21
-rw-r--r-- mm/pt_reclaim.c | 71
7 files changed, 132 insertions, 3 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e5ef71b8629..9372bc058b43 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void);
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ bool reclaim_pt; /* Try to free PTE page tables the zap leaves empty? */
zap_flags_t zap_flags; /* Extra flags for zapping */
};
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..7949ab121070 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1301,6 +1301,21 @@ config ARCH_HAS_USER_SHADOW_STACK
The architecture has hardware support for userspace shadow call
stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+config ARCH_SUPPORTS_PT_RECLAIM
+ def_bool n
+
+config PT_RECLAIM
+ bool "reclaim empty user page table pages"
+ default y
+ depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
+ select MMU_GATHER_RCU_TABLE_FREE
+ help
+ Try to reclaim empty user page table pages in paths other than the
+ munmap and exit_mmap paths.
+
+ Note: currently only empty user PTE page table pages are reclaimed.
+
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index dba52bb0da8a..850386a67b3e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/internal.h b/mm/internal.h
index 02890b29da5f..b438d35045de 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1541,4 +1541,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+/*
+ * pt_reclaim.c - helpers that free a PTE page table once zapping has left
+ * it empty. mm/Makefile builds pt_reclaim.o only when CONFIG_PT_RECLAIM is
+ * set, yet the three prototypes below are visible unconditionally.
+ * NOTE(review): callers presumably rely on the reclaim_pt_is_enabled() stub
+ * further down returning false so that their calls are discarded when the
+ * option is off - confirm this still links with CONFIG_PT_RECLAIM=n.
+ */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval);
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb);
+
+#ifdef CONFIG_PT_RECLAIM
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details);
+#else /* !CONFIG_PT_RECLAIM: reclaim is compiled out, never report it enabled */
+static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return false;
+}
+#endif /* CONFIG_PT_RECLAIM */
+
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..49f3a75046f6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true, /* opt in to freeing PTE page tables the zap empties */
+ .even_cows = true, /* keep zapping COWed private pages, as the old NULL details did */
+ };
+
+ zap_page_range_single(vma, start, end - start, &details);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index d4d5bd7046e7..560520e20ead 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
@@ -1710,12 +1710,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct zap_details *details)
{
bool force_flush = false, force_break = false;
- bool any_skipped = false;
struct mm_struct *mm = tlb->mm;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
+ pmd_t pmdval; /* only valid once try_get_and_clear_pmd() has succeeded */
+ unsigned long start = addr; /* first address of the range, kept for the reclaim step */
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true; /* fast path stays allowed until a pass stops early */
int nr;
retry:
@@ -1728,17 +1731,36 @@ retry:
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
+ bool any_skipped = false;
+
if (need_resched())
break;
nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
&force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false; /* presumably an entry was left in place: table not empty */
if (unlikely(force_break)) {
addr += nr * PAGE_SIZE;
break;
}
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: only when the whole range was zapped in a single pass,
+ * i.e. the PTE lock has been held ever since the first entry was
+ * cleared, may the PTE page be detached without rechecking it.
+ *
+ * Once a pass stops early the PTE lock is dropped before the retry and
+ * another thread can repopulate entries that were already zapped, so
+ * the fast path is disabled for good. try_to_free_pte() then rechecks
+ * that every entry is still none before freeing the page; it is also
+ * the fallback when the pmd lock cannot be taken here.
+ */
+ if (addr != end || unlikely(force_break))
+ direct_reclaim = false;
+ else if (can_reclaim_pt && direct_reclaim)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1765,6 +1787,13 @@ retry:
goto retry;
}
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval); /* pmd was already cleared on the fast path */
+ else
+ try_to_free_pte(mm, pmd, start, tlb); /* rechecks emptiness under both locks */
+ }
+
return addr;
}
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 000000000000..7e9455a18aae
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+/*
+ * reclaim_pt_is_enabled - should this zap also try to free the PTE page?
+ * @start: first address of the range being zapped
+ * @end: end of the range; the size is computed as @end - @start
+ * @details: zap parameters from the caller, may be NULL
+ *
+ * True only if the caller set details->reclaim_pt and the range is at least
+ * PMD_SIZE bytes long; a NULL @details always yields false.
+ */
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+/*
+ * try_get_and_clear_pmd - clear a pmd entry without waiting for its lock
+ * @mm: address space the pmd belongs to
+ * @pmd: entry to clear
+ * @pmdval: receives the value the entry held just before it was cleared
+ *
+ * Uses spin_trylock() on the pmd lock and gives up at once if it is
+ * contended. The entry is cleared unconditionally once the lock is held;
+ * nothing here checks that the PTE page it pointed to is empty.
+ * NOTE(review): the caller in zap_pte_range() appears to still hold the PTE
+ * lock at this point, which would explain the trylock - confirm.
+ *
+ * Return: true if the entry was cleared and *@pmdval written, false if the
+ * lock could not be taken (*@pmdval is then left untouched).
+ */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (!spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get_lockless(pmd); /* save the old entry for the caller before clearing */
+ pmd_clear(pmd);
+ spin_unlock(pml);
+
+ return true;
+}
+/*
+ * free_pte - hand a detached PTE page to pte_free_tlb() and update the count
+ * @mm: address space that owned the page table; mm_dec_nr_ptes() is called on it
+ * @addr: address passed to pte_free_tlb() along with the page
+ * @tlb: mmu_gather batch that takes the page
+ * @pmdval: old pmd value; pmd_pgtable() extracts the PTE page from it
+ *
+ * Takes no lock and does not touch the pmd itself.
+ * NOTE(review): presumably the pmd entry must already have been cleared, as
+ * both callers visible in this patch do - confirm for any new caller.
+ */
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval)
+{
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+}
+/*
+ * try_to_free_pte - free the PTE page under @pmd if every entry in it is none
+ * @mm: address space being operated on
+ * @pmd: pmd entry that points to the PTE page
+ * @addr: address used to map the PTE page; also passed on to free_pte()
+ * @tlb: mmu_gather batch that receives the page via free_pte()
+ *
+ * Slow path: takes the pmd lock, then the PTE lock if it is a different
+ * lock, and scans PTRS_PER_PTE entries. The pmd is cleared and the page
+ * freed only if no entry is populated; otherwise nothing is changed.
+ * NOTE(review): scanning PTRS_PER_PTE entries from the mapped pointer
+ * assumes @addr maps to the first entry of the table - confirm callers only
+ * pass a PMD-aligned address.
+ */
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb)
+{
+ pmd_t pmdval;
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
+ if (!start_pte) /* mapping failed: only the pmd lock is held */
+ goto out_ptl;
+ if (ptl != pml) /* ptl may be the very lock already held as pml */
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /* The page may be freed only if every entry is none; stop at the first that is not */
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ free_pte(mm, addr, tlb, pmdval); /* both locks are already dropped here */
+
+ return;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl); /* when ptl == pml this releases the pmd lock too */
+ if (ptl != pml) /* also taken when mapping failed: ptl is presumably still NULL then - confirm */
+ spin_unlock(pml);
+}