author		Barry Song <v-songbaohua@oppo.com>	2025-02-14 12:30:14 +0300
committer	Andrew Morton <akpm@linux-foundation.org>	2025-03-17 08:06:16 +0300
commit		354dffd29575cdf13154e8fb787322354aa9efc4 (patch)
tree		0c1e31e1f11dff752f321d88c884a2c036f7773c
parent		2f4ab3ac10e1476abb6ed55f0b5f176cf635e776 (diff)
download	linux-354dffd29575cdf13154e8fb787322354aa9efc4.tar.xz
mm: support batched unmap for lazyfree large folios during reclamation
Currently, the PTEs and rmap of a large folio are removed one at a time.
This is not only slow but also causes the large folio to be unnecessarily
added to deferred_split, which can lead to races between the deferred_split
shrinker callback and memory reclamation.  This patch releases all PTEs and
rmap entries in a batch.  For now, it only handles lazyfree large folios.

The microbenchmark below reclaims 128MB of lazyfree large folios whose size
is 64KiB:

#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#include <time.h>

#define SIZE 128*1024*1024 // 128 MB

unsigned long read_split_deferred()
{
	FILE *file = fopen("/sys/kernel/mm/transparent_hugepage"
			"/hugepages-64kB/stats/split_deferred", "r");
	if (!file) {
		perror("Error opening file");
		return 0;
	}

	unsigned long value;
	if (fscanf(file, "%lu", &value) != 1) {
		perror("Error reading value");
		fclose(file);
		return 0;
	}

	fclose(file);
	return value;
}

int main(int argc, char *argv[])
{
	while (1) {
		volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		memset((void *)p, 1, SIZE);
		madvise((void *)p, SIZE, MADV_FREE);

		clock_t start_time = clock();
		unsigned long start_split = read_split_deferred();
		madvise((void *)p, SIZE, MADV_PAGEOUT);
		clock_t end_time = clock();
		unsigned long end_split = read_split_deferred();

		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
		printf("Time taken by reclamation: %f seconds, split_deferred: %ld\n",
			elapsed_time, end_split - start_split);

		munmap((void *)p, SIZE);
	}
	return 0;
}

w/o patch:
~ # ./a.out
Time taken by reclamation: 0.177418 seconds, split_deferred: 2048
Time taken by reclamation: 0.178348 seconds, split_deferred: 2048
Time taken by reclamation: 0.174525 seconds, split_deferred: 2048
Time taken by reclamation: 0.171620 seconds, split_deferred: 2048
Time taken by reclamation: 0.172241 seconds, split_deferred: 2048
Time taken by reclamation: 0.174003 seconds, split_deferred: 2048
Time taken by reclamation: 0.171058 seconds, split_deferred: 2048
Time taken by reclamation: 0.171993 seconds, split_deferred: 2048
Time taken by reclamation: 0.169829 seconds, split_deferred: 2048
Time taken by reclamation: 0.172895 seconds, split_deferred: 2048
Time taken by reclamation: 0.176063 seconds, split_deferred: 2048
Time taken by reclamation: 0.172568 seconds, split_deferred: 2048
Time taken by reclamation: 0.171185 seconds, split_deferred: 2048
Time taken by reclamation: 0.170632 seconds, split_deferred: 2048
Time taken by reclamation: 0.170208 seconds, split_deferred: 2048
Time taken by reclamation: 0.174192 seconds, split_deferred: 2048
...
w/ patch:
~ # ./a.out
Time taken by reclamation: 0.074231 seconds, split_deferred: 0
Time taken by reclamation: 0.071026 seconds, split_deferred: 0
Time taken by reclamation: 0.072029 seconds, split_deferred: 0
Time taken by reclamation: 0.071873 seconds, split_deferred: 0
Time taken by reclamation: 0.073573 seconds, split_deferred: 0
Time taken by reclamation: 0.071906 seconds, split_deferred: 0
Time taken by reclamation: 0.073604 seconds, split_deferred: 0
Time taken by reclamation: 0.075903 seconds, split_deferred: 0
Time taken by reclamation: 0.073191 seconds, split_deferred: 0
Time taken by reclamation: 0.071228 seconds, split_deferred: 0
Time taken by reclamation: 0.071391 seconds, split_deferred: 0
Time taken by reclamation: 0.071468 seconds, split_deferred: 0
Time taken by reclamation: 0.071896 seconds, split_deferred: 0
Time taken by reclamation: 0.072508 seconds, split_deferred: 0
Time taken by reclamation: 0.071884 seconds, split_deferred: 0
Time taken by reclamation: 0.072433 seconds, split_deferred: 0
Time taken by reclamation: 0.071939 seconds, split_deferred: 0
...

Link: https://lkml.kernel.org/r/20250214093015.51024-4-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mauricio Faria de Oliveira <mfo@canonical.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shaoqin Huang <shahuang@redhat.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yicong Yang <yangyicong@hisilicon.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
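For readers unfamiliar with lazyfree memory, here is a minimal standalone
illustration (not part of the patch; the observed result depends on whether
reclamation actually ran) of what MADV_FREE means: once anonymous pages are
marked lazyfree, reclamation may simply discard them instead of writing them
to swap, and a later read returns zero-filled pages unless the memory was
re-dirtied in the meantime.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);		/* dirty the anonymous pages */
	madvise(p, len, MADV_FREE);	/* mark them lazyfree */
	madvise(p, len, MADV_PAGEOUT);	/* ask the kernel to reclaim them now */

	/*
	 * Discarded lazyfree pages read back as zeroes (0x00); pages that
	 * were not reclaimed, or were re-dirtied, keep their old contents.
	 */
	printf("first byte after reclaim: 0x%02x\n", p[0]);

	munmap(p, len);
	return 0;
}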
-rw-r--r--	mm/rmap.c	72
1 file changed, 50 insertions(+), 22 deletions(-)
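As a side note, the split_deferred value polled by the benchmark exists for
every mTHP size. A small standalone reader such as the sketch below
(illustrative only; it assumes the same per-size sysfs layout used by the
benchmark above) can watch all of those counters while the benchmark runs:

#include <glob.h>
#include <stdio.h>

/*
 * Print the deferred-split counter for every mTHP size, assuming the
 * sysfs layout:
 *   /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/split_deferred
 */
int main(void)
{
	glob_t g;
	const char *pattern =
		"/sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/split_deferred";

	if (glob(pattern, 0, NULL, &g) != 0) {
		fprintf(stderr, "no mTHP stats found\n");
		return 1;
	}

	for (size_t i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");
		unsigned long value;

		if (!f)
			continue;
		if (fscanf(f, "%lu", &value) == 1)
			printf("%s: %lu\n", g.gl_pathv[i], value);
		fclose(f);
	}

	globfree(&g);
	return 0;
}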
diff --git a/mm/rmap.c b/mm/rmap.c
index 765e541ac9be..7a93a7cd2c64 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1722,6 +1722,25 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+			struct folio *folio, pte_t *ptep)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	int max_nr = folio_nr_pages(folio);
+	pte_t pte = ptep_get(ptep);
+
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return false;
+	if (pte_unused(pte))
+		return false;
+	if (pte_pfn(pte) != folio_pfn(folio))
+		return false;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+			       NULL, NULL) == max_nr;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1735,6 +1754,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	struct page *subpage;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	unsigned long nr_pages = 1, end_addr;
 	unsigned long pfn;
 	unsigned long hsz = 0;
@@ -1874,23 +1894,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
-			flush_cache_page(vma, address, pfn);
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+				nr_pages = folio_nr_pages(folio);
+			end_addr = address + nr_pages * PAGE_SIZE;
+			flush_cache_range(vma, address, end_addr);
 
-				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			/* Nuke the page table entry. */
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			/*
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
+			 */
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+			else
+				flush_tlb_range(vma, address, end_addr);
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else {
@@ -1968,7 +1991,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 * redirtied either using the page table or a previously
 				 * obtained GUP reference.
 				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 				folio_set_swapbacked(folio);
 				goto walk_abort;
 			} else if (ref_count != 1 + map_count) {
@@ -1981,10 +2004,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 * We'll come back here later and detect if the folio was
 				 * dirtied when the additional reference is gone.
 				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
+				set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 				goto walk_abort;
 			}
-			dec_mm_counter(mm, MM_ANONPAGES);
+			add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
 			goto discard;
 		}
@@ -2049,13 +2072,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+			folio_ref_sub(folio, nr_pages - 1);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
+		/* We have already batched the entire folio */
+		if (nr_pages > 1)
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;