summary | refs | log | tree | commit | diff
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r-- mm/khugepaged.c | 150
1 files changed, 83 insertions, 67 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cc945c6ab3bd..6b40bdfd224c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = {
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
+ vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
@@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm)
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_pmd_enabled()) {
@@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -696,13 +683,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
@@ -713,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
@@ -735,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -954,12 +952,18 @@ static inline int check_pmd_state(pmd_t *pmd)
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+
+ /*
+ * The folio may be under migration when khugepaged is trying to
+ * collapse it. Migration success or failure will eventually end
+ * up with a present PMD mapping a folio again.
+ */
+ if (is_pmd_migration_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
- if (pmd_devmap(pmde))
- return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1168,11 +1172,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
+ vma_start_write(vma);
result = check_pmd_still_valid(mm, address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
- vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1239,7 +1243,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1402,7 +1406,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1435,7 +1439,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1464,10 +1468,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct vm_fault vmf = {
.vma = vma,
@@ -1476,13 +1479,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
.pmd = pmdp,
};
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1501,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1623,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1641,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
@@ -1689,14 +1700,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
unlock:
if (start_pte)
@@ -2295,6 +2306,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2327,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2359,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2354,14 +2381,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2437,7 +2456,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2736,8 +2755,8 @@ static int madvise_collapse_errno(enum scan_result r)
}
}
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool *lock_dropped)
{
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
@@ -2748,8 +2767,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- *prev = vma;
-
if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
return -EINVAL;
@@ -2783,7 +2800,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2797,7 +2814,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
&mmap_locked, cc);
}
if (!mmap_locked)
- *prev = NULL; /* Tell caller we dropped mmap_lock */
+ *lock_dropped = true;
handle_result:
switch (result) {
@@ -2807,7 +2824,6 @@ handle_result:
break;
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
- BUG_ON(*prev);
mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);