summaryrefslogtreecommitdiff
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--mm/khugepaged.c97
1 files changed, 59 insertions, 38 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3703a56571c1..90acfea40c13 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
+/*
+ * See pmd_trans_unstable() for how the result may change out from
+ * underneath us, even if we hold mmap_lock in read.
+ */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -857,7 +861,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
if (!*pmd)
return SCAN_PMD_NULL;
- pmde = pmd_read_atomic(*pmd);
+ pmde = pmdp_get_lockless(*pmd);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
@@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
+ if (!pmd_present(pmde))
+ return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
+ if (pmd_devmap(pmde))
+ return SCAN_PMD_NULL;
if (pmd_bad(pmde))
return SCAN_PMD_NULL;
return SCAN_SUCCEED;
@@ -1238,15 +1246,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
/*
* Check if the page has any GUP (or other external) pins.
*
- * Here the check is racy it may see total_mapcount > refcount
- * in some cases.
- * For example, one process with one forked child process.
- * The parent has the PMD split due to MADV_DONTNEED, then
- * the child is trying unmap the whole PMD, but khugepaged
- * may be scanning the parent between the child has
- * PageDoubleMap flag cleared and dec the mapcount. So
- * khugepaged may see total_mapcount > refcount.
- *
+ * Here the check may be racy:
+ * it may see total_mapcount > refcount in some cases?
* But such case is ephemeral we could always retry collapse
* later. However it may report false positive if the page
* has excessive GUP pins (i.e. 512). Anyway the same check
@@ -1467,14 +1468,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
- /*
- * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
- * that got written to. Without this, we'd have to also lock the
- * anon_vma if one exists.
- */
- if (vma->anon_vma)
- return SCAN_VMA_CHECK;
-
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1574,8 +1567,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
}
/* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
maybe_install_pmd:
@@ -1651,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking
* the anon_vma.
*/
- if (vma->anon_vma) {
+ if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON;
goto next;
}
@@ -1680,6 +1679,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
/*
+ * Re-check whether we have an ->anon_vma, because
+ * collapse_and_free_pmd() requires that either no
+ * ->anon_vma exists or the anon_vma is locked.
+ * We already checked ->anon_vma above, but that check
+ * is racy because ->anon_vma can be populated under the
+ * mmap lock in read mode.
+ */
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto unlock_next;
+ }
+ /*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
* markers installed. Skip it only, so the rest mm/vma
@@ -1751,12 +1762,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
{
struct address_space *mapping = file->f_mapping;
struct page *hpage;
- pgoff_t index, end = start + HPAGE_PMD_NR;
+ pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
- int nr;
+ int nr = 0;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1796,6 +1807,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
xas_set(&xas, start);
for (index = start; index < end; index++) {
struct page *page = xas_next(&xas);
+ struct folio *folio;
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
@@ -1822,8 +1834,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
if (xa_is_value(page) || !PageUptodate(page)) {
- struct folio *folio;
-
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_get_folio(mapping->host, index,
@@ -1911,13 +1921,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (page_mapping(page) != mapping) {
+ folio = page_folio(page);
+
+ if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
}
- if (!is_shmem && (PageDirty(page) ||
- PageWriteback(page))) {
+ if (!is_shmem && (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
@@ -1927,20 +1939,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto out_unlock;
}
- if (isolate_lru_page(page)) {
+ if (folio_isolate_lru(folio)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL)) {
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
- putback_lru_page(page);
+ folio_putback_lru(folio);
goto out_unlock;
}
- if (page_mapped(page))
- try_to_unmap(page_folio(page),
+ if (folio_mapped(folio))
+ try_to_unmap(folio,
TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
@@ -2019,6 +2031,7 @@ xa_unlocked:
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
+ struct folio *folio;
/*
* Replacing old pages with new one has succeeded, now we
@@ -2046,11 +2059,13 @@ xa_unlocked:
index++;
}
- SetPageUptodate(hpage);
- page_ref_add(hpage, HPAGE_PMD_NR - 1);
+ folio = page_folio(hpage);
+ folio_mark_uptodate(folio);
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+
if (is_shmem)
- set_page_dirty(hpage);
- lru_cache_add(hpage);
+ folio_mark_dirty(folio);
+ folio_add_lru(folio);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -2108,7 +2123,8 @@ out:
mem_cgroup_uncharge(page_folio(hpage));
put_page(hpage);
}
- /* TODO: tracepoints */
+
+ trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
return result;
}
@@ -2577,6 +2593,11 @@ void khugepaged_min_free_kbytes_update(void)
mutex_unlock(&khugepaged_mutex);
}
+bool current_is_khugepaged(void)
+{
+ return kthread_func(current) == khugepaged;
+}
+
static int madvise_collapse_errno(enum scan_result r)
{
/*
@@ -2646,7 +2667,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));