summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c12
-rw-r--r--mm/damon/core.c1
-rw-r--r--mm/damon/ops-common.c2
-rw-r--r--mm/damon/paddr.c28
-rw-r--r--mm/damon/sysfs-schemes.c1
-rw-r--r--mm/debug_vm_pgtable.c9
-rw-r--r--mm/filemap.c38
-rw-r--r--mm/gup.c7
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c25
-rw-r--r--mm/hugetlb.c119
-rw-r--r--mm/internal.h32
-rw-r--r--mm/kasan/kasan_test_rust.rs3
-rw-r--r--mm/kasan/report.c15
-rw-r--r--mm/kfence/core.c2
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/kmemleak.c14
-rw-r--r--mm/kmsan/hooks.c1
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/madvise.c13
-rw-r--r--mm/memblock.c21
-rw-r--r--mm/memcontrol.c14
-rw-r--r--mm/memory-failure.c86
-rw-r--r--mm/memory.c38
-rw-r--r--mm/memory_hotplug.c27
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/migrate_device.c13
-rw-r--r--mm/mremap.c42
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page-writeback.c2
-rw-r--r--mm/page_alloc.c216
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_vma_mapped.c13
-rw-r--r--mm/ptdump.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/secretmem.c10
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/slub.c42
-rw-r--r--mm/swapfile.c63
-rw-r--r--mm/userfaultfd.c177
-rw-r--r--mm/vma.c70
-rw-r--r--mm/vma.h9
-rw-r--r--mm/vma_internal.h1
-rw-r--r--mm/vmalloc.c123
-rw-r--r--mm/vmscan.c27
-rw-r--r--mm/zsmalloc.c3
-rw-r--r--mm/zswap.c118
47 files changed, 998 insertions, 490 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index a2b16b08cbbf..eb5474dea04d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -630,7 +630,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (blockpfn + (1UL << order) <= end_pfn) {
+ if ((order <= MAX_PAGE_ORDER) &&
+ (blockpfn + (1UL << order) <= end_pfn)) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
@@ -979,13 +980,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -1009,8 +1010,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
@@ -3163,6 +3164,7 @@ static int kcompactd(void *p)
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3219,6 +3221,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 54f4dd8d549f..9689f5425238 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -837,6 +837,7 @@ static int damos_commit(struct damos *dst, struct damos *src)
return err;
dst->wmarks = src->wmarks;
+ dst->target_nid = src->target_nid;
err = damos_commit_filters(dst, src);
return err;
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index d25d99cb5f2b..d511be201c4c 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -24,7 +24,7 @@ struct folio *damon_get_folio(unsigned long pfn)
struct page *page = pfn_to_online_page(pfn);
struct folio *folio;
- if (!page || PageTail(page))
+ if (!page)
return NULL;
folio = page_folio(page);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index a9ff35341d65..4120a73f4933 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -264,11 +264,14 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
damos_add_filter(s, filter);
}
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
struct folio *folio = damon_get_folio(PHYS_PFN(addr));
- if (!folio)
+ if (!folio) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -282,6 +285,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
else
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
if (install_young_filter)
@@ -296,11 +300,14 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
{
unsigned long addr, applied = 0;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
struct folio *folio = damon_get_folio(PHYS_PFN(addr));
- if (!folio)
+ if (!folio) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -311,6 +318,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
folio_deactivate(folio);
applied += folio_nr_pages(folio);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
return applied * PAGE_SIZE;
@@ -423,6 +431,10 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
if (list_empty(folio_list))
return nr_migrated;
+ if (target_nid < 0 || target_nid >= MAX_NUMNODES ||
+ !node_state(target_nid, N_MEMORY))
+ return nr_migrated;
+
noreclaim_flag = memalloc_noreclaim_save();
nid = folio_nid(lru_to_folio(folio_list));
@@ -454,11 +466,14 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
unsigned long addr, applied;
LIST_HEAD(folio_list);
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
struct folio *folio = damon_get_folio(PHYS_PFN(addr));
- if (!folio)
+ if (!folio) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -467,6 +482,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
goto put_folio;
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index b095457380b5..d9e01648db70 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -423,6 +423,7 @@ static ssize_t memcg_path_store(struct kobject *kobj,
return -ENOMEM;
strscpy(path, buf, count + 1);
+ kfree(filter->memcg_path);
filter->memcg_path = path;
return count;
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..80cc409ba78a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1049,29 +1049,34 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free page table entries */
if (args->start_ptep) {
+ pmd_clear(args->pmdp);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
if (args->start_pmdp) {
+ pud_clear(args->pudp);
pmd_free(args->mm, args->start_pmdp);
mm_dec_nr_pmds(args->mm);
}
if (args->start_pudp) {
+ p4d_clear(args->p4dp);
pud_free(args->mm, args->start_pudp);
mm_dec_nr_puds(args->mm);
}
- if (args->start_p4dp)
+ if (args->start_p4dp) {
+ pgd_clear(args->pgdp);
p4d_free(args->mm, args->start_p4dp);
+ }
/* Free vma and mm struct */
if (args->vma)
vm_area_free(args->vma);
if (args->mm)
- mmdrop(args->mm);
+ mmput(args->mm);
}
static struct page * __init
diff --git a/mm/filemap.c b/mm/filemap.c
index 56fa431c52af..ec69fadf014c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1750,8 +1750,9 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index);
+ unsigned long nr = max_scan;
- while (max_scan--) {
+ while (nr--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
return xas.xa_index;
@@ -1966,8 +1967,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that there caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -2211,6 +2223,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
@@ -3004,7 +3017,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
- start = (start + bsz) & ~(bsz - 1);
+ start = (start + bsz) & ~((u64)bsz - 1);
offset += bsz;
} while (offset < folio_size(folio));
unlock:
@@ -4384,6 +4397,20 @@ resched:
}
/*
+ * See mincore: reveal pagecache information only for files
+ * that the calling process has write access to, or could (if
+ * tried) open for writing.
+ */
+static inline bool can_do_cachestat(struct file *f)
+{
+ if (f->f_mode & FMODE_WRITE)
+ return true;
+ if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
+ return true;
+ return file_permission(f, MAY_WRITE) == 0;
+}
+
+/*
* The cachestat(2) system call.
*
* cachestat() returns the page cache statistics of a file in the
@@ -4442,6 +4469,11 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
return -EOPNOTSUPP;
}
+ if (!can_do_cachestat(fd_file(f))) {
+ fdput(f);
+ return -EPERM;
+ }
+
if (flags != 0) {
fdput(f);
return -EINVAL;
diff --git a/mm/gup.c b/mm/gup.c
index 7053f8114e01..e323843cc5dd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1283,6 +1283,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -2210,8 +2213,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
} while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > start - (unsigned long)uaddr)
+ return size - (start - (unsigned long)uaddr);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
diff --git a/mm/hmm.c b/mm/hmm.c
index 7e0229ae4a5a..a67776aeb019 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -173,6 +173,7 @@ static inline unsigned long hmm_pfn_flags_order(unsigned long order)
return order << HMM_PFN_ORDER_SHIFT;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd)
{
@@ -183,7 +184,6 @@ static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[],
pmd_t pmd)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7e0f72cd9fd4..f94a9d413585 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2132,6 +2132,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2170,6 +2180,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
@@ -2867,6 +2879,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze, struct folio *folio)
{
+ bool pmd_migration = is_pmd_migration_entry(*pmd);
+
VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
@@ -2877,9 +2891,12 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
* require a folio to check the PMD against. Otherwise, there
* is a risk of replacing the wrong folio.
*/
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
+ /*
+ * Do not apply pmd_folio() to a migration entry; and folio lock
+ * guarantees that it must be of the wrong folio anyway.
+ */
+ if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
return;
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3212,7 +3229,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
- folio_put(tail);
+ folio_put_refs(tail, folio_nr_pages(tail));
} else if (!PageAnon(page)) {
__xa_store(&folio->mapping->i_pages, head[i].index,
head + i, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2fa87b9ecec6..9c6a4e855481 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -87,7 +87,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -1394,8 +1394,7 @@ static unsigned long available_huge_pages(struct hstate *h)
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve,
- long chg)
+ unsigned long address, long chg)
{
struct folio *folio = NULL;
struct mempolicy *mpol;
@@ -1411,10 +1410,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
goto err;
- /* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && !available_huge_pages(h))
- goto err;
-
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
@@ -1430,7 +1425,7 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
nid, nodemask);
- if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ if (folio && vma_has_reserves(vma, chg)) {
folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
@@ -2960,6 +2955,14 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
}
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
@@ -3006,17 +3009,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0)
goto out_end_reservation;
-
- /*
- * Even though there was no reservation in the region/reserve
- * map, there could be reservations associated with the
- * subpool that can be used. This would be indicated if the
- * return value of hugepage_subpool_get_pages() is zero.
- * However, if avoid_reserve is specified we still avoid even
- * the subpool reservations.
- */
- if (avoid_reserve)
- gbl_chg = 1;
}
/* If this allocation is not consuming a reservation, charge it now.
@@ -3039,7 +3031,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
if (!folio) {
spin_unlock_irq(&hugetlb_lock);
folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
@@ -3287,7 +3279,7 @@ static void __init gather_bootmem_prealloc(void)
.thread_fn = gather_bootmem_prealloc_parallel,
.fn_arg = NULL,
.start = 0,
- .size = num_node_state(N_MEMORY),
+ .size = nr_node_ids,
.align = 1,
.min_chunk = 1,
.max_threads = num_node_state(N_MEMORY),
@@ -4871,7 +4863,7 @@ static struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
@@ -5079,26 +5071,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5395,6 +5401,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5410,8 +5417,19 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
@@ -5574,7 +5592,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -7248,6 +7266,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
@@ -7480,9 +7505,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7506,8 +7538,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7517,8 +7553,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7533,7 +7571,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
#ifdef CONFIG_CMA
diff --git a/mm/internal.h b/mm/internal.h
index 9bb098e78f15..9e0577413087 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -204,11 +204,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
bool *any_writable, bool *any_young, bool *any_dirty)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
bool writable, young, dirty;
- int nr;
+ int nr, cur_nr;
if (any_writable)
*any_writable = false;
@@ -221,11 +219,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -238,14 +240,6 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (!pte_same(pte, expected_pte))
break;
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
- break;
-
if (any_writable)
*any_writable |= writable;
if (any_young)
@@ -253,12 +247,13 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (any_dirty)
*any_dirty |= dirty;
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ return min(nr, max_nr);
}
/**
@@ -1101,7 +1096,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1123,8 +1118,9 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
diff --git a/mm/kasan/kasan_test_rust.rs b/mm/kasan/kasan_test_rust.rs
index caa7175964ef..5b34edf30e72 100644
--- a/mm/kasan/kasan_test_rust.rs
+++ b/mm/kasan/kasan_test_rust.rs
@@ -11,11 +11,12 @@ use kernel::prelude::*;
/// drop the vector, and touch it.
#[no_mangle]
pub extern "C" fn kasan_test_rust_uaf() -> u8 {
- let mut v: Vec<u8> = Vec::new();
+ let mut v: KVec<u8> = KVec::new();
for _ in 0..4096 {
v.push(0x42, GFP_KERNEL).unwrap();
}
let ptr: *mut u8 = addr_of_mut!(v[2048]);
drop(v);
+ // SAFETY: Incorrect, on purpose.
unsafe { *ptr }
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index c7c0083203cb..f17265410156 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -398,17 +398,10 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
-
- if (va) {
- pr_err("The buggy address belongs to the virtual mapping at\n"
- " [%px, %px) created by:\n"
- " %pS\n",
- va->addr, va->addr + va->size, va->caller);
- pr_err("\n");
-
- page = vmalloc_to_page(addr);
- }
+ pr_err("The buggy address belongs to a");
+ if (!vmalloc_dump_obj(addr))
+ pr_cont(" vmalloc virtual mapping\n");
+ page = vmalloc_to_page(addr);
}
if (page) {
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 67fc321db79b..102048821c22 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -21,6 +21,7 @@
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
+#include <linux/nodemask.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
@@ -1084,6 +1085,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
+ ((flags & __GFP_THISNODE) && num_online_nodes() > 1) ||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b538c3d48386..abd5764e4864 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2404,7 +2404,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2750,7 +2750,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 74f5f4c51ab8..91894fc54c64 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -465,6 +465,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
+ bool warn = false;
/* try the slab allocator first */
if (object_cache) {
@@ -483,8 +484,10 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
else if (mem_pool_free_count)
object = &mem_pool[--mem_pool_free_count];
else
- pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ warn = true;
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (warn)
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
return object;
}
@@ -1071,7 +1074,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
- create_object_percpu((__force unsigned long)ptr, size, 0, gfp);
+ create_object_percpu((__force unsigned long)ptr, size, 1, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1650,7 +1653,7 @@ static void kmemleak_scan(void)
unsigned long phys = object->pointer;
if (PHYS_PFN(phys) < min_low_pfn ||
- PHYS_PFN(phys + object->size) >= max_low_pfn)
+ PHYS_PFN(phys + object->size) > max_low_pfn)
__paint_it(object, KMEMLEAK_BLACK);
}
@@ -2107,6 +2110,7 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
struct kmemleak_object *object, *tmp;
+ unsigned int cnt = 0;
/*
* Kmemleak has already been disabled, no need for RCU list traversal
@@ -2115,6 +2119,10 @@ static void __kmemleak_do_cleanup(void)
list_for_each_entry_safe(object, tmp, &object_list, object_list) {
__remove_object(object);
__delete_object(object);
+
+ /* Call cond_resched() once per 64 iterations to avoid soft lockup */
+ if (!(++cnt & 0x3f))
+ cond_resched();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..3df45c25c1f6 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/ksm.c b/mm/ksm.c
index a2e2a521df0a..17e6c16ab81d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3643,10 +3643,10 @@ static ssize_t advisor_mode_show(struct kobject *kobj,
{
const char *output;
- if (ksm_advisor == KSM_ADVISOR_NONE)
- output = "[none] scan-time";
- else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
output = "none [scan-time]";
+ else
+ output = "[none] scan-time";
return sysfs_emit(buf, "%s\n", output);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index ff139e57cca2..2e66a08fd4f4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -495,6 +495,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -728,6 +729,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -920,7 +922,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
diff --git a/mm/memblock.c b/mm/memblock.c
index 095c18b5c430..3d7b0114442c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -456,7 +456,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
- new_array = addr ? __va(addr) : NULL;
+ if (addr) {
+ /* The memory may not have been accepted, yet. */
+ accept_memory(addr, new_alloc_size);
+
+ new_array = __va(addr);
+ } else {
+ new_array = NULL;
+ }
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
@@ -2160,11 +2167,14 @@ static void __init memmap_init_reserved_pages(void)
struct memblock_region *region;
phys_addr_t start, end;
int nid;
+ unsigned long max_reserved;
/*
* set nid on all reserved pages and also treat struct
* pages for the NOMAP regions as PageReserved
*/
+repeat:
+ max_reserved = memblock.reserved.max;
for_each_mem_region(region) {
nid = memblock_get_region_node(region);
start = region->base;
@@ -2173,8 +2183,15 @@ static void __init memmap_init_reserved_pages(void)
if (memblock_is_nomap(region))
reserve_bootmem_region(start, end, nid);
- memblock_set_node(start, end, &memblock.reserved, nid);
+ memblock_set_node(start, region->size, &memblock.reserved, nid);
}
+ /*
+ * 'max' is changed means memblock.reserved has been doubled its
+ * array, which may result a new reserved region before current
+ * 'start'. Now we should repeat the procedure to set its node id.
+ */
+ if (max_reserved != memblock.reserved.max)
+ goto repeat;
/*
* initialize struct pages for reserved regions that don't have
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53db98d2c4a1..dcd2d0e15e13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1147,8 +1147,11 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct task_struct *task;
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
- while (!ret && (task = css_task_iter_next(&it)))
+ while (!ret && (task = css_task_iter_next(&it))) {
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
+ }
css_task_iter_end(&it);
if (ret) {
mem_cgroup_iter_break(memcg, iter);
@@ -1877,9 +1880,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old;
+ unsigned long flags;
stock = &per_cpu(memcg_stock, cpu);
+
+ /* drain_obj_stock requires stock_lock */
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ old = drain_obj_stock(stock);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
drain_stock(stock);
+ obj_cgroup_put(old);
return 0;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 96ce31e5a203..8c8d78d6d306 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -845,9 +845,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
#define hwpoison_hugetlb_range NULL
#endif
+static int hwpoison_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* We also want to consider pages mapped into VM_PFNMAP. */
+ return 0;
+}
+
static const struct mm_walk_ops hwpoison_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .test_walk = hwpoison_test_walk,
.walk_lock = PGWALK_RDLOCK,
};
@@ -879,12 +887,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
(void *)&priv);
+ /*
+ * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
+ * succeeds or fails, then kill the process with SIGBUS.
+ * ret = 0 when poison page is a clean page and it's dropped, no
+ * SIGBUS is needed.
+ */
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
- else
- ret = 0;
mmap_read_unlock(p->mm);
- return ret > 0 ? -EHWPOISON : -EFAULT;
+
+ return ret > 0 ? -EHWPOISON : 0;
}
/*
@@ -1554,11 +1567,39 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+/*
+ * The caller must guarantee the folio isn't large folio, except hugetlb.
+ * try_to_unmap() can't handle it.
+ */
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1570,7 +1611,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1578,6 +1619,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1587,8 +1630,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1611,29 +1652,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1641,9 +1659,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
diff --git a/mm/memory.c b/mm/memory.c
index d322ddfe6791..b6daa0e673a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1356,12 +1356,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next, pfn;
bool is_cow;
int ret;
@@ -1372,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
+ ret = track_pfn_copy(dst_vma, src_vma, &pfn);
if (ret)
return ret;
}
@@ -1413,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1423,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
+ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -2814,11 +2811,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
@@ -2957,8 +2954,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -5077,7 +5076,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5116,7 +5119,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5152,9 +5156,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -6711,10 +6715,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 621ae1015106..0a42e9a8caba 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1795,27 +1795,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
if (!folio_try_get(folio))
continue;
if (unlikely(page_folio(page) != folio))
goto put_folio;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ if (WARN_ON(folio_test_lru(folio)))
+ folio_isolate_lru(folio);
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
+
+ goto put_folio;
+ }
+
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
pr_warn("failed to isolate pfn %lx\n",
diff --git a/mm/migrate.c b/mm/migrate.c
index dfa24e41e8f9..25e7438af968 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -526,15 +526,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..5bd888223cc8 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns,
dst = src;
}
+ if (!folio_is_zone_device(dst))
+ folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
folio_unlock(src);
-
- if (folio_is_zone_device(src))
- folio_put(src);
- else
- folio_putback_lru(src);
+ folio_put(src);
if (dst != src) {
folio_unlock(dst);
- if (folio_is_zone_device(dst))
- folio_put(dst);
- else
- folio_putback_lru(dst);
+ folio_put(dst);
}
}
}
diff --git a/mm/mremap.c b/mm/mremap.c
index dee98ff2bbd6..12af89b4342a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
@@ -207,7 +208,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -269,6 +281,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -324,6 +345,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -666,8 +696,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
unsigned long moved_len;
- unsigned long account_start = 0;
- unsigned long account_end = 0;
+ bool account_start = false;
+ bool account_end = false;
unsigned long hiwater_vm;
int err = 0;
bool need_rmap_locks;
@@ -751,9 +781,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
vm_flags_clear(vma, VM_ACCOUNT);
if (vma->vm_start < old_addr)
- account_start = vma->vm_start;
+ account_start = true;
if (vma->vm_end > old_addr + old_len)
- account_end = vma->vm_end;
+ account_end = true;
}
/*
@@ -793,7 +823,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
vm_acct_memory(old_len >> PAGE_SHIFT);
- account_start = account_end = 0;
+ account_start = account_end = false;
}
if (vm_flags & VM_LOCKED) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..8aa712afd8ae 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,7 @@
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
+#include <linux/nmi.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -431,10 +432,15 @@ static void dump_tasks(struct oom_control *oc)
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
+ int i = 0;
rcu_read_lock();
- for_each_process(p)
+ for_each_process(p) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ touch_softlockup_watchdog();
dump_task(p, oc);
+ }
rcu_read_unlock();
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fcd4c1439cb9..bfb3f903bb6d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -543,8 +543,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de65e8b4f75f..752576749db9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -872,9 +872,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -901,10 +899,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
@@ -1907,13 +1903,12 @@ static inline bool boost_watermark(struct zone *zone)
* can claim the whole pageblock for the requested migratetype. If not, we check
* the pageblock for constituent pages; if at least half of the pages are free
* or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * future will be put on the correct free list.
*/
static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+try_to_steal_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ unsigned int alloc_flags)
{
int free_pages, movable_pages, alike_pages;
unsigned long start_pfn;
@@ -1926,7 +1921,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
* highatomic accounting.
*/
if (is_migrate_highatomic(block_type))
- goto single_page;
+ return NULL;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1947,14 +1942,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1987,9 +1978,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- page_del_and_expand(zone, page, order, current_order, block_type);
- return page;
+ return NULL;
}
/*
@@ -2171,17 +2160,15 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
}
/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2212,58 +2199,66 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ if (!can_steal)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_steal_block(zone, page, current_order, order,
+ start_migratetype, alloc_flags);
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool can_steal;
-find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
-
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
-
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ if (fallback_mt == -1)
+ continue;
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
- return page;
+ return NULL;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2282,16 +2277,49 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+
+ return NULL;
}
/*
@@ -2303,13 +2331,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2910,7 +2939,9 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -4243,6 +4274,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4345,6 +4377,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -5991,11 +6031,10 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
@@ -7006,9 +7045,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -7035,11 +7071,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7048,9 +7080,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
}
void accept_page(struct page *page)
@@ -7087,24 +7116,26 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
- return false;
-
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
@@ -7123,22 +7154,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7e04047977cf..6989c5ffd474 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -612,6 +612,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+ * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+ * not immediately release to the buddy system. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae5cc42aa208..585a53f7b06f 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -77,6 +77,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* mapped at the @pvmw->pte
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
* for checking
+ * @pte_nr: the number of small pages described by @pvmw->pte.
*
* page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
@@ -93,7 +94,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
* Otherwise, return false.
*
*/
-static bool check_pte(struct page_vma_mapped_walk *pvmw)
+static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
{
unsigned long pfn;
pte_t ptent = ptep_get(pvmw->pte);
@@ -126,7 +127,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(ptent);
}
- return (pfn - pvmw->pfn) < pvmw->nr_pages;
+ if ((pfn + pte_nr - 1) < pvmw->pfn)
+ return false;
+ if (pfn > (pvmw->pfn + pvmw->nr_pages - 1))
+ return false;
+ return true;
}
/* Returns true if the two ranges overlap. Careful to not overflow. */
@@ -201,7 +206,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return false;
pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
- if (!check_pte(pvmw))
+ if (!check_pte(pvmw, pages_per_huge_page(hstate)))
return not_found(pvmw);
return true;
}
@@ -284,7 +289,7 @@ restart:
goto next_pte;
}
this_pte:
- if (check_pte(pvmw))
+ if (check_pte(pvmw, 1))
return true;
next_pte:
do {
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..3e78bf33da42 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -153,6 +153,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
+ get_online_mems();
mmap_write_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
@@ -160,6 +161,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
range++;
}
mmap_write_unlock(mm);
+ put_online_mems();
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
diff --git a/mm/rmap.c b/mm/rmap.c
index 73d5998677d4..674362de029d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2488,7 +2488,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
* Restrict to anonymous folios for now to avoid potential writeback
* issues.
*/
- if (!folio_test_anon(folio))
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
return false;
rmap_walk(folio, &rwc);
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0..4662f2510ae5 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -195,19 +195,11 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
- int err;
- inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &qname, NULL);
- if (err) {
- file = ERR_PTR(err);
- goto err_free_inode;
- }
-
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
O_RDWR, &secretmem_fops);
if (IS_ERR(file))
diff --git a/mm/shmem.c b/mm/shmem.c
index dd4eb11c84b5..88fd6e2a2dcf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3042,8 +3042,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
if (ret)
return ret;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
@@ -3700,7 +3699,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
- if (!simple_offset_empty(dentry))
+ if (!simple_empty(dentry))
return -ENOTEMPTY;
drop_nlink(d_inode(dentry));
@@ -3757,7 +3756,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
return simple_offset_rename_exchange(old_dir, old_dentry,
new_dir, new_dentry);
- if (!simple_offset_empty(new_dentry))
+ if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
diff --git a/mm/slub.c b/mm/slub.c
index b9447a955f61..dc527b59f5a9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1960,6 +1960,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2030,20 +2035,12 @@ static inline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
-static inline bool need_slab_obj_ext(void)
-{
- if (mem_alloc_profiling_enabled())
- return true;
+#else /* CONFIG_SLAB_OBJ_EXT */
- /*
- * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
- * inside memcg_slab_post_alloc_hook. No other users for now.
- */
- return false;
+static inline void init_slab_obj_exts(struct slab *slab)
+{
}
-#else /* CONFIG_SLAB_OBJ_EXT */
-
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2054,11 +2051,6 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
-static inline bool need_slab_obj_ext(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -2090,7 +2082,7 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext()) {
+ if (mem_alloc_profiling_enabled()) {
struct slabobj_ext *obj_exts;
obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
@@ -2568,8 +2560,12 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online() || need_slab_obj_ext())
- free_slab_obj_exts(slab);
+ /*
+ * The slab object extensions should now be freed regardless of
+ * whether mem_alloc_profiling_enabled() or not because profiling
+ * might have been disabled after slab->obj_exts got allocated.
+ */
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -2613,6 +2609,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
@@ -4228,7 +4225,12 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
+
+ if (node == NUMA_NO_NODE)
+ folio = (struct folio *)alloc_pages_noprof(flags, order);
+ else
+ folio = (struct folio *)__alloc_pages_noprof(flags, order, node, NULL);
+
if (folio) {
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0a9071cfe1d..c02493d9c7be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3237,43 +3237,30 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
#define SWAP_CLUSTER_COLS \
max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-static int setup_swap_map_and_extents(struct swap_info_struct *si,
- union swap_header *swap_header,
- unsigned char *swap_map,
- unsigned long maxpages,
- sector_t *span)
+static int setup_swap_map(struct swap_info_struct *si,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages)
{
- unsigned int nr_good_pages;
unsigned long i;
- int nr_extents;
-
- nr_good_pages = maxpages - 1; /* omit header page */
+ swap_map[0] = SWAP_MAP_BAD; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
- nr_good_pages--;
+ si->pages--;
}
}
- if (nr_good_pages) {
- swap_map[0] = SWAP_MAP_BAD;
- si->max = maxpages;
- si->pages = nr_good_pages;
- nr_extents = setup_swap_extents(si, span);
- if (nr_extents < 0)
- return nr_extents;
- nr_good_pages = si->pages;
- }
- if (!nr_good_pages) {
+ if (!si->pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
- return nr_extents;
+ return 0;
}
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
@@ -3318,13 +3305,17 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* Mark unusable pages as unavailable. The clusters aren't
* marked free yet, so no list operations are involved yet.
*
- * See setup_swap_map_and_extents(): header page, bad pages,
+ * See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
inc_cluster_info_page(si, cluster_info, 0);
- for (i = 0; i < swap_header->info.nr_badpages; i++)
- inc_cluster_info_page(si, cluster_info,
- swap_header->info.badpages[i]);
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+
+ if (page_nr >= maxpages)
+ continue;
+ inc_cluster_info_page(si, cluster_info, page_nr);
+ }
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
inc_cluster_info_page(si, cluster_info, i);
@@ -3456,6 +3447,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
+ si->max = maxpages;
+ si->pages = maxpages - 1;
+ nr_extents = setup_swap_extents(si, &span);
+ if (nr_extents < 0) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+ if (si->pages != si->max - 1) {
+ pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ maxpages = si->max;
+
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
@@ -3467,12 +3473,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
- maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
+ error = setup_swap_map(si, swap_header, swap_map, maxpages);
+ if (error)
goto bad_swap_unlock_inode;
- }
/*
* Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ce13c4062647..8b0f2fbd6a75 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -1058,8 +1059,13 @@ static int move_present_pte(struct mm_struct *mm,
src_folio->index = linear_page_index(dst_vma, dst_addr);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1067,14 +1073,22 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
double_pt_lock(dst_ptl, src_ptl);
@@ -1084,7 +1098,39 @@ static int move_swap_pte(struct mm_struct *mm,
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache, the only possible case is
+ * when a folio is swapped in and out again staying in swap cache, using the
+ * same entry before the PTE check above. The PTL is acquired and released
+ * twice, each time after updating the swap_map's flag. So holding
+ * the PTL here ensures we see the updated value. False positive is possible,
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+ * cache, or during the tiny synchronization window between swap cache and
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1130,6 +1176,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1215,6 +1262,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1234,14 +1282,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1257,8 +1319,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1283,8 +1345,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1297,11 +1359,13 @@ retry:
orig_dst_pte, orig_src_pte,
dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1310,10 +1374,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr,
- dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte,
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
@@ -1330,6 +1437,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
@@ -1691,13 +1800,16 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
- struct folio *folio = pmd_folio(*src_pmd);
-
- if (!folio || (!is_huge_zero_folio(folio) &&
- !PageAnonExclusive(&folio->page))) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
+ /* Can be a migration entry */
+ if (pmd_present(*src_pmd)) {
+ struct folio *folio = pmd_folio(*src_pmd);
+
+ if (!is_huge_zero_folio(folio) &&
+ !PageAnonExclusive(&folio->page)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
}
spin_unlock(ptl);
@@ -1801,6 +1913,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1808,7 +1928,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1859,7 +1979,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
diff --git a/mm/vma.c b/mm/vma.c
index 7621384d64cf..140f7017bb63 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -416,7 +416,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -829,9 +836,6 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
err = dup_anon_vma(next, vma, &anon_dup);
}
- if (err)
- goto abort;
-
/*
* In nearly all cases, we expand vmg->vma. There is one exception -
* merge_right where we partially span the VMA. In this case we shrink
@@ -839,16 +843,11 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
*/
expanded = !merge_right || merge_will_delete_vma;
- if (commit_merge(vmg, adjust,
- merge_will_delete_vma ? vma : NULL,
- merge_will_delete_next ? next : NULL,
- adj_start, expanded)) {
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
+ if (err || commit_merge(vmg, adjust,
+ merge_will_delete_vma ? vma : NULL,
+ merge_will_delete_next ? next : NULL,
+ adj_start, expanded))
+ goto abort;
res = merge_left ? prev : next;
khugepaged_enter_vma(res, vmg->flags);
@@ -859,7 +858,18 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -1033,9 +1043,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give upon OOM, we are safe to do so
+ * here, as commit merge provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1417,24 +1433,35 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
@@ -1492,12 +1519,15 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
vmg.flags = new_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
diff --git a/mm/vma.h b/mm/vma.h
index d58068c0ff2e..729fe3741e89 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -87,6 +87,12 @@ struct vma_merge_struct {
struct anon_vma_name *anon_name;
enum vma_merge_flags merge_flags;
enum vma_merge_state state;
+
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
};
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
@@ -303,7 +309,8 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index b930ab12a587..1dd119f266e6 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -17,6 +17,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/huge_mm.h>
+#include <linux/hugetlb.h>
#include <linux/hugetlb_inline.h>
#include <linux/kernel.h>
#include <linux/khugepaged.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3f9255dfacb0..3519c4e4f841 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -487,6 +487,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
+ int err = 0;
pte_t *pte;
/*
@@ -500,18 +501,25 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(ptep_get(pte))))
- return -EBUSY;
- if (WARN_ON(!page))
- return -ENOMEM;
- if (WARN_ON(!pfn_valid(page_to_pfn(page))))
- return -EINVAL;
+ if (WARN_ON(!pte_none(ptep_get(pte)))) {
+ err = -EBUSY;
+ break;
+ }
+ if (WARN_ON(!page)) {
+ err = -ENOMEM;
+ break;
+ }
+ if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
+ err = -EINVAL;
+ break;
+ }
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
- return 0;
+
+ return err;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
@@ -586,13 +594,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
@@ -1940,7 +1948,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va_size(va);
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -3095,7 +3103,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3128,6 +3136,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -4067,6 +4076,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
*/
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
size_t old_size = 0;
void *n;
@@ -4076,15 +4087,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
}
if (p) {
- struct vm_struct *vm;
-
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
- old_size = get_vm_area_size(vm);
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
}
/*
@@ -4092,11 +4105,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags))
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
- kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
return (void *)p;
}
@@ -4918,28 +4946,29 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and VM_UNINITIALIZED bit in v->flags is disabled.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
@@ -4967,6 +4996,10 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_area *va;
struct vm_struct *v;
int i;
+ unsigned int *counters;
+
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
for (i = 0; i < nr_vmap_nodes; i++) {
vn = &vmap_nodes[i];
@@ -4983,6 +5016,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5017,7 +5055,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5027,19 +5067,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 67a680e4b484..e3c1e2e1560d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1053,7 +1053,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1079,6 +1079,21 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ /*
+ * unmap_poisoned_folio() can't handle large
+ * folio, just skip it. memory_failure() will
+ * handle it if the UCE is triggered again.
+ */
+ if (folio_test_large(folio))
+ goto keep_locked;
+
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1522,8 +1537,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_reclaimed += stat->nr_demoted;
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -4637,6 +4653,9 @@ retry:
reset_batch_size(walk);
}
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ stat.nr_demoted);
+
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
@@ -7553,7 +7572,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 16a07def09c9..e4326af00e5e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -976,6 +976,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
migrate_lock_init(zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 0030ce8fecfc..00d51d013757 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -812,36 +815,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
+ struct crypto_acomp *acomp = NULL;
+ struct acomp_req *req = NULL;
+ u8 *buffer = NULL;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
ret = PTR_ERR(acomp);
- goto acomp_fail;
+ goto fail;
}
- acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
- req = acomp_request_alloc(acomp_ctx->acomp);
+ req = acomp_request_alloc(acomp);
if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
ret = -ENOMEM;
- goto req_fail;
+ goto fail;
}
- acomp_ctx->req = req;
+ /*
+ * Only hold the mutex after completing allocations, otherwise we may
+ * recurse into zswap through reclaim and attempt to hold the mutex
+ * again resulting in a deadlock.
+ */
+ mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
+
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -850,12 +858,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ acomp_ctx->buffer = buffer;
+ acomp_ctx->acomp = acomp;
+ acomp_ctx->is_sleepable = acomp_is_async(acomp);
+ acomp_ctx->req = req;
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
+fail:
+ if (acomp)
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return ret;
}
@@ -863,18 +876,60 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
+
+ mutex_lock(&acomp_ctx->mutex);
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
+ mutex_unlock(&acomp_ctx->mutex);
+
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return 0;
}
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
+
static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
{
struct crypto_acomp_ctx *acomp_ctx;
@@ -887,10 +942,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_folio(&input, folio, PAGE_SIZE, 0);
@@ -943,7 +995,7 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -954,9 +1006,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -980,10 +1030,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************