Diffstat (limited to 'mm')
 mm/damon/core.c         | 10
 mm/debug.c              |  7
 mm/filemap.c            | 17
 mm/gup.c                | 11
 mm/huge_memory.c        | 31
 mm/hugetlb.c            | 35
 mm/internal.h           | 12
 mm/kasan/kasan_test_c.c |  2
 mm/kasan/report.c       |  6
 mm/khugepaged.c         |  7
 mm/kmemleak.c           |  4
 mm/list_lru.c           |  2
 mm/memblock.c           |  4
 mm/memcontrol-v1.h      |  2
 mm/memfd.c              |  2
 mm/memory.c             | 18
 mm/mempolicy.c          |  7
 mm/migrate.c            |  2
 mm/mmap.c               |  7
 mm/mremap.c             | 32
 mm/page-writeback.c     | 10
 mm/page_alloc.c         |  9
 mm/pgtable-generic.c    |  2
 mm/readahead.c          | 11
 mm/shmem.c              | 37
 mm/slub.c               | 21
 mm/util.c               |  7
 mm/vma.c                |  7
 mm/vmalloc.c            |  9
 mm/vmscan.c             | 12
 mm/vmstat.c             | 15
 mm/zswap.c              | 90
 32 files changed, 309 insertions(+), 139 deletions(-)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8b8e2933dcd4..0776452a1abb 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src)
NUMA_NO_NODE);
if (!new_scheme)
return -ENOMEM;
+ err = damos_commit(new_scheme, src_scheme);
+ if (err) {
+ damon_destroy_scheme(new_scheme);
+ return err;
+ }
damon_add_scheme(dst, new_scheme);
}
return 0;
@@ -961,8 +966,11 @@ static int damon_commit_targets(
return -ENOMEM;
err = damon_commit_target(new_target, false,
src_target, damon_target_has_pid(src));
- if (err)
+ if (err) {
+ damon_destroy_target(new_target);
return err;
+ }
+ damon_add_target(dst, new_target);
}
return 0;
}
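The damon/core.c hunks follow a commit-or-destroy shape: a freshly allocated object is linked into the destination list only after every fallible step has succeeded, and is destroyed before the error is propagated otherwise. A minimal standalone sketch of that shape (illustrative names only, not the DAMON API):

        #include <stdlib.h>

        struct node { int val; struct node *next; };

        /* Link a copy of src_val into dst only if the commit step succeeds. */
        static int commit_node(struct node *dst, int src_val)
        {
                struct node *n = malloc(sizeof(*n));

                if (!n)
                        return -1;              /* -ENOMEM */
                if (src_val < 0) {              /* the fallible commit step */
                        free(n);                /* destroy before reporting */
                        return -1;
                }
                n->val = src_val;
                n->next = dst->next;            /* link only after success */
                dst->next = n;
                return 0;
        }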
diff --git a/mm/debug.c b/mm/debug.c
index aa57d3ffd4ed..95b6ab809c0e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,19 +124,22 @@ static void __dump_page(const struct page *page)
{
struct folio *foliop, folio;
struct page precise;
+ unsigned long head;
unsigned long pfn = page_to_pfn(page);
unsigned long idx, nr_pages = 1;
int loops = 5;
again:
memcpy(&precise, page, sizeof(*page));
- foliop = page_folio(&precise);
- if (foliop == (struct folio *)&precise) {
+ head = precise.compound_head;
+ if ((head & 1) == 0) {
+ foliop = (struct folio *)&precise;
idx = 0;
if (!folio_test_large(foliop))
goto dump;
foliop = (struct folio *)page;
} else {
+ foliop = (struct folio *)(head - 1);
idx = folio_page_idx(foliop, page);
}
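The __dump_page() change decodes compound_head by hand: bit 0 set means "tail page, and the remaining bits are the address of the head page"; bit 0 clear means the page is itself a head (or order-0) page. Calling page_folio() on the stack copy would resolve to the copy rather than the real folio, which is why the tag bit is inspected directly. A self-contained illustration of the tagging scheme (userspace stand-in types, not the kernel's struct page):

        #include <assert.h>
        #include <stdint.h>

        struct fpage { unsigned long compound_head; };

        int main(void)
        {
                struct fpage head = { 0 };                      /* bit 0 clear: head page */
                struct fpage tail = { (uintptr_t)&head + 1 };   /* bit 0 set: tail page */

                assert((head.compound_head & 1) == 0);
                assert((tail.compound_head & 1) == 1);
                assert((struct fpage *)(tail.compound_head - 1) == &head);
                return 0;
        }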
diff --git a/mm/filemap.c b/mm/filemap.c
index 7c76a123ba18..4f476411a9a2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,15 +124,6 @@
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
-static void mapping_set_update(struct xa_state *xas,
- struct address_space *mapping)
-{
- if (dax_mapping(mapping) || shmem_mapping(mapping))
- return;
- xas_set_update(xas, workingset_update_node);
- xas_set_lru(xas, &shadow_nodes);
-}
-
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
@@ -1532,7 +1523,7 @@ void folio_end_read(struct folio *folio, bool success)
/* Must be in bottom byte for x86 to work */
BUILD_BUG_ON(PG_uptodate > 7);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+ VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
if (likely(success))
mask |= 1 << PG_uptodate;
@@ -3005,7 +2996,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
- start = (start + bsz) & ~(bsz - 1);
+ start = (start + bsz) & ~((u64)bsz - 1);
offset += bsz;
} while (offset < folio_size(folio));
unlock:
@@ -3501,10 +3492,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
continue;
if (xa_is_value(folio))
continue;
- if (folio_test_locked(folio))
- continue;
if (!folio_try_get(folio))
continue;
+ if (folio_test_locked(folio))
+ goto skip;
/* Has the page moved or been split? */
if (unlikely(folio != xas_reload(xas)))
goto skip;
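Two of the filemap.c fixes are subtle. The mask change in folio_seek_hole_data() matters because bsz is a 32-bit type: ~(bsz - 1) is computed in unsigned int and zero-extends when ANDed with a 64-bit offset, silently clearing the upper 32 bits of start for files larger than 4 GiB. A worked demonstration:

        #include <stdio.h>

        int main(void)
        {
                long long start = 0x100000123LL;        /* just past 4 GiB */
                unsigned int bsz = 4096;

                long long bad  = (start + bsz) & ~(bsz - 1);
                long long good = (start + bsz) & ~((unsigned long long)bsz - 1);

                printf("bad=0x%llx good=0x%llx\n", bad, good);
                /* prints bad=0x1000 good=0x100001000 */
                return 0;
        }

The next_uptodate_folio() reordering takes the folio reference before testing the lock bit: inspecting flags on a folio that is not pinned races with the folio being freed and reused.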
diff --git a/mm/gup.c b/mm/gup.c
index 746070a1d8bf..3b75e631f369 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -52,7 +52,12 @@ static inline void sanity_check_pinned_pages(struct page **pages,
*/
for (; npages; npages--, pages++) {
struct page *page = *pages;
- struct folio *folio = page_folio(page);
+ struct folio *folio;
+
+ if (!page)
+ continue;
+
+ folio = page_folio(page);
if (is_zero_page(page) ||
!folio_test_anon(folio))
@@ -409,6 +414,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
+ if (!pages[i]) {
+ nr = 1;
+ continue;
+ }
folio = gup_folio_next(pages, npages, i, &nr);
gup_put_folio(folio, nr, FOLL_PIN);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ee335d96fc39..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
folio_throttle_swaprate(folio, gfp);
/*
- * When a folio is not zeroed during allocation (__GFP_ZERO not used),
- * folio_zero_user() is used to make sure that the page corresponding
- * to the faulting address will be hot in the cache after zeroing.
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used)
+ * or user folios require special handling, folio_zero_user() is used to
+ * make sure that the page corresponding to the faulting address will be
+ * hot in the cache after zeroing.
*/
- if (!alloc_zeroed())
+ if (user_alloc_needs_zeroing())
folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
@@ -2205,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2243,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
@@ -3576,7 +3589,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -3688,7 +3701,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -3732,7 +3745,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
- __folio_set_partially_mapped(folio);
+ folio_set_partially_mapped(folio);
if (folio_test_pmd_mappable(folio))
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
@@ -3825,7 +3838,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with folio_put() */
if (folio_test_partially_mapped(folio)) {
- __folio_clear_partially_mapped(folio);
+ folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
@@ -4168,7 +4181,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
size_t input_len = strlen(input_buf);
tok = strsep(&buf, ",");
- if (tok) {
+ if (tok && buf) {
strscpy(file_path, tok);
} else {
ret = -EINVAL;
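The split_huge_pages_write() fix relies on a strsep() subtlety: when the delimiter is absent, strsep() still returns a non-NULL token (the whole string) and signals the miss by setting the string pointer itself to NULL. Checking only tok therefore accepted malformed input with no comma; checking tok && buf rejects it. Demonstration:

        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                char one[] = "file,0x1000";
                char two[] = "file-without-comma";
                char *rest, *tok;

                rest = one;
                tok = strsep(&rest, ",");
                printf("tok=%s rest=%s\n", tok, rest);          /* tok=file rest=0x1000 */

                rest = two;
                tok = strsep(&rest, ",");
                printf("tok=%s rest=%p\n", tok, (void *)rest);  /* rest=(nil): reject */
                return 0;
        }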
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea2ed8e301ef..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5340,7 +5340,7 @@ again:
break;
}
ret = copy_user_large_folio(new_folio, pte_folio,
- ALIGN_DOWN(addr, sz), dst_vma);
+ addr, dst_vma);
folio_put(pte_folio);
if (ret) {
folio_put(new_folio);
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
@@ -6643,8 +6655,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
*foliop = NULL;
goto out;
}
- ret = copy_user_large_folio(folio, *foliop,
- ALIGN_DOWN(dst_addr, size), dst_vma);
+ ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
folio_put(*foliop);
*foliop = NULL;
if (ret) {
@@ -7212,7 +7223,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
- get_page(virt_to_page(spte));
+ ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
break;
}
}
@@ -7227,7 +7238,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
- put_page(virt_to_page(spte));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
}
spin_unlock(&mm->page_table_lock);
out:
@@ -7239,10 +7250,6 @@ out:
/*
* unmap huge page backed by shared pte.
*
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7251,18 +7258,20 @@ out:
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
+ if (sz != PMD_SIZE)
+ return 0;
+ if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
- put_page(virt_to_page(ptep));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
}
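The hugetlb.c hunks stop overloading the page refcount as the PMD-sharing count: a dedicated ptdesc counter (ptdesc_pmd_pts_inc/dec/count) records how many VMAs share the PMD table, so unrelated references to the page can no longer be mistaken for sharing. A rough sketch of the separation (illustrative fields, not the kernel's ptdesc layout):

        #include <stdatomic.h>
        #include <stdbool.h>

        struct ptdesc_like {
                atomic_int refcount;    /* lifetime: many unrelated users */
                atomic_int pmd_shares;  /* sharing: only share/unshare touch it */
        };

        static void pts_share(struct ptdesc_like *p)
        {
                atomic_fetch_add(&p->pmd_shares, 1);
        }

        /* Returns true if a share was dropped, false if not shared at all. */
        static bool pts_unshare(struct ptdesc_like *p)
        {
                if (atomic_load(&p->pmd_shares) == 0)
                        return false;
                atomic_fetch_sub(&p->pmd_shares, 1);
                return true;
        }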
diff --git a/mm/internal.h b/mm/internal.h
index cb8d8e8e3ffa..9826f7dce607 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
-static inline bool alloc_zeroed(void)
-{
- return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
- &init_on_alloc);
-}
-
/*
* Parses a string with mem suffixes into its order. Useful to parse kernel
* parameters.
@@ -1510,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
+#define mapping_set_update(xas, mapping) do { \
+ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \
+ xas_set_update(xas, workingset_update_node); \
+ xas_set_lru(xas, &shadow_nodes); \
+ } \
+} while (0)
/* mremap.c */
unsigned long move_page_tables(struct vm_area_struct *vma,
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index e0ec5a6d15be..99d4ff0ed57a 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -33,7 +33,7 @@
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
-MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
static bool multishot;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 50fb19ad4388..3fe77a360f1c 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -201,7 +201,7 @@ static inline void fail_non_kasan_kunit_test(void) { }
#endif /* CONFIG_KUNIT */
-static DEFINE_SPINLOCK(report_lock);
+static DEFINE_RAW_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
@@ -212,7 +212,7 @@ static void start_report(unsigned long *flags, bool sync)
lockdep_off();
/* Make sure we don't end up in loop. */
report_suppress_start();
- spin_lock_irqsave(&report_lock, *flags);
+ raw_spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
@@ -222,7 +222,7 @@ static void end_report(unsigned long *flags, const void *addr, bool is_write)
trace_error_report_end(ERROR_DETECTOR_KASAN,
(unsigned long)addr);
pr_err("==================================================================\n");
- spin_unlock_irqrestore(&report_lock, *flags);
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
check_panic_on_warn("KASAN");
switch (kasan_arg_fault) {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6f8d46d107b4..bad1e130eda8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -19,6 +19,7 @@
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <asm/tlb.h>
@@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
if (result != SCAN_SUCCEED)
goto out;
+ mapping_set_update(&xas, mapping);
+
__folio_set_locked(new_folio);
if (is_shmem)
__folio_set_swapbacked(new_folio);
@@ -2419,7 +2422,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2765,7 +2768,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2a945c07ae99..820ba3b5cbfc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq,
for (i = 0; i < nr_entries; i++) {
void *ptr = (void *)entries[i];
- warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " %pS\n", ptr);
}
}
@@ -1093,7 +1093,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
- create_object_percpu((__force unsigned long)ptr, size, 0, gfp);
+ create_object_percpu((__force unsigned long)ptr, size, 1, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f93ada6a207b..7d69434c70e0 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -77,7 +77,6 @@ again:
spin_lock(&l->lock);
nr_items = READ_ONCE(l->nr_items);
if (likely(nr_items != LONG_MIN)) {
- WARN_ON(nr_items < 0);
rcu_read_unlock();
return l;
}
@@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
list_splice_init(&src->list, &dst->list);
if (src->nr_items) {
+ WARN_ON(src->nr_items < 0);
dst->nr_items += src->nr_items;
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 0389ce5cd281..095c18b5c430 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -735,7 +735,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
/**
* memblock_validate_numa_coverage - check if amount of memory with
* no node ID assigned is less than a threshold
- * @threshold_bytes: maximal number of pages that can have unassigned node
+ * @threshold_bytes: maximal memory size that can have unassigned node
* ID (in bytes).
*
* A buggy firmware may report memory that does not belong to any node.
@@ -755,7 +755,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
nr_pages += end_pfn - start_pfn;
}
- if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) {
+ if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
mem_size_mb = memblock_phys_mem_size() >> 20;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
(nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 0e3b82951d91..144d71b65907 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -38,7 +38,7 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
iter = mem_cgroup_iter(NULL, iter, NULL))
/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
+static inline bool do_memsw_account(void)
{
return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index c17c3ea701a1..35a370d75c9a 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
return error;
}
-static unsigned int *memfd_file_seals_ptr(struct file *file)
+unsigned int *memfd_file_seals_ptr(struct file *file)
{
if (shmem_file(file))
return &SHMEM_I(file_inode(file))->seals;
diff --git a/mm/memory.c b/mm/memory.c
index 75c2dfd04f72..398c031be9ba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
folio_throttle_swaprate(folio, gfp);
/*
* When a folio is not zeroed during allocation
- * (__GFP_ZERO not used), folio_zero_user() is used
- * to make sure that the page corresponding to the
- * faulting address will be hot in the cache after
- * zeroing.
+ * (__GFP_ZERO not used) or user folios require special
+ * handling, folio_zero_user() is used to make sure
+ * that the page corresponding to the faulting address
+ * will be hot in the cache after zeroing.
*/
- if (!alloc_zeroed())
+ if (user_alloc_needs_zeroing())
folio_zero_user(folio, vmf->address);
return folio;
}
@@ -6815,9 +6815,10 @@ static inline int process_huge_page(
return 0;
}
-static void clear_gigantic_page(struct folio *folio, unsigned long addr,
+static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
unsigned int nr_pages)
{
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
int i;
might_sleep();
@@ -6851,13 +6852,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
}
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
- unsigned long addr,
+ unsigned long addr_hint,
struct vm_area_struct *vma,
unsigned int nr_pages)
{
- int i;
+ unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
struct page *dst_page;
struct page *src_page;
+ int i;
for (i = 0; i < nr_pages; i++) {
dst_page = folio_page(dst, i);
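clear_gigantic_page() and copy_user_gigantic_page() now take the raw faulting address as a hint and derive the folio-aligned base internally, which is why the hugetlb.c callers above drop their own ALIGN_DOWN(). For a 2 MiB folio the arithmetic looks like:

        #include <stdio.h>

        /* a must be a power of two, as folio sizes are */
        #define ALIGN_DOWN(x, a) ((x) & ~((unsigned long)(a) - 1))

        int main(void)
        {
                unsigned long hint = 0x7f1234567000UL;  /* faulting address */
                unsigned long sz   = 2UL << 20;         /* 2 MiB folio */

                printf("base=0x%lx\n", ALIGN_DOWN(hint, sz));
                /* prints base=0x7f1234400000 */
                return 0;
        }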
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bb37cd1a51d8..162407fbf2bc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1080,6 +1080,10 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
mmap_read_lock(mm);
vma = find_vma(mm, 0);
+ if (unlikely(!vma)) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
/*
* This does not migrate the range, but isolates all pages that
@@ -2264,7 +2268,8 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
page = __alloc_pages_noprof(gfp, order, nid, nodemask);
- if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
if (static_branch_likely(&vm_numa_stat_key) &&
page_to_nid(page) == nid) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 2ce6b4b814df..cc68583c86f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -745,7 +745,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_readahead(newfolio);
folio_copy_owner(newfolio, folio);
- pgalloc_tag_copy(newfolio, folio);
+ pgalloc_tag_swap(newfolio, folio);
mem_cgroup_migrate(folio, newfolio);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 386429f7db5a..aec208f90337 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
+#include <linux/memfd.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
+ unsigned int seals = memfd_file_seals(file);
unsigned long flags_mask;
if (!file_mmap_ok(file, inode, pgoff, len))
@@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+ else if (is_readonly_sealed(seals, vm_flags))
+ vm_flags &= ~VM_MAYWRITE;
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
@@ -888,7 +892,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (get_area) {
addr = get_area(file, addr, len, pgoff, flags);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
+ && !addr /* no hint */
&& IS_ALIGNED(len, PMD_SIZE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
addr = thp_get_unmapped_area_vmflags(file, addr, len,
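The do_mmap() change makes a write-sealed memfd mappable MAP_SHARED read-only: instead of the seal check refusing the mapping outright, VM_MAYWRITE is dropped so a later mprotect(PROT_WRITE) fails instead. A userspace sketch of the behaviour this enables (standard memfd API; before this series the mmap() itself failed):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

                ftruncate(fd, 4096);
                fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);

                /* Read-only shared mapping of a write-sealed memfd. */
                void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

                printf("mmap: %s\n", p == MAP_FAILED ? "failed" : "ok");
                return 0;
        }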
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d213ead95675..d9861e42b2bd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -692,6 +692,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages)
unsigned long ratio;
global_dirty_limits(&background_thresh, &dirty_thresh);
+ if (!dirty_thresh)
+ return -EINVAL;
ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
return ratio;
@@ -790,13 +792,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
int ret;
unsigned long pages = min_bytes >> PAGE_SHIFT;
- unsigned long min_ratio;
+ long min_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
min_ratio = bdi_ratio_from_pages(pages);
+ if (min_ratio < 0)
+ return min_ratio;
return __bdi_set_min_ratio(bdi, min_ratio);
}
@@ -809,13 +813,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
int ret;
unsigned long pages = max_bytes >> PAGE_SHIFT;
- unsigned long max_ratio;
+ long max_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
max_ratio = bdi_ratio_from_pages(pages);
+ if (max_ratio < 0)
+ return max_ratio;
return __bdi_set_max_ratio(bdi, max_ratio);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1cb4b8c8886d..01eab25edf89 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1238,13 +1238,15 @@ static void split_large_buddy(struct zone *zone, struct page *page,
if (order > pageblock_order)
order = pageblock_order;
- while (pfn != end) {
+ do {
int mt = get_pfnblock_migratetype(page, pfn);
__free_one_page(page, pfn, zone, order, mt, fpi);
pfn += 1 << order;
+ if (pfn == end)
+ break;
page = pfn_to_page(pfn);
- }
+ } while (1);
}
static void free_one_page(struct zone *zone, struct page *page,
@@ -5690,10 +5692,13 @@ __meminit void zone_pcp_init(struct zone *zone)
zone->present_pages, zone_batchsize(zone));
}
+static void setup_per_zone_lowmem_reserve(void);
+
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
totalram_pages_add(count);
+ setup_per_zone_lowmem_reserve();
}
EXPORT_SYMBOL(adjust_managed_page_count);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5297dcc38c37..5a882f2b10f9 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
unsigned long irqflags;
pmd_t pmdval;
diff --git a/mm/readahead.c b/mm/readahead.c
index 8f1cf599b572..e151f4b13ca4 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -458,8 +458,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
struct file_ra_state *ra, unsigned int new_order)
{
struct address_space *mapping = ractl->mapping;
- pgoff_t start = readahead_index(ractl);
- pgoff_t index = start;
+ pgoff_t index = readahead_index(ractl);
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
@@ -522,7 +521,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
if (!err)
return;
fallback:
- do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size);
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
}
static unsigned long ractl_max_pages(struct readahead_control *ractl,
@@ -647,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
1UL << order);
if (index == expected) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ /*
+ * In the case of MADV_HUGEPAGE, the actual size might exceed
+ * the readahead window.
+ */
+ ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
ra->async_size = ra->size;
goto readit;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 274c2666f457..532afd8e049c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -787,6 +787,14 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void shmem_update_stats(struct folio *folio, int nr_pages)
+{
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+}
+
/*
* Somewhat like filemap_add_folio, but error if expected item has gone.
*/
@@ -821,10 +829,7 @@ static int shmem_add_to_page_cache(struct folio *folio,
xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ shmem_update_stats(folio, nr);
mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
@@ -852,8 +857,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
error = shmem_replace_entry(mapping, folio->index, folio, radswap);
folio->mapping = NULL;
mapping->nrpages -= nr;
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ shmem_update_stats(folio, -nr);
xa_unlock_irq(&mapping->i_pages);
folio_put_refs(folio, nr);
BUG_ON(error);
@@ -1531,7 +1535,7 @@ try_split:
!shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
- shmem_falloc->nr_unswapped++;
+ shmem_falloc->nr_unswapped += nr_pages;
else
shmem_falloc = NULL;
spin_unlock(&inode->i_lock);
@@ -1685,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
unsigned long vm_flags = vma ? vma->vm_flags : 0;
+ pgoff_t aligned_index;
bool global_huge;
loff_t i_size;
int order;
@@ -1719,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
/* Allow mTHP that will be fully within i_size. */
order = highest_order(within_size_orders);
while (within_size_orders) {
- index = round_up(index + 1, order);
+ aligned_index = round_up(index + 1, 1 << order);
i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index) {
+ if (i_size >> PAGE_SHIFT >= aligned_index) {
mask |= within_size_orders;
break;
}
@@ -1969,10 +1974,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
if (!error) {
mem_cgroup_replace_folio(old, new);
- __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
- __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
- __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
- __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
+ shmem_update_stats(new, nr_pages);
+ shmem_update_stats(old, -nr_pages);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -3914,6 +3917,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len;
struct inode *inode;
struct folio *folio;
+ char *link;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3935,12 +3939,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- inode->i_link = kmemdup(symname, len, GFP_KERNEL);
- if (!inode->i_link) {
+ link = kmemdup(symname, len, GFP_KERNEL);
+ if (!link) {
error = -ENOMEM;
goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
+ inode_set_cached_link(inode, link, len - 1);
} else {
inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops;
@@ -4365,7 +4370,7 @@ static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *
bool latest_version)
{
struct shmem_options *ctx = fc->fs_private;
- unsigned int version = UTF8_LATEST;
+ int version = UTF8_LATEST;
struct unicode_map *encoding;
char *version_str = param->string + 5;
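The shmem_allowable_huge_orders() fix is an argument-units bug: round_up()'s second argument is the alignment itself, not its log2, so passing the raw order rounded the index to a multiple of e.g. 9 rather than 512 pages and tested i_size against the wrong boundary. Worked example for order 9:

        #include <stdio.h>

        /* kernel round_up(); y must be the alignment, a power of two */
        #define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)

        int main(void)
        {
                unsigned long index = 0, order = 9;

                printf("buggy=%lu fixed=%lu\n",
                       round_up(index + 1, order),          /* 9: order misused as alignment */
                       round_up(index + 1, 1UL << order));  /* 512 */
                return 0;
        }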
diff --git a/mm/slub.c b/mm/slub.c
index 19980419b176..c2151c9fee22 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2189,9 +2189,24 @@ bool memcg_slab_post_charge(void *p, gfp_t flags)
folio = virt_to_folio(p);
if (!folio_test_slab(folio)) {
- return folio_memcg_kmem(folio) ||
- (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
- folio_order(folio)) == 0);
+ int size;
+
+ if (folio_memcg_kmem(folio))
+ return true;
+
+ if (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
+ folio_order(folio)))
+ return false;
+
+ /*
+ * This folio has already been accounted in the global stats but
+ * not in the memcg stats. So, subtract from the global and use
+ * the interface which adds to both global and memcg stats.
+ */
+ size = folio_size(folio);
+ node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size);
+ lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size);
+ return true;
}
slab = folio_slab(folio);
diff --git a/mm/util.c b/mm/util.c
index c1c3b06ab4f9..60aa40f612b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len)
{
char *p;
- /*
- * Always use GFP_KERNEL, since copy_from_user() can sleep and
- * cause pagefault, which makes it pointless to use GFP_NOFS
- * or GFP_ATOMIC.
- */
- p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+ p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
diff --git a/mm/vma.c b/mm/vma.c
index 8a454a7bbc80..bb2119e5a0d0 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -35,7 +35,7 @@ struct mmap_state {
.mm = mm_, \
.vmi = vmi_, \
.addr = addr_, \
- .end = (addr_) + len, \
+ .end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
.flags = flags_, \
@@ -2460,10 +2460,13 @@ unsigned long __mmap_region(struct file *file, unsigned long addr,
/* If flags changed, we might be able to merge, so try again. */
if (map.retry_merge) {
+ struct vm_area_struct *merged;
VMG_MMAP_STATE(vmg, &map, vma);
vma_iter_config(map.vmi, map.addr, map.end);
- vma_merge_existing_range(&vmg);
+ merged = vma_merge_existing_range(&vmg);
+ if (merged)
+ vma = merged;
}
__mmap_complete(&map, vma);
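The one-character mm/vma.c fix is a macro-hygiene bug: the MMAP_STATE() initializer computed .end from len, a caller-scope variable, instead of its len_ parameter, and compiled only because every call site happened to have a local named len. A minimal reproduction of the hazard:

        #include <stdio.h>

        #define BAD_END(addr_, len_)   ((addr_) + len)      /* captures caller's len */
        #define GOOD_END(addr_, len_)  ((addr_) + (len_))

        int main(void)
        {
                unsigned long len = 100;        /* unrelated local */

                printf("bad=0x%lx good=0x%lx\n",
                       BAD_END(0x1000UL, 4096UL),       /* 0x1064: used len above */
                       GOOD_END(0x1000UL, 4096UL));     /* 0x2000 */
                return 0;
        }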
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ed39d104201..5c88d0e90c20 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3374,7 +3374,8 @@ void vfree(const void *addr)
struct page *page = vm->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3382,7 +3383,8 @@ void vfree(const void *addr)
__free_page(page);
cond_resched();
}
- atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -4093,7 +4095,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
/* Zero out spare memory. */
if (want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
-
+ kasan_poison_vmalloc(p + size, old_size - size);
+ kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
return (void *)p;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76378bc257e3..b1ec5ece067e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
+ /*
+ * If there are no reclaimable file-backed or anonymous pages,
+ * ensure zones with sufficient free pages are not skipped.
+ * This prevents zones like DMA32 from being ignored in reclaim
+ * scenarios where they can still help alleviate memory pressure.
+ */
+ if (nr == 0)
+ nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
return nr;
}
@@ -4635,6 +4642,9 @@ retry:
reset_batch_size(walk);
}
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ stat.nr_demoted);
+
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d016314a56c..16bfe1c694dd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2122,10 +2122,20 @@ static void __init start_shepherd_timer(void)
{
int cpu;
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
+ /*
+ * For secondary CPUs during CPU hotplug scenarios,
+ * vmstat_cpu_online() will enable the work.
+ * mm/vmstat:online enables and disables vmstat_work
+ * symmetrically during CPU hotplug events.
+ */
+ if (!cpu_online(cpu))
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ }
+
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -2148,13 +2158,14 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
}
+ enable_delayed_work(&per_cpu(vmstat_work, cpu));
return 0;
}
static int vmstat_cpu_down_prep(unsigned int cpu)
{
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
return 0;
}
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..b84c20d889b1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -817,36 +820,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
+ struct crypto_acomp *acomp = NULL;
+ struct acomp_req *req = NULL;
+ u8 *buffer = NULL;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!buffer) {
+ ret = -ENOMEM;
+ goto fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
ret = PTR_ERR(acomp);
- goto acomp_fail;
+ goto fail;
}
- acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
- req = acomp_request_alloc(acomp_ctx->acomp);
+ req = acomp_request_alloc(acomp);
if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
ret = -ENOMEM;
- goto req_fail;
+ goto fail;
}
- acomp_ctx->req = req;
+ /*
+ * Only hold the mutex after completing allocations, otherwise we may
+ * recurse into zswap through reclaim and attempt to hold the mutex
+ * again resulting in a deadlock.
+ */
+ mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
+
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -855,12 +863,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ acomp_ctx->buffer = buffer;
+ acomp_ctx->acomp = acomp;
+ acomp_ctx->is_sleepable = acomp_is_async(acomp);
+ acomp_ctx->req = req;
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
+fail:
+ if (acomp)
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return ret;
}
@@ -869,17 +882,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx)) {
if (!IS_ERR_OR_NULL(acomp_ctx->req))
acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
}
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
}
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
+
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -893,10 +934,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -949,7 +987,7 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -960,9 +998,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -986,10 +1022,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************
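The zswap changes close a race between the per-CPU context lookup and CPU hotplug: raw_cpu_ptr() is taken without pinning the task, so it can migrate and the original CPU can go offline, with zswap_cpu_comp_dead() freeing ctx->req, before the mutex is acquired. acomp_ctx_get_cpu_lock() therefore locks, revalidates req, and retries on whatever CPU the task now runs on; moving mutex_init() to pool creation guarantees the mutex exists before any of this runs. A userspace sketch of the lock-and-revalidate idiom (pthread stand-ins, not the kernel primitives):

        #include <pthread.h>
        #include <stddef.h>

        struct ctx {
                pthread_mutex_t lock;
                void *req;              /* set to NULL when torn down */
        };

        /* current_slot() may return a stale slot; lock, then revalidate. */
        static struct ctx *ctx_get_lock(struct ctx *(*current_slot)(void))
        {
                for (;;) {
                        struct ctx *c = current_slot();

                        pthread_mutex_lock(&c->lock);
                        if (c->req)
                                return c;       /* still live: use under lock */
                        pthread_mutex_unlock(&c->lock); /* torn down: retry */
                }
        }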