summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c19
-rw-r--r--mm/compaction.c10
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/huge_memory.c18
-rw-r--r--mm/hugetlb.c27
-rw-r--r--mm/kasan/hw_tags.c5
-rw-r--r--mm/kasan/kasan.h10
-rw-r--r--mm/kasan/quarantine.c7
-rw-r--r--mm/kfence/core.c31
-rw-r--r--mm/kfence/kfence.h21
-rw-r--r--mm/kfence/report.c47
-rw-r--r--mm/kmemleak.c8
-rw-r--r--mm/list_lru.c6
-rw-r--r--mm/memcontrol.c12
-rw-r--r--mm/memory-failure.c147
-rw-r--r--mm/mempolicy.c39
-rw-r--r--mm/migrate.c80
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/mmu_notifier.c14
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c54
-rw-r--r--mm/page_alloc.c6
-rw-r--r--mm/page_io.c58
-rw-r--r--mm/page_vma_mapped.c6
-rw-r--r--mm/readahead.c16
-rw-r--r--mm/secretmem.c17
-rw-r--r--mm/shmem.c31
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c9
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swapfile.c32
-rw-r--r--mm/userfaultfd.c15
-rw-r--r--mm/util.c11
-rw-r--r--mm/vmalloc.c64
-rw-r--r--mm/workingset.c2
39 files changed, 497 insertions, 358 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7176af65b103..ff60bd7d74e0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
@@ -390,7 +391,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
@@ -401,7 +401,7 @@ static void cgwb_release_workfn(struct work_struct *work)
mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
- blkcg_unpin_online(blkcg);
+ blkcg_unpin_online(wb->blkcg_css);
fprop_local_destroy_percpu(&wb->memcg_completions);
@@ -446,7 +446,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
{
struct mem_cgroup *memcg;
struct cgroup_subsys_state *blkcg_css;
- struct blkcg *blkcg;
struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
struct bdi_writeback *wb;
unsigned long flags;
@@ -454,9 +453,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
- blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
- blkcg_cgwb_list = &blkcg->cgwb_list;
+ blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
/* look up again under lock and discard on blkcg mismatch */
spin_lock_irqsave(&cgwb_lock, flags);
@@ -511,7 +509,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
- blkcg_pin_online(blkcg);
+ blkcg_pin_online(blkcg_css);
css_get(memcg_css);
css_get(blkcg_css);
}
@@ -724,18 +722,19 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
/**
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
- * @blkcg: blkcg being offlined
+ * @css: blkcg being offlined
*
* Also prevents creation of any new wb's associated with @blkcg.
*/
-void wb_blkcg_offline(struct blkcg *blkcg)
+void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
struct bdi_writeback *wb, *next;
+ struct list_head *list = blkcg_get_cgwb_list(css);
spin_lock_irq(&cgwb_lock);
- list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ list_for_each_entry_safe(wb, next, list, blkcg_node)
cgwb_kill(wb);
- blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
}
diff --git a/mm/compaction.c b/mm/compaction.c
index c3e37aa9ff9e..fe915db6149b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -26,6 +26,11 @@
#include "internal.h"
#ifdef CONFIG_COMPACTION
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+#define HPAGE_FRAG_CHECK_INTERVAL_MSEC (500)
+
static inline void count_compact_event(enum vm_event_item item)
{
count_vm_event(item);
@@ -51,11 +56,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
/*
- * Fragmentation score check interval for proactive compaction purposes.
- */
-static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
-
-/*
* Page order with-respect-to which proactive compaction
* calculates external fragmentation, which is used as
* the "fragmentation score" of a node/zone.
diff --git a/mm/filemap.c b/mm/filemap.c
index 3a5ffb5587cd..9a1eef6c5d35 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1063,12 +1063,6 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
-
- /*
- * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date,
- * and splice's page_cache_pipe_buf_confirm() needs to see that.
- */
- SetPageUptodate(ZERO_PAGE(0));
}
/*
diff --git a/mm/highmem.c b/mm/highmem.c
index 0cc0c4da7ed9..1a692997fac4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -624,7 +624,7 @@ void __kmap_local_sched_out(void)
/* With debug all even slots are unmapped and act as guard */
if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
- WARN_ON_ONCE(!pte_none(pteval));
+ WARN_ON_ONCE(pte_val(pteval) != 0);
continue;
}
if (WARN_ON_ONCE(pte_none(pteval)))
@@ -661,7 +661,7 @@ void __kmap_local_sched_in(void)
/* With debug all even slots are unmapped and act as guard */
if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
- WARN_ON_ONCE(!pte_none(pteval));
+ WARN_ON_ONCE(pte_val(pteval) != 0);
continue;
}
if (WARN_ON_ONCE(pte_none(pteval)))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2fe38212e07c..910a138e9859 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2145,15 +2145,14 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmd against. Otherwise we can end up replacing wrong folio.
*/
VM_BUG_ON(freeze && !folio);
- if (folio) {
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- if (folio != page_folio(pmd_page(*pmd)))
- goto out;
- }
+ VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd))
+ is_pmd_migration_entry(*pmd)) {
+ if (folio && folio != page_folio(pmd_page(*pmd)))
+ goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
+ }
out:
spin_unlock(ptl);
@@ -2496,11 +2495,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
+ bool is_hzp;
- VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(head), head);
VM_BUG_ON_PAGE(!PageCompound(head), head);
+ is_hzp = is_huge_zero_page(head);
+ VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ if (is_hzp)
+ return -EBUSY;
+
if (PageWriteback(head))
return -EBUSY;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b34f50156f7e..3fc721789743 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3475,7 +3475,6 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
int nr_nodes, node;
struct page *page;
- int rc = 0;
lockdep_assert_held(&hugetlb_lock);
@@ -3486,15 +3485,19 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
}
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
- if (!list_empty(&h->hugepage_freelists[node])) {
- page = list_entry(h->hugepage_freelists[node].next,
- struct page, lru);
- rc = demote_free_huge_page(h, page);
- break;
+ list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
+ if (PageHWPoison(page))
+ continue;
+
+ return demote_free_huge_page(h, page);
}
}
- return rc;
+ /*
+ * Only way to get here is if all pages on free lists are poisoned.
+ * Return -EBUSY so that caller will not retry.
+ */
+ return -EBUSY;
}
#define HSTATE_ATTR_RO(_name) \
@@ -6782,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
return ret;
}
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+ int ret;
+
+ spin_lock_irq(&hugetlb_lock);
+ ret = __get_huge_page_for_hwpoison(pfn, flags);
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 07a76c46daa5..9e1b6544bfa8 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -336,8 +336,6 @@ void __kasan_poison_vmalloc(const void *start, unsigned long size)
#endif
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-
void kasan_enable_tagging(void)
{
if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
@@ -347,6 +345,9 @@ void kasan_enable_tagging(void)
else
hw_enable_tagging_sync();
}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
EXPORT_SYMBOL_GPL(kasan_enable_tagging);
void kasan_force_async_fault(void)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d79b83d673b1..b01b4bbe0409 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -355,25 +355,27 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_set_mem_tag_range(addr, size, tag, init) \
arch_set_mem_tag_range((addr), (size), (tag), (init))
+void kasan_enable_tagging(void);
+
#else /* CONFIG_KASAN_HW_TAGS */
#define hw_enable_tagging_sync()
#define hw_enable_tagging_async()
#define hw_enable_tagging_asymm()
+static inline void kasan_enable_tagging(void) { }
+
#endif /* CONFIG_KASAN_HW_TAGS */
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_enable_tagging(void);
void kasan_force_async_fault(void);
-#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
+#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
-static inline void kasan_enable_tagging(void) { }
static inline void kasan_force_async_fault(void) { }
-#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
+#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
#ifdef CONFIG_KASAN_SW_TAGS
u8 kasan_random_tag(void);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 08291ed33e93..0a9def8ce5e8 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -315,6 +315,13 @@ static void per_cpu_remove_cache(void *arg)
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
+ /*
+ * Ensure the ordering between the writing to q->offline and
+ * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted
+ * by interrupt.
+ */
+ if (READ_ONCE(q->offline))
+ return;
qlist_move_cache(q, &to_free, cache);
qlist_free_all(&to_free, cache);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index a203747ad2c0..11a954763be9 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -231,27 +231,6 @@ static bool kfence_unprotect(unsigned long addr)
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
}
-static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
-{
- long index;
-
- /* The checks do not affect performance; only called from slow-paths. */
-
- if (!is_kfence_address((void *)addr))
- return NULL;
-
- /*
- * May be an invalid index if called with an address at the edge of
- * __kfence_pool, in which case we would report an "invalid access"
- * error.
- */
- index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
- if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
- return NULL;
-
- return &kfence_metadata[index];
-}
-
static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
{
unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
@@ -642,6 +621,16 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
+ for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
+ struct slab *slab = virt_to_slab(p);
+
+ if (!slab)
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 9a6c4b1b12a8..600f2e2431d6 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -96,6 +96,27 @@ struct kfence_metadata {
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+ long index;
+
+ /* The checks do not affect performance; only called from slow-paths. */
+
+ if (!is_kfence_address((void *)addr))
+ return NULL;
+
+ /*
+ * May be an invalid index if called with an address at the edge of
+ * __kfence_pool, in which case we would report an "invalid access"
+ * error.
+ */
+ index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
+ if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
+ return NULL;
+
+ return &kfence_metadata[index];
+}
+
/* KFENCE error types for report generation. */
enum kfence_error_type {
KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index f93a7b2a338b..f5a6d8ba3e21 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -273,3 +273,50 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
/* We encountered a memory safety error, taint the kernel! */
add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
}
+
+#ifdef CONFIG_PRINTK
+static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack)
+{
+ int i, j;
+
+ i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+ for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j)
+ kp_stack[j] = (void *)track->stack_entries[i];
+ if (j < KS_ADDRS_COUNT)
+ kp_stack[j] = NULL;
+}
+
+bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+{
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)object);
+ unsigned long flags;
+
+ if (!meta)
+ return false;
+
+ /*
+ * If state is UNUSED at least show the pointer requested; the rest
+ * would be garbage data.
+ */
+ kpp->kp_ptr = object;
+
+ /* Requesting info an a never-used object is almost certainly a bug. */
+ if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED))
+ return true;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+
+ kpp->kp_slab = slab;
+ kpp->kp_slab_cache = meta->cache;
+ kpp->kp_objp = (void *)meta->addr;
+ kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack);
+ if (meta->state == KFENCE_OBJECT_FREED)
+ kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack);
+ /* get_stack_skipnr() ensures the first entry is outside allocator. */
+ kpp->kp_ret = kpp->kp_stack[0];
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ return true;
+}
+#endif
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index acd7cbb82e16..a182f5ddaf68 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1132,7 +1132,7 @@ EXPORT_SYMBOL(kmemleak_no_scan);
void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
gfp_t gfp)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
kmemleak_alloc(__va(phys), size, min_count, gfp);
}
EXPORT_SYMBOL(kmemleak_alloc_phys);
@@ -1146,7 +1146,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
kmemleak_free_part(__va(phys), size);
}
EXPORT_SYMBOL(kmemleak_free_part_phys);
@@ -1158,7 +1158,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
*/
void __ref kmemleak_not_leak_phys(phys_addr_t phys)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
kmemleak_not_leak(__va(phys));
}
EXPORT_SYMBOL(kmemleak_not_leak_phys);
@@ -1170,7 +1170,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
kmemleak_ignore(__va(phys));
}
EXPORT_SYMBOL(kmemleak_ignore_phys);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index c669d87001a6..ba76428ceece 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -395,12 +395,6 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
struct list_lru_one *src, *dst;
/*
- * If there is no lru entry in this nlru, we can skip it immediately.
- */
- if (!READ_ONCE(nlru->nr_items))
- return;
-
- /*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 725f76723220..598fece89e2b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
/*
* Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
@@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void)
if (!spin_trylock_irqsave(&stats_flush_lock, flag))
return;
+ flush_next_time = jiffies_64 + 2*FLUSH_TIME;
cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void)
__mem_cgroup_flush_stats();
}
+void mem_cgroup_flush_stats_delayed(void)
+{
+ if (time_after64(jiffies_64, flush_next_time))
+ mem_cgroup_flush_stats();
+}
+
static void flush_memcg_stats_dwork(struct work_struct *w)
{
__mem_cgroup_flush_stats();
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dcb6bb9cf731..d4a4adcca01f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1274,7 +1274,7 @@ try_again:
}
out:
if (ret == -EIO)
- dump_page(p, "hwpoison: unhandlable page");
+ pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
return ret;
}
@@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
return 0;
}
-static int memory_failure_hugetlb(unsigned long pfn, int flags)
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ * 0 - free hugepage
+ * 1 - in-use hugepage
+ * 2 - not a hugepage
+ * -EBUSY - the hugepage is busy (try to retry)
+ * -EHWPOISON - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+ struct page *page = pfn_to_page(pfn);
+ struct page *head = compound_head(page);
+ int ret = 2; /* fallback to normal page handling */
+ bool count_increased = false;
+
+ if (!PageHeadHuge(head))
+ goto out;
+
+ if (flags & MF_COUNT_INCREASED) {
+ ret = 1;
+ count_increased = true;
+ } else if (HPageFreed(head) || HPageMigratable(head)) {
+ ret = get_page_unless_zero(head);
+ if (ret)
+ count_increased = true;
+ } else {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (TestSetPageHWPoison(head)) {
+ ret = -EHWPOISON;
+ goto out;
+ }
+
+ return ret;
+out:
+ if (count_increased)
+ put_page(head);
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Taking refcount of hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done in single hugetlb_lock range.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
- struct page *p = pfn_to_page(pfn);
- struct page *head = compound_head(p);
int res;
+ struct page *p = pfn_to_page(pfn);
+ struct page *head;
unsigned long page_flags;
+ bool retry = true;
- if (TestSetPageHWPoison(head)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
- res = -EHWPOISON;
- if (flags & MF_ACTION_REQUIRED)
+ *hugetlb = 1;
+retry:
+ res = get_huge_page_for_hwpoison(pfn, flags);
+ if (res == 2) { /* fallback to normal page handling */
+ *hugetlb = 0;
+ return 0;
+ } else if (res == -EHWPOISON) {
+ pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+ if (flags & MF_ACTION_REQUIRED) {
+ head = compound_head(p);
res = kill_accessing_process(current, page_to_pfn(head), flags);
+ }
return res;
+ } else if (res == -EBUSY) {
+ if (retry) {
+ retry = false;
+ goto retry;
+ }
+ action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+ return res;
+ }
+
+ head = compound_head(p);
+ lock_page(head);
+
+ if (hwpoison_filter(p)) {
+ ClearPageHWPoison(head);
+ res = -EOPNOTSUPP;
+ goto out;
}
num_poisoned_pages_inc();
- if (!(flags & MF_COUNT_INCREASED)) {
- res = get_hwpoison_page(p, flags);
- if (!res) {
- lock_page(head);
- if (hwpoison_filter(p)) {
- if (TestClearPageHWPoison(head))
- num_poisoned_pages_dec();
- unlock_page(head);
- return -EOPNOTSUPP;
- }
- unlock_page(head);
- res = MF_FAILED;
- if (__page_handle_poison(p)) {
- page_ref_inc(p);
- res = MF_RECOVERED;
- }
- action_result(pfn, MF_MSG_FREE_HUGE, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
- } else if (res < 0) {
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- return -EBUSY;
+ /*
+ * Handling free hugepage. The possible race with hugepage allocation
+ * or demotion can be prevented by PageHWPoison flag.
+ */
+ if (res == 0) {
+ unlock_page(head);
+ res = MF_FAILED;
+ if (__page_handle_poison(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
}
+ action_result(pfn, MF_MSG_FREE_HUGE, res);
+ return res == MF_RECOVERED ? 0 : -EBUSY;
}
- lock_page(head);
-
/*
* The page could have changed compound pages due to race window.
* If this happens just bail out.
@@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
page_flags = head->flags;
- if (hwpoison_filter(p)) {
- if (TestClearPageHWPoison(head))
- num_poisoned_pages_dec();
- put_page(p);
- res = -EOPNOTSUPP;
- goto out;
- }
-
/*
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
* simply disable it. In order to make it work properly, we need
@@ -1588,6 +1643,12 @@ out:
unlock_page(head);
return res;
}
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+ return 0;
+}
+#endif
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
@@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags)
int res = 0;
unsigned long page_flags;
bool retry = true;
+ int hugetlb = 0;
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags)
}
try_again:
- if (PageHuge(p)) {
- res = memory_failure_hugetlb(pfn, flags);
+ res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+ if (hugetlb)
goto unlock_mutex;
- }
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a2516d31db6c..8c74107a2b15 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1191,8 +1191,10 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
*/
static struct page *new_page(struct page *page, unsigned long start)
{
+ struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
+ gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1202,24 +1204,19 @@ static struct page *new_page(struct page *page, unsigned long start)
vma = vma->vm_next;
}
- if (PageHuge(page)) {
- return alloc_huge_page_vma(page_hstate(compound_head(page)),
+ if (folio_test_hugetlb(src))
+ return alloc_huge_page_vma(page_hstate(&src->page),
vma, address);
- } else if (PageTransHuge(page)) {
- struct page *thp;
- thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- }
+ if (folio_test_large(src))
+ gfp = GFP_TRANSHUGE;
+
/*
- * if !vma, alloc_page_vma() will use task or system default policy
+ * if !vma, vma_alloc_folio() will use task or system default policy
*/
- return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
- vma, address);
+ dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
+ folio_test_large(src));
+ return &dst->page;
}
#else
@@ -2227,6 +2224,19 @@ out:
}
EXPORT_SYMBOL(alloc_pages_vma);
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, bool hugepage)
+{
+ struct folio *folio;
+
+ folio = (struct folio *)alloc_pages_vma(gfp, order, vma, addr,
+ hugepage);
+ if (folio && order > 1)
+ prep_transhuge_page(&folio->page);
+
+ return folio;
+}
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
@@ -2733,6 +2743,7 @@ alloc_new:
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index de175e2fdba5..6c31ee1e1c9b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1520,10 +1520,11 @@ out:
struct page *alloc_migration_target(struct page *page, unsigned long private)
{
+ struct folio *folio = page_folio(page);
struct migration_target_control *mtc;
gfp_t gfp_mask;
unsigned int order = 0;
- struct page *new_page = NULL;
+ struct folio *new_folio = NULL;
int nid;
int zidx;
@@ -1531,34 +1532,31 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
gfp_mask = mtc->gfp_mask;
nid = mtc->nid;
if (nid == NUMA_NO_NODE)
- nid = page_to_nid(page);
+ nid = folio_nid(folio);
- if (PageHuge(page)) {
- struct hstate *h = page_hstate(compound_head(page));
+ if (folio_test_hugetlb(folio)) {
+ struct hstate *h = page_hstate(&folio->page);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
}
- if (PageTransHuge(page)) {
+ if (folio_test_large(folio)) {
/*
* clear __GFP_RECLAIM to make the migration callback
* consistent with regular THP allocations.
*/
gfp_mask &= ~__GFP_RECLAIM;
gfp_mask |= GFP_TRANSHUGE;
- order = HPAGE_PMD_ORDER;
+ order = folio_order(folio);
}
- zidx = zone_idx(page_zone(page));
+ zidx = zone_idx(folio_zone(folio));
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
-
- if (new_page && PageTransHuge(new_page))
- prep_transhuge_page(new_page);
+ new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
- return new_page;
+ return &new_folio->page;
}
#ifdef CONFIG_NUMA
@@ -1999,32 +1997,20 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
unsigned long data)
{
int nid = (int) data;
- struct page *newpage;
-
- newpage = __alloc_pages_node(nid,
- (GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE | __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_NOWARN) &
- ~__GFP_RECLAIM, 0);
-
- return newpage;
-}
-
-static struct page *alloc_misplaced_dst_page_thp(struct page *page,
- unsigned long data)
-{
- int nid = (int) data;
- struct page *newpage;
-
- newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!newpage)
- goto out;
-
- prep_transhuge_page(newpage);
+ int order = compound_order(page);
+ gfp_t gfp = __GFP_THISNODE;
+ struct folio *new;
+
+ if (order > 0)
+ gfp |= GFP_TRANSHUGE_LIGHT;
+ else {
+ gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN;
+ gfp &= ~__GFP_RECLAIM;
+ }
+ new = __folio_alloc_node(gfp, order, nid);
-out:
- return newpage;
+ return &new->page;
}
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
@@ -2082,23 +2068,9 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
int nr_remaining;
unsigned int nr_succeeded;
LIST_HEAD(migratepages);
- new_page_t *new;
- bool compound;
int nr_pages = thp_nr_pages(page);
/*
- * PTE mapped THP or HugeTLB page can't reach here so the page could
- * be either base page or THP. And it must be head page if it is
- * THP.
- */
- compound = PageTransHuge(page);
-
- if (compound)
- new = alloc_misplaced_dst_page_thp;
- else
- new = alloc_misplaced_dst_page;
-
- /*
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
@@ -2118,9 +2090,9 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
- MIGRATE_ASYNC, MR_NUMA_MISPLACED,
- &nr_succeeded);
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED, &nr_succeeded);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
diff --git a/mm/mmap.c b/mm/mmap.c
index 3aa839f81e63..313b57d55a63 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
return addr;
}
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr) (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 459d195d2ff6..f45ff1b7626a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+ unsigned long seq)
+{
+ bool ret;
+
+ spin_lock(&subscriptions->lock);
+ ret = subscriptions->invalidate_seq != seq;
+ spin_unlock(&subscriptions->lock);
+ return ret;
+}
+
/**
* mmu_interval_notifier_remove - Remove a interval notifier
* @interval_sub: Interval subscription to unregister
@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
if (seq)
wait_event(subscriptions->wq,
- READ_ONCE(subscriptions->invalidate_seq) != seq);
+ mmu_interval_seq_released(subscriptions, seq));
/* pairs with mmgrab in mmu_interval_notifier_insert() */
mmdrop(mm);
diff --git a/mm/mremap.c b/mm/mremap.c
index 9d76da79594d..0b93fac76851 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -486,6 +486,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
+ if (!len)
+ return 0;
+
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
@@ -944,7 +947,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return -EINTR;
vma = vma_lookup(mm, addr);
if (!vma) {
- ret = EFAULT;
+ ret = -EFAULT;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 55a9e48a7a02..9d7afc2d959e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -226,6 +226,8 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
+
/*
* vzalloc - allocate virtually contiguous memory with zero fill
*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7ec38194f8e1..49d7df39b02d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -632,7 +632,7 @@ done:
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
- /* Drop a reference taken by wake_oom_reaper */
+ /* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}
@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
+ spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irq(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
@@ -658,22 +658,48 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
{
- /* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
- return;
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
- get_task_struct(tsk);
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
- spin_lock(&oom_reaper_lock);
+ spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ return;
+
+ get_task_struct(tsk);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -681,7 +707,7 @@ static int __init oom_init(void)
}
subsys_initcall(oom_init)
#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();
if (can_oom_reap)
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
+ queue_oom_reaper(current);
return true;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2db95780e003..0e42038382c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -128,7 +128,7 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
struct pagesets {
local_lock_t lock;
};
-static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -6131,7 +6131,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
- if (managed_zone(zone)) {
+ if (populated_zone(zone)) {
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -8919,7 +8919,7 @@ void *__init alloc_large_system_hash(const char *tablename,
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags);
+ table = vmalloc_huge(size, gfp_flags);
virt = true;
if (table)
huge = is_vm_area_hugepages(table);
diff --git a/mm/page_io.c b/mm/page_io.c
index b417f000b49e..3fbdab6a940e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -51,54 +51,6 @@ void end_swap_bio_write(struct bio *bio)
bio_put(bio);
}
-static void swap_slot_free_notify(struct page *page)
-{
- struct swap_info_struct *sis;
- struct gendisk *disk;
- swp_entry_t entry;
-
- /*
- * There is no guarantee that the page is in swap cache - the software
- * suspend code (at least) uses end_swap_bio_read() against a non-
- * swapcache page. So we must check PG_swapcache before proceeding with
- * this optimization.
- */
- if (unlikely(!PageSwapCache(page)))
- return;
-
- sis = page_swap_info(page);
- if (data_race(!(sis->flags & SWP_BLKDEV)))
- return;
-
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- disk = sis->bdev->bd_disk;
- entry.val = page_private(page);
- if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
- unsigned long offset;
-
- offset = swp_offset(entry);
-
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
-}
-
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -114,7 +66,6 @@ static void end_swap_bio_read(struct bio *bio)
}
SetPageUptodate(page);
- swap_slot_free_notify(page);
out:
unlock_page(page);
WRITE_ONCE(bio->bi_private, NULL);
@@ -394,11 +345,6 @@ int swap_readpage(struct page *page, bool synchronous)
if (sis->flags & SWP_SYNCHRONOUS_IO) {
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
-
count_vm_event(PSWPIN);
goto out;
}
@@ -414,7 +360,6 @@ int swap_readpage(struct page *page, bool synchronous)
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
- bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
@@ -426,8 +371,7 @@ int swap_readpage(struct page *page, bool synchronous)
if (!READ_ONCE(bio->bi_private))
break;
- if (!bio_poll(bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..14a5cda73dee 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -163,7 +163,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return not_found(pvmw);
if (unlikely(is_vm_hugetlb_page(vma))) {
- unsigned long size = pvmw->nr_pages * PAGE_SIZE;
+ struct hstate *hstate = hstate_vma(vma);
+ unsigned long size = huge_page_size(hstate);
/* The only possible mapping was handled on last iteration */
if (pvmw->pte)
return not_found(pvmw);
@@ -173,8 +174,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm,
- pvmw->pte);
+ pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
spin_lock(pvmw->ptl);
if (!check_pte(pvmw))
return not_found(pvmw);
diff --git a/mm/readahead.c b/mm/readahead.c
index 8e3775829513..26bf74a6b2fe 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -113,6 +113,7 @@
* ->readpage() which may be less efficient.
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -474,7 +475,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- if (mark - index < (1UL << order))
+ mark = round_up(mark, 1UL << order);
+ if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
if (err)
@@ -555,8 +557,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- unsigned long index = readahead_index(ractl);
- pgoff_t prev_index;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t expected, prev_index;
+ unsigned int order = folio ? folio_order(folio) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -575,8 +578,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((index == (ra->start + ra->size - ra->async_size) ||
- index == (ra->start + ra->size))) {
+ expected = round_up(ra->start + ra->size - ra->async_size,
+ 1UL << order);
+ if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -662,7 +666,7 @@ readit:
}
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
+ page_cache_ra_order(ractl, ra, order);
}
void page_cache_sync_ra(struct readahead_control *ractl,
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 098638d3b8a4..3b3cf2892b6a 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -158,6 +158,22 @@ const struct address_space_operations secretmem_aops = {
.isolate_page = secretmem_isolate_page,
};
+static int secretmem_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = d_inode(dentry);
+ unsigned int ia_valid = iattr->ia_valid;
+
+ if ((ia_valid & ATTR_SIZE) && inode->i_size)
+ return -EINVAL;
+
+ return simple_setattr(mnt_userns, dentry, iattr);
+}
+
+static const struct inode_operations secretmem_iops = {
+ .setattr = secretmem_setattr,
+};
+
static struct vfsmount *secretmem_mnt;
static struct file *secretmem_file_create(unsigned long flags)
@@ -177,6 +193,7 @@ static struct file *secretmem_file_create(unsigned long flags)
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
+ inode->i_op = &secretmem_iops;
inode->i_mapping->a_ops = &secretmem_aops;
/* pretend we are a normal file with zero size */
diff --git a/mm/shmem.c b/mm/shmem.c
index 529c9ad3e926..4b2fea33158e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2513,7 +2513,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
pgoff_t end_index;
unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
- bool got_page;
end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
@@ -2570,24 +2569,34 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
if (!offset)
mark_page_accessed(page);
- got_page = true;
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
+ ret = copy_page_to_iter(page, offset, nr, to);
+ put_page(page);
+
+ } else if (iter_is_iovec(to)) {
+ /*
+ * Copy to user tends to be so well optimized, but
+ * clear_user() not so much, that it is noticeably
+ * faster to copy the zero page instead of clearing.
+ */
+ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
} else {
- page = ZERO_PAGE(0);
- got_page = false;
+ /*
+ * But submitting the same page twice in a row to
+ * splice() - or others? - can result in confusion:
+ * so don't attempt that optimization on pipes etc.
+ */
+ ret = iov_iter_zero(nr, to);
}
- /*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
- */
- ret = copy_page_to_iter(page, offset, nr, to);
retval += ret;
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
- if (got_page)
- put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
diff --git a/mm/slab.c b/mm/slab.c
index b04e40078bdf..0edb474edef1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3665,7 +3665,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
struct kmem_cache *cachep;
unsigned int objnr;
diff --git a/mm/slab.h b/mm/slab.h
index fd7ae2024897..95eb34174c1b 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -868,7 +868,7 @@ struct kmem_obj_info {
void *kp_stack[KS_ADDRS_COUNT];
void *kp_free_stack[KS_ADDRS_COUNT];
};
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
#endif
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6ee64d6208b3..2b3206a2c3b5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -555,6 +555,13 @@ bool kmem_valid_obj(void *object)
}
EXPORT_SYMBOL_GPL(kmem_valid_obj);
+static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+{
+ if (__kfence_obj_info(kpp, object, slab))
+ return;
+ __kmem_obj_info(kpp, object, slab);
+}
+
/**
* kmem_dump_obj - Print available slab provenance information
* @object: slab object for which to find provenance information.
@@ -590,6 +597,8 @@ void kmem_dump_obj(void *object)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
else
pr_cont(" slab%s", cp);
+ if (is_kfence_address(object))
+ pr_cont(" (kfence)");
if (kp.kp_objp)
pr_cont(" start %px", kp.kp_objp);
if (kp.kp_data_offset)
diff --git a/mm/slob.c b/mm/slob.c
index dfa6808dff36..40ea6e2d4ccd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -463,7 +463,7 @@ out:
}
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
kpp->kp_ptr = object;
kpp->kp_slab = slab;
diff --git a/mm/slub.c b/mm/slub.c
index 74d92aa4a3a2..ed5c2c03a47a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4312,7 +4312,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
}
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
+void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
void *base;
int __maybe_unused i;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63c61f8b2611..981a6e85c88e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,7 @@
* Swap reorganised 29.12.95, Stephen Tweedie
*/
+#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
@@ -179,7 +180,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
return err;
cond_resched();
@@ -190,7 +191,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
break;
@@ -254,7 +255,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
+ nr_blocks, GFP_NOIO))
break;
se = next_se(se);
@@ -2466,7 +2467,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ if (!p->bdev || !bdev_nonrot(p->bdev))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -2761,7 +2762,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
+ if (bdev_is_zoned(p->bdev))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
@@ -2957,20 +2958,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
return nr_extents;
}
-/*
- * Helper to sys_swapon determining if a given swap
- * backing device queue supports DISCARD operations.
- */
-static bool swap_discardable(struct swap_info_struct *si)
-{
- struct request_queue *q = bdev_get_queue(si->bdev);
-
- if (!blk_queue_discard(q))
- return false;
-
- return true;
-}
-
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
@@ -3065,13 +3052,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
+ if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;
if (p->bdev && p->bdev->bd_disk->fops->rw_page)
p->flags |= SWP_SYNCHRONOUS_IO;
- if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ if (p->bdev && bdev_nonrot(p->bdev)) {
int cpu;
unsigned long ci, nr_cluster;
@@ -3132,7 +3119,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if ((swap_flags & SWAP_FLAG_DISCARD) &&
+ p->bdev && bdev_max_discard_sectors(p->bdev)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0cb8e5ef1713..e9bb6db002aa 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
- if (writable) {
- if (wp_copy)
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- else
- _dst_pte = pte_mkwrite(_dst_pte);
- }
+
+ /*
+ * Always mark a PTE as write-protected when needed, regardless of
+ * VM_WRITE, which the user might change.
+ */
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else if (writable)
+ _dst_pte = pte_mkwrite(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
diff --git a/mm/util.c b/mm/util.c
index 54e5e761a9a9..3492a9e81aa3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -592,8 +592,15 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
return NULL;
}
- return __vmalloc_node(size, 1, flags, node,
- __builtin_return_address(0));
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e163372d3967..cadfbb5155ea 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1671,17 +1671,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-#ifdef CONFIG_X86_64
-/*
- * called before a call to iounmap() if the caller wants vm_area_struct's
- * immediately freed.
- */
-void set_iounmap_nonlazy(void)
-{
- atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
-}
-#endif /* CONFIG_X86_64 */
-
/*
* Purges all lazily-freed vmap areas.
*/
@@ -2664,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
- unsigned int page_order = vm_area_page_order(area);
- int i, step = 1U << page_order;
+ int i;
- for (i = 0; i < area->nr_pages; i += step) {
+ for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
- __free_pages(page, page_order);
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_pages(page, 0);
cond_resched();
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2925,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
- } else
- /*
- * Compound pages required for remap_vmalloc_page if
- * high-order pages.
- */
- gfp |= __GFP_COMP;
+ }
/* High-order pages or fallback path if "bulk" fails. */
@@ -2944,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
+ /*
+ * Higher order allocations must be able to be treated as
+ * indepdenent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (order)
+ split_page(page, order);
/*
* Careful, we allocate and map page-order pages, but
@@ -3003,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
- int i, step = 1U << page_order;
+ int i;
- for (i = 0; i < area->nr_pages; i += step)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
- step);
+ for (i = 0; i < area->nr_pages; i++)
+ mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
}
/*
@@ -3106,7 +3101,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
}
- if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
+ if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
unsigned long size_per_node;
/*
@@ -3273,21 +3268,24 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
- * vmalloc_no_huge - allocate virtually contiguous memory using small pages
- * @size: allocation size
+ * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
*
- * Allocate enough non-huge pages to cover @size from the page level
+ * Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_no_huge(unsigned long size)
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
{
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
NUMA_NO_NODE, __builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_no_huge);
+EXPORT_SYMBOL_GPL(vmalloc_huge);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a3828acc0bf..592569a8974c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow)
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats_delayed();
/*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if