summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/Makefile3
-rw-r--r--mm/bootmem_info.c25
-rw-r--r--mm/compaction.c11
-rw-r--r--mm/damon/core.c655
-rw-r--r--mm/damon/lru_sort.c59
-rw-r--r--mm/damon/ops-common.c9
-rw-r--r--mm/damon/paddr.c84
-rw-r--r--mm/damon/reclaim.c98
-rw-r--r--mm/damon/stat.c92
-rw-r--r--mm/damon/sysfs-common.c41
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c289
-rw-r--r--mm/damon/sysfs.c588
-rw-r--r--mm/damon/tests/core-kunit.h180
-rw-r--r--mm/damon/tests/vaddr-kunit.h27
-rw-r--r--mm/damon/vaddr.c79
-rw-r--r--mm/filemap.c133
-rw-r--r--mm/gup.c8
-rw-r--r--mm/huge_memory.c479
-rw-r--r--mm/hugetlb.c7
-rw-r--r--mm/hugetlb_cma.c35
-rw-r--r--mm/internal.h27
-rw-r--r--mm/kasan/kasan_test_c.c10
-rw-r--r--mm/kfence/kfence_test.c2
-rw-r--r--mm/khugepaged.c21
-rw-r--r--mm/kmemleak.c148
-rw-r--r--mm/list_lru.c238
-rw-r--r--mm/madvise.c60
-rw-r--r--mm/memcontrol-v1.c68
-rw-r--r--mm/memcontrol.c294
-rw-r--r--mm/memory-failure.c47
-rw-r--r--mm/memory.c206
-rw-r--r--mm/memory_hotplug.c34
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/memremap.c4
-rw-r--r--mm/migrate.c63
-rw-r--r--mm/migrate_device.c9
-rw-r--r--mm/mm_init.c73
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/mseal.c5
-rw-r--r--mm/page_alloc.c383
-rw-r--r--mm/page_io.c103
-rw-r--r--mm/page_isolation.c67
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/page_vma_mapped.c9
-rw-r--r--mm/percpu-internal.h6
-rw-r--r--mm/readahead.c36
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/shmem.c278
-rw-r--r--mm/shrinker.c18
-rw-r--r--mm/sparse-vmemmap.c82
-rw-r--r--mm/sparse.c68
-rw-r--r--mm/swap.c57
-rw-r--r--mm/swap.h90
-rw-r--r--mm/swap_cgroup.c172
-rw-r--r--mm/swap_state.c533
-rw-r--r--mm/swap_table.h179
-rw-r--r--mm/swapfile.c460
-rw-r--r--mm/userfaultfd.c2259
-rw-r--r--mm/util.c32
-rw-r--r--mm/vma.c3
-rw-r--r--mm/vmalloc.c130
-rw-r--r--mm/vmpressure.c15
-rw-r--r--mm/vmscan.c370
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zswap.c25
67 files changed, 7068 insertions, 2551 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 4f187b07eb48..fe734d9bbe99 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -590,7 +590,7 @@ endchoice
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
+ select HAVE_BOOTMEM_INFO_NODE if X86_64
depends on MEMORY_HOTPLUG
select MIGRATION
@@ -863,7 +863,6 @@ if TRANSPARENT_HUGEPAGE
choice
prompt "Transparent Hugepage Support sysfs defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_ALWAYS
help
Selects the sysfs defaults for Transparent Hugepage Support.
@@ -893,7 +892,6 @@ endchoice
choice
prompt "Shmem hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -939,7 +937,6 @@ endchoice
choice
prompt "Tmpfs hugepage allocation defaults"
- depends on TRANSPARENT_HUGEPAGE
default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER
help
Selects the hugepage allocation policy defaults for
@@ -984,7 +981,7 @@ endchoice
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
+ depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now, swap cluster backing transparent huge page
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..eff9f9e7e061 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,9 +103,6 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_LIVEUPDATE_MEMFD) += memfd_luo.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
-ifdef CONFIG_SWAP
-obj-$(CONFIG_MEMCG) += swap_cgroup.o
-endif
ifdef CONFIG_BPF_SYSCALL
obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
endif
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index 3d7675a3ae04..0fa78db7fbc0 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -19,7 +19,6 @@ void get_page_bootmem(unsigned long info, struct page *page,
{
BUG_ON(type > 0xf);
BUG_ON(info > (ULONG_MAX >> 4));
- SetPagePrivate(page);
set_page_private(page, info << 4 | type);
page_ref_inc(page);
}
@@ -32,20 +31,15 @@ void put_page_bootmem(struct page *page)
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
- ClearPagePrivate(page);
set_page_private(page, 0);
- INIT_LIST_HEAD(&page->lru);
- kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
free_reserved_page(page);
}
}
static void __init register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long mapsize, section_nr, i;
+ unsigned long section_nr;
struct mem_section *ms;
- struct mem_section_usage *usage;
- struct page *page;
start_pfn = SECTION_ALIGN_DOWN(start_pfn);
section_nr = pfn_to_section_nr(start_pfn);
@@ -54,27 +48,12 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn)
if (!preinited_vmemmap_section(ms))
register_page_bootmem_memmap(section_nr, pfn_to_page(start_pfn),
PAGES_PER_SECTION);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
- unsigned long i, pfn, end_pfn, nr_pages;
+ unsigned long pfn, end_pfn;
int node = pgdat->node_id;
- struct page *page;
-
- nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
- page = virt_to_page(pgdat);
-
- for (i = 0; i < nr_pages; i++, page++)
- get_page_bootmem(node, page, NODE_INFO);
pfn = pgdat->node_start_pfn;
end_pfn = pgdat_end_pfn(pgdat);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3648ce22c807..b776f35ad020 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1123,7 +1123,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* To minimise LRU disruption, the caller can indicate with
* ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
* it will be able to migrate without blocking - clean pages
- * for the most part. PageWriteback would require blocking.
+ * for the most part. Writeback would require blocking.
*/
if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
@@ -2340,7 +2340,8 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype, true) >= 0)
+ if (find_suitable_fallback(area, order, migratetype, true, NULL)
+ == FALLBACK_FOUND)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
@@ -2447,7 +2448,7 @@ bool compaction_suitable(struct zone *zone, int order, unsigned long watermark,
/* Used by direct reclaimers */
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
- int alloc_flags)
+ int alloc_flags, gfp_t gfp_mask)
{
struct zone *zone;
struct zoneref *z;
@@ -2460,6 +2461,10 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
+ if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+
/*
* Do not consider all the reclaimable memory because we do not
* want to trash just for a single high order allocation which
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3dbbbfdeff71..265d51ade25b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -13,10 +13,14 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/psi.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_choices.h>
+/* for damon_get_folio() used by node eligible memory metrics */
+#include "ops-common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -109,6 +113,103 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id)
return err;
}
+/*
+ * Allocate a filter and set its type, matching and allow fields from the
+ * arguments.  The filter's list head is initialized but the filter is not
+ * linked to any probe.  Returns NULL if the allocation fails.
+ */
+struct damon_filter *damon_new_filter(enum damon_filter_type type,
+		bool matching, bool allow)
+{
+	struct damon_filter *f;
+
+	f = kmalloc_obj(*f);
+	if (f) {
+		f->type = type;
+		f->matching = matching;
+		f->allow = allow;
+		INIT_LIST_HEAD(&f->list);
+	}
+	return f;
+}
+
+/* Link filter @f to the tail of the filters list of probe @p. */
+void damon_add_filter(struct damon_probe *p, struct damon_filter *f)
+{
+	list_add_tail(&f->list, &p->filters);
+}
+
+/* Unlink filter @f from the list it is on.  @f itself is not freed. */
+static void damon_del_filter(struct damon_filter *f)
+{
+	list_del(&f->list);
+}
+
+/* Release the memory of filter @f.  @f is not unlinked here. */
+static void damon_free_filter(struct damon_filter *f)
+{
+	kfree(f);
+}
+
+/* Unlink filter @f from its list, then free it. */
+void damon_destroy_filter(struct damon_filter *f)
+{
+	damon_del_filter(f);
+	damon_free_filter(f);
+}
+
+/* Return the @n-th (zero-based) filter of probe @p, or NULL if none. */
+static struct damon_filter *damon_nth_filter(int n, struct damon_probe *p)
+{
+	struct damon_filter *filter;
+	int idx = 0;
+
+	damon_for_each_filter(filter, p) {
+		if (idx == n)
+			return filter;
+		idx++;
+	}
+	return NULL;
+}
+
+/*
+ * Allocate a probe with an empty filters list.  The probe is not linked to
+ * any context.  Returns NULL if the allocation fails.
+ */
+struct damon_probe *damon_new_probe(void)
+{
+	struct damon_probe *probe;
+
+	probe = kmalloc_obj(*probe);
+	if (probe) {
+		INIT_LIST_HEAD(&probe->filters);
+		INIT_LIST_HEAD(&probe->list);
+	}
+	return probe;
+}
+
+/* Link @probe to the tail of the probes list of context @ctx. */
+void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe)
+{
+	list_add_tail(&probe->list, &ctx->probes);
+}
+
+/* Unlink probe @p from the list it is on.  @p itself is not freed. */
+static void damon_del_probe(struct damon_probe *p)
+{
+	list_del(&p->list);
+}
+
+/*
+ * Free probe @p together with every filter on its filters list.  @p is not
+ * unlinked from the list it may be on.
+ */
+static void damon_free_probe(struct damon_probe *p)
+{
+	struct damon_filter *filter, *tmp;
+
+	/*
+	 * The filters are freed without being unlinked; the list head they
+	 * hang off is released together with @p right below.
+	 */
+	damon_for_each_filter_safe(filter, tmp, p) {
+		damon_free_filter(filter);
+	}
+	kfree(p);
+}
+
+/* Unlink probe @p from its list, then free it and its filters. */
+static void damon_destroy_probe(struct damon_probe *p)
+{
+	damon_del_probe(p);
+	damon_free_probe(p);
+}
+
+/* Return the @n-th (zero-based) probe of context @ctx, or NULL if none. */
+static struct damon_probe *damon_nth_probe(int n, struct damon_ctx *ctx)
+{
+	struct damon_probe *probe;
+	int idx = 0;
+
+	damon_for_each_probe(probe, ctx) {
+		if (idx == n)
+			return probe;
+		idx++;
+	}
+	return NULL;
+}
+
#ifdef CONFIG_DAMON_DEBUG_SANITY
static void damon_verify_new_region(unsigned long start, unsigned long end)
{
@@ -128,6 +229,7 @@ static void damon_verify_new_region(unsigned long start, unsigned long end)
struct damon_region *damon_new_region(unsigned long start, unsigned long end)
{
struct damon_region *region;
+ int i;
damon_verify_new_region(start, end);
region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL);
@@ -138,6 +240,8 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
region->ar.end = end;
region->nr_accesses = 0;
region->nr_accesses_bp = 0;
+ for (i = 0; i < DAMON_MAX_PROBES; i++)
+ region->probe_hits[i] = 0;
INIT_LIST_HEAD(&region->list);
region->age = 0;
@@ -146,12 +250,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
return region;
}
-void damon_add_region(struct damon_region *r, struct damon_target *t)
+static void damon_add_region(struct damon_region *r, struct damon_target *t)
{
list_add_tail(&r->list, &t->regions_list);
t->nr_regions++;
}
+/*
+ * Add region @r between regions @prev and @next, and account it in the
+ * region count of target @t.
+ *
+ * NOTE(review): @prev and @next are presumably expected to be neighbouring
+ * entries of @t's regions list; confirm against the callers.
+ */
+static inline void damon_insert_region(struct damon_region *r,
+ struct damon_region *prev, struct damon_region *next,
+ struct damon_target *t)
+{
+ __list_add(&r->list, &prev->list, &next->list);
+ t->nr_regions++;
+}
+
#ifdef CONFIG_DAMON_DEBUG_SANITY
static void damon_verify_del_region(struct damon_target *t)
{
@@ -176,7 +291,8 @@ static void damon_free_region(struct damon_region *r)
kmem_cache_free(damon_region_cache, r);
}
-void damon_destroy_region(struct damon_region *r, struct damon_target *t)
+static void damon_destroy_region(struct damon_region *r,
+ struct damon_target *t)
{
damon_del_region(r, t);
damon_free_region(r);
@@ -252,11 +368,25 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
damon_destroy_region(r, t);
}
+ if (!damon_nr_regions(t)) {
+ for (i = 0; i < nr_ranges; i++) {
+ r = damon_new_region(
+ ALIGN_DOWN(ranges[i].start,
+ min_region_sz),
+ ALIGN(ranges[i].end, min_region_sz));
+ if (!r)
+ return -ENOMEM;
+ damon_add_region(r, t);
+ }
+ return 0;
+ }
+
r = damon_first_region(t);
/* Add new regions or resize existing regions to fit in the ranges */
for (i = 0; i < nr_ranges; i++) {
struct damon_region *first = NULL, *last, *newr;
struct damon_addr_range *range;
+ bool insert_before_r = false;
range = &ranges[i];
/* Get the first/last regions intersecting with the range */
@@ -266,8 +396,10 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
first = r;
last = r;
}
- if (r->ar.start >= range->end)
+ if (r->ar.start >= range->end) {
+ insert_before_r = true;
break;
+ }
}
if (!first) {
/* no region intersects with this range */
@@ -277,7 +409,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
ALIGN(range->end, min_region_sz));
if (!newr)
return -ENOMEM;
- damon_insert_region(newr, damon_prev_region(r), r, t);
+ if (insert_before_r)
+ damon_insert_region(newr, damon_prev_region(r),
+ r, t);
+ else
+ damon_add_region(newr, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
@@ -550,27 +686,8 @@ void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx)
damon_free_target(t);
}
-#ifdef CONFIG_DAMON_DEBUG_SANITY
-static void damon_verify_nr_regions(struct damon_target *t)
-{
- struct damon_region *r;
- unsigned int count = 0;
-
- damon_for_each_region(r, t)
- count++;
- WARN_ONCE(count != t->nr_regions, "t->nr_regions (%u) != count (%u)\n",
- t->nr_regions, count);
-}
-#else
-static void damon_verify_nr_regions(struct damon_target *t)
-{
-}
-#endif
-
unsigned int damon_nr_regions(struct damon_target *t)
{
- damon_verify_nr_regions(t);
-
return t->nr_regions;
}
@@ -601,12 +718,16 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
+ INIT_LIST_HEAD(&ctx->probes);
+
ctx->addr_unit = 1;
ctx->min_region_sz = DAMON_MIN_REGION_SZ;
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
+ prandom_seed_state(&ctx->rnd_state, get_random_u64());
+
return ctx;
}
@@ -621,12 +742,16 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
void damon_destroy_ctx(struct damon_ctx *ctx)
{
struct damos *s, *next_s;
+ struct damon_probe *p, *next_p;
damon_destroy_targets(ctx);
damon_for_each_scheme_safe(s, next_s, ctx)
damon_destroy_scheme(s);
+ damon_for_each_probe_safe(p, next_p, ctx)
+ damon_destroy_probe(p);
+
kfree(ctx);
}
@@ -797,6 +922,9 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
attrs->aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->passed_sample_intervals +
attrs->ops_update_interval / sample_interval;
+ /*
+ * next_intervals_tune_sis will be updated inside kdamond_fn().
+ */
damon_update_monitoring_results(ctx, attrs, aggregating);
ctx->attrs = *attrs;
@@ -918,6 +1046,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src)
if (err)
return err;
dst->goal_tuner = src->goal_tuner;
+ dst->fail_charge_num = src->fail_charge_num;
+ dst->fail_charge_denom = src->fail_charge_denom;
dst->weight_sz = src->weight_sz;
dst->weight_nr_accesses = src->weight_nr_accesses;
dst->weight_age = src->weight_age;
@@ -1310,6 +1440,86 @@ static int damon_commit_targets(
return 0;
}
+/*
+ * Copy the parameters of filter @src into filter @dst.  The memcg id is
+ * copied only for DAMON_FILTER_TYPE_MEMCG filters.
+ */
+static void damon_commit_filter(struct damon_filter *dst,
+		struct damon_filter *src)
+{
+	dst->type = src->type;
+	dst->matching = src->matching;
+	dst->allow = src->allow;
+
+	if (dst->type == DAMON_FILTER_TYPE_MEMCG)
+		dst->memcg_id = src->memcg_id;
+}
+
+/*
+ * Make the filters of probe @dst match those of probe @src, position by
+ * position.
+ *
+ * Each filter that @dst already has is overwritten with the @src filter at
+ * the same index, or destroyed if @src has no filter at that index.  @src
+ * filters beyond the original number of @dst filters are copied into newly
+ * allocated filters that are appended to @dst.
+ *
+ * Returns 0 on success, or -ENOMEM if allocating a new filter fails.  On
+ * failure, the filters committed so far stay in @dst.
+ */
+static int damon_commit_filters(struct damon_probe *dst,
+		struct damon_probe *src)
+{
+	struct damon_filter *dst_filter, *tmp, *src_filter, *copy;
+	int nr_dst = 0, src_idx = 0;
+
+	/* Update or drop the filters that @dst already has. */
+	damon_for_each_filter_safe(dst_filter, tmp, dst) {
+		src_filter = damon_nth_filter(nr_dst, src);
+		nr_dst++;
+		if (!src_filter) {
+			damon_destroy_filter(dst_filter);
+			continue;
+		}
+		damon_commit_filter(dst_filter, src_filter);
+	}
+
+	/* Append copies of the @src filters that @dst had no slot for. */
+	damon_for_each_filter(src_filter, src) {
+		if (src_idx++ < nr_dst)
+			continue;
+
+		copy = damon_new_filter(src_filter->type,
+				src_filter->matching, src_filter->allow);
+		if (!copy)
+			return -ENOMEM;
+		damon_commit_filter(copy, src_filter);
+		damon_add_filter(dst, copy);
+	}
+	return 0;
+}
+
+/*
+ * Make the probes of context @dst match those of context @src, position by
+ * position.
+ *
+ * Each probe that @dst already has receives the filters of the @src probe at
+ * the same index, or is destroyed if @src has no probe at that index.  For
+ * @src probes beyond the original number of @dst probes, new probes are
+ * allocated, appended to @dst and filled with the @src probe's filters.
+ *
+ * Returns 0 on success, -ENOMEM if allocating a probe fails, or the error
+ * returned by damon_commit_filters().
+ */
+static int damon_commit_probes(struct damon_ctx *dst, struct damon_ctx *src)
+{
+	struct damon_probe *dst_probe, *tmp, *src_probe, *fresh;
+	int nr_dst = 0, src_idx = 0, err;
+
+	/* Update or drop the probes that @dst already has. */
+	damon_for_each_probe_safe(dst_probe, tmp, dst) {
+		src_probe = damon_nth_probe(nr_dst++, src);
+		if (!src_probe) {
+			damon_destroy_probe(dst_probe);
+			continue;
+		}
+		err = damon_commit_filters(dst_probe, src_probe);
+		if (err)
+			return err;
+	}
+
+	/* Append probes for the @src probes that @dst had no slot for. */
+	damon_for_each_probe(src_probe, src) {
+		if (src_idx++ < nr_dst)
+			continue;
+
+		fresh = damon_new_probe();
+		if (!fresh)
+			return -ENOMEM;
+		/*
+		 * Link the probe before filling it, so that it stays
+		 * reachable from @dst if committing its filters fails.
+		 */
+		damon_add_probe(dst, fresh);
+		err = damon_commit_filters(fresh, src_probe);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
/**
* damon_commit_ctx() - Commit parameters of a DAMON context to another.
* @dst: The commit destination DAMON context.
@@ -1326,11 +1536,26 @@ static int damon_commit_targets(
int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
{
int err;
+ struct damos *scheme;
+ struct damos_quota_goal *goal;
dst->maybe_corrupted = true;
if (!is_power_of_2(src->min_region_sz))
return -EINVAL;
+ /* node_eligible_mem_bp metric requires PADDR ops */
+ if (src->ops.id != DAMON_OPS_PADDR) {
+ damon_for_each_scheme(scheme, src) {
+ struct damos_quota *quota = &scheme->quota;
+
+ damos_for_each_quota_goal(goal, quota) {
+ if (goal->metric ==
+ DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP)
+ return -EINVAL;
+ }
+ }
+ }
+
err = damon_commit_schemes(dst, src);
if (err)
return err;
@@ -1349,7 +1574,11 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
if (err)
return err;
}
+ dst->pause = src->pause;
dst->ops = src->ops;
+ err = damon_commit_probes(dst, src);
+ if (err)
+ return err;
dst->addr_unit = src->addr_unit;
dst->min_region_sz = src->min_region_sz;
@@ -1706,15 +1935,28 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
{
struct damon_target *t;
unsigned int ti = 0; /* target's index */
+ unsigned int nr_probes = 0;
+ struct damon_probe *probe;
+
+ if (trace_damon_region_aggregated_enabled()) {
+ damon_for_each_probe(probe, c)
+ nr_probes++;
+ }
damon_for_each_target(t, c) {
struct damon_region *r;
damon_for_each_region(r, t) {
+ int i;
+
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ trace_damon_region_aggregated(ti, r,
+ damon_nr_regions(t), nr_probes);
damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
+ for (i = 0; i < DAMON_MAX_PROBES; i++)
+ r->probe_hits[i] = 0;
damon_verify_reset_aggregated(r, c);
}
ti++;
@@ -2046,6 +2288,37 @@ static void damos_walk_cancel(struct damon_ctx *ctx)
mutex_unlock(&ctx->walk_control_lock);
}
+/*
+ * Charge @quota for one scheme application on a region of @sz_region bytes,
+ * of which @sz_applied bytes were reported as applied.
+ *
+ * If no fail-charge denominator is set, the whole region size is charged.
+ * Otherwise the applied part is charged in full and the remaining part is
+ * charged at the fail_charge_num / fail_charge_denom ratio.
+ */
+static void damos_charge_quota(struct damos_quota *quota,
+		unsigned long sz_region, unsigned long sz_applied)
+{
+	unsigned long sz_failed;
+
+	/*
+	 * sz_applied could be bigger than sz_region, depending on the ops
+	 * implementation of the action, e.g., damos_pa_pageout().  Charge
+	 * only the region size in that case.
+	 */
+	if (!quota->fail_charge_denom || sz_applied > sz_region) {
+		quota->charged_sz += sz_region;
+		return;
+	}
+
+	sz_failed = sz_region - sz_applied;
+	quota->charged_sz += sz_applied + mult_frac(sz_failed,
+			quota->fail_charge_num, quota->fail_charge_denom);
+}
+
+/*
+ * Return whether @quota has no usable room left.
+ *
+ * A quota that is not set is never full.  A set quota is full once the
+ * charged size reaches the effective size, or once less than @min_region_sz
+ * remains: DAMOS actions are applied per region, so a remainder smaller than
+ * that cannot be used.
+ */
+static bool damos_quota_is_full(struct damos_quota *quota,
+		unsigned long min_region_sz)
+{
+	if (!damos_quota_is_set(quota))
+		return false;
+
+	/* The subtraction is evaluated only when charged_sz < esz. */
+	return quota->charged_sz >= quota->esz ||
+		quota->esz - quota->charged_sz < min_region_sz;
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -2102,11 +2375,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
- quota->charged_sz += sz;
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz) {
+ damos_charge_quota(quota, sz, sz_applied);
+ if (damos_quota_is_full(quota, c->min_region_sz)) {
quota->charge_target_from = t;
- quota->charge_addr_from = r->ar.end + 1;
+ quota->charge_addr_from = r->ar.end;
}
}
if (s->action != DAMOS_STAT)
@@ -2132,8 +2404,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
continue;
/* Check the quota */
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
continue;
if (damos_skip_charged_region(t, r, s, c->min_region_sz))
@@ -2152,6 +2423,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
}
/*
+ * damos_apply_target() - Apply DAMOS schemes to a given target.
+ * @c: monitoring context to apply its DAMOS schemes to.
+ * @t: monitoring target to apply the schemes to.
+ * @max_region_sz: maximum region size for @c.
+ *
+ * This function could split regions for keeping the quota. To minimize the
+ * overhead from the increased number of regions that the split operations
+ * cause, this function will also merge regions after the schemes applying
+ * attempt is done, for each region. The merge operation is made only when it
+ * doesn't lose the monitoring information and doesn't violate @max_region_sz.
+ *
+ * Hence, after this function is called, the total number of regions could
+ * be increased or reduced. The increase could make max_nr_regions temporarily
+ * be violated, until the next per-aggregation interval regions merge operation
+ * is executed. The decrease will not violate min_nr_regions though, since it
+ * keeps @max_region_sz.
+ */
+static void damos_apply_target(struct damon_ctx *c, struct damon_target *t,
+ unsigned long max_region_sz)
+{
+ struct damon_region *r;
+
+ damon_for_each_region(r, t) {
+ struct damon_region *prev_r;
+
+ damon_do_apply_schemes(c, t, r);
+ /*
+ * damon_do_apply_schemes() could split the region for the
+ * quota. Keeping the new slices is an overhead. Merge this
+ * region back into the previous region if the two are
+ * adjacent, have the same age, last_nr_accesses and
+ * nr_accesses, and the merged size doesn't exceed
+ * max_region_sz. On a merge, this region is destroyed and
+ * the loop cursor is moved back to the previous region, so
+ * that the iteration continues after the merged region.
+ *
+ * NOTE(review): probe_hits[] and nr_accesses_bp are not
+ * compared here; confirm that regions merged this way cannot
+ * differ in those.
+ */
+ if (damon_first_region(t) == r)
+ continue;
+ prev_r = damon_prev_region(r);
+ if (prev_r->ar.end != r->ar.start)
+ continue;
+ if (prev_r->age != r->age)
+ continue;
+ if (prev_r->last_nr_accesses != r->last_nr_accesses)
+ continue;
+ if (prev_r->nr_accesses != r->nr_accesses)
+ continue;
+ if (r->ar.end - prev_r->ar.start > max_region_sz)
+ continue;
+ prev_r->ar.end = r->ar.end;
+ damon_destroy_region(r, t);
+ r = prev_r;
+ }
+}
+
+/*
* damon_feed_loop_next_input() - get next input to achieve a target score.
* @last_input The last input.
* @score Current score that made with @last_input.
@@ -2287,7 +2610,115 @@ static unsigned long damos_get_node_memcg_used_bp(
numerator = i.totalram - used_pages;
return mult_frac(numerator, 10000, i.totalram);
}
-#else
+
+#ifdef CONFIG_DAMON_PADDR
+/*
+ * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node.
+ * @c: The DAMON context.
+ * @s: The scheme.
+ * @nid: The target NUMA node id.
+ * @total: Output for total eligible bytes across all nodes.
+ *
+ * Iterates through each folio in eligible regions to accurately determine
+ * which node the memory resides on. Returns eligible bytes on the specified
+ * node and sets *total to the sum across all nodes.
+ *
+ * Regions for which __damos_valid_target() returns false are skipped. Within
+ * a region, an address for which damon_get_folio() returns NULL is skipped by
+ * moving to the next page boundary; if that step wraps the address to zero,
+ * the walk of the region stops. cond_resched() is called after each region.
+ *
+ * Note: This function requires damon_get_folio() from ops-common.c, which is
+ * only available when CONFIG_DAMON_PADDR is enabled. It also requires the
+ * context to be using PADDR operations for meaningful results.
+ */
+static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c,
+ struct damos *s, int nid, phys_addr_t *total)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ phys_addr_t total_eligible = 0;
+ phys_addr_t node_eligible = 0;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ phys_addr_t addr, end_addr;
+
+ if (!__damos_valid_target(r, s))
+ continue;
+
+ /* Convert from core address units to physical bytes */
+ addr = (phys_addr_t)r->ar.start * c->addr_unit;
+ end_addr = (phys_addr_t)r->ar.end * c->addr_unit;
+ while (addr < end_addr) {
+ struct folio *folio;
+ phys_addr_t folio_start, folio_end;
+ phys_addr_t overlap_start, overlap_end;
+ phys_addr_t counted;
+
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (!folio) {
+ addr = PAGE_ALIGN_DOWN(addr +
+ PAGE_SIZE);
+ if (!addr)
+ break;
+ continue;
+ }
+
+ /*
+ * Calculate exact overlap between the region
+ * [addr, end_addr) and the folio range.
+ * The folio may start before addr if addr is
+ * in the middle of a large folio.
+ */
+ folio_start = PFN_PHYS(folio_pfn(folio));
+ folio_end = folio_start + folio_size(folio);
+
+ overlap_start = max(addr, folio_start);
+ overlap_end = min(end_addr, folio_end);
+
+ if (overlap_end > overlap_start) {
+ counted = overlap_end - overlap_start;
+ total_eligible += counted;
+ if (folio_nid(folio) == nid)
+ node_eligible += counted;
+ }
+
+ /*
+ * Advance past the entire folio, then drop the
+ * folio that damon_get_folio() returned.
+ */
+ addr = folio_end;
+ folio_put(folio);
+ }
+ cond_resched();
+ }
+ }
+
+ *total = total_eligible;
+ return node_eligible;
+}
+
+/*
+ * Return the share of scheme @s's eligible memory that resides on node @nid,
+ * in basis points (1/10,000) of the eligible memory found on all nodes.
+ *
+ * Returns 0 if @c does not use the DAMON_OPS_PADDR operations set, if @nid is
+ * out of range or not online, or if the total eligible size is zero.
+ *
+ * NOTE(review): both byte counts are narrowed to unsigned long before the
+ * ratio is computed; confirm that this cannot truncate where phys_addr_t is
+ * wider than unsigned long.
+ */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+		struct damos *s, int nid)
+{
+	phys_addr_t all_nodes = 0;
+	phys_addr_t this_node;
+
+	if (c->ops.id != DAMON_OPS_PADDR ||
+	    nid < 0 || nid >= MAX_NUMNODES || !node_online(nid))
+		return 0;
+
+	this_node = damos_calc_eligible_bytes(c, s, nid, &all_nodes);
+	if ((unsigned long)all_nodes == 0)
+		return 0;
+
+	return mult_frac((unsigned long)this_node, 10000,
+			(unsigned long)all_nodes);
+}
+#else /* CONFIG_DAMON_PADDR */
+/* Stub for builds without CONFIG_DAMON_PADDR: always reports 0. */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+		struct damos *s, int nid)
+{
+	return 0;
+}
+#endif /* CONFIG_DAMON_PADDR */
+#else /* CONFIG_NUMA */
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
@@ -2299,7 +2730,13 @@ static unsigned long damos_get_node_memcg_used_bp(
{
return 0;
}
-#endif
+
+/* Stub for builds without CONFIG_NUMA: always reports 0. */
+static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c,
+		struct damos *s, int nid)
+{
+	return 0;
+}
+#endif /* CONFIG_NUMA */
/*
* Returns LRU-active or inactive memory to total LRU memory size ratio.
@@ -2319,7 +2756,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio)
return mult_frac(inactive, 10000, total);
}
-static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
+static void damos_set_quota_goal_current_value(struct damon_ctx *c,
+ struct damos *s, struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -2345,19 +2783,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = damos_get_in_active_mem_bp(
goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP);
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->current_value = damos_get_node_eligible_mem_bp(c, s,
+ goal->nid);
+ break;
default:
break;
}
}
/* Return the highest score since it makes schemes least aggressive */
-static unsigned long damos_quota_score(struct damos_quota *quota)
+static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s)
{
struct damos_quota_goal *goal;
+ struct damos_quota *quota = &s->quota;
unsigned long highest_score = 0;
damos_for_each_quota_goal(goal, quota) {
- damos_set_quota_goal_current_value(goal);
+ damos_set_quota_goal_current_value(c, s, goal);
highest_score = max(highest_score,
mult_frac(goal->current_value, 10000,
goal->target_value));
@@ -2366,17 +2809,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
return highest_score;
}
-static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
quota->esz_bp = damon_feed_loop_next_input(
max(quota->esz_bp, 10000UL), score);
}
-static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
+static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c,
+ struct damos *s)
{
- unsigned long score = damos_quota_score(quota);
+ struct damos_quota *quota = &s->quota;
+ unsigned long score = damos_quota_score(c, s);
if (score >= 10000)
quota->esz_bp = 0;
@@ -2389,9 +2835,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
/*
* Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
*/
-static void damos_set_effective_quota(struct damos_quota *quota,
- struct damon_ctx *ctx)
+static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
unsigned long throughput;
unsigned long esz = ULONG_MAX;
@@ -2402,9 +2848,9 @@ static void damos_set_effective_quota(struct damos_quota *quota,
if (!list_empty(&quota->goals)) {
if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST)
- damos_goal_tune_esz_bp_consist(quota);
+ damos_goal_tune_esz_bp_consist(ctx, s);
else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL)
- damos_goal_tune_esz_bp_temporal(quota);
+ damos_goal_tune_esz_bp_temporal(ctx, s);
esz = quota->esz_bp / 10000;
}
@@ -2452,22 +2898,23 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* First charge window */
if (!quota->total_charged_sz && !quota->charged_from) {
quota->charged_from = jiffies;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
+ if (trace_damos_esz_enabled())
+ damos_trace_esz(c, s, quota);
}
/* New charge window starts */
if (!time_in_range_open(jiffies, quota->charged_from,
quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
- if (damos_quota_is_set(quota) &&
- quota->charged_sz >= quota->esz)
+ if (damos_quota_is_full(quota, c->min_region_sz))
s->stat.qt_exceeds++;
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
if (trace_damos_esz_enabled())
cached_esz = quota->esz;
- damos_set_effective_quota(quota, c);
+ damos_set_effective_quota(c, s);
if (trace_damos_esz_enabled() && quota->esz != cached_esz)
damos_trace_esz(c, s, quota);
}
@@ -2521,9 +2968,9 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s)
static void kdamond_apply_schemes(struct damon_ctx *c)
{
struct damon_target *t;
- struct damon_region *r;
struct damos *s;
bool has_schemes_to_apply = false;
+ unsigned long max_region_sz;
damon_for_each_scheme(s, c) {
if (time_before(c->passed_sample_intervals, s->next_apply_sis))
@@ -2540,13 +2987,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ max_region_sz = damon_region_sz_limit(c);
mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
if (c->ops.target_valid && c->ops.target_valid(t) == false)
continue;
-
- damon_for_each_region(r, t)
- damon_do_apply_schemes(c, t, r);
+ damos_apply_target(c, t, max_region_sz);
}
damon_for_each_scheme(s, c) {
@@ -2582,12 +3028,17 @@ static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
+ int i;
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
l->nr_accesses_bp = l->nr_accesses * 10000;
l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
l->ar.end = r->ar.end;
+ /* todo: do this for only installed probes */
+ for (i = 0; i < DAMON_MAX_PROBES; i++)
+ l->probe_hits[i] = (l->probe_hits[i] * sz_l + r->probe_hits[i]
+ * sz_r) / (sz_l + sz_r);
damon_verify_merge_two_regions(l, r);
damon_destroy_region(r, t);
}
@@ -2710,13 +3161,16 @@ static void damon_split_region_at(struct damon_target *t,
new->last_nr_accesses = r->last_nr_accesses;
new->nr_accesses_bp = r->nr_accesses_bp;
new->nr_accesses = r->nr_accesses;
+ /* TODO: do this only for installed probes */
+ memcpy(new->probe_hits, r->probe_hits, sizeof(r->probe_hits));
damon_insert_region(new, r, damon_next_region(r), t);
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_target *t, int nr_subs,
- unsigned long min_region_sz)
+static void damon_split_regions_of(struct damon_ctx *ctx,
+ struct damon_target *t, int nr_subs,
+ unsigned long min_region_sz)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2731,7 +3185,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs,
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
- sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
+ sz_sub = ALIGN_DOWN(damon_rand(ctx, 1, 10) *
sz_region / 10, min_region_sz);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
@@ -2772,7 +3226,8 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions, ctx->min_region_sz);
+ damon_split_regions_of(ctx, t, nr_subregions,
+ ctx->min_region_sz);
last_nr_regions = nr_regions;
}
@@ -2857,6 +3312,37 @@ static void kdamond_usleep(unsigned long usecs)
usleep_range_idle(usecs, usecs + 1);
}
+#ifdef CONFIG_DAMON_DEBUG_SANITY
+static void damon_verify_ctx(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+
+ damon_for_each_target(t, c) {
+ struct damon_region *prev_r = NULL;
+ unsigned int nr_regions = 0;
+
+ damon_for_each_region(r, t) {
+ WARN_ONCE(r->ar.start >= r->ar.end,
+ "region start (%lu) >= end (%lu)\n",
+ r->ar.start, r->ar.end);
+ WARN_ONCE(prev_r && prev_r->ar.end > r->ar.start,
+ "region overlap (%lu > %lu)\n",
+ prev_r->ar.end, r->ar.start);
+ prev_r = r;
+ nr_regions++;
+ }
+ WARN_ONCE(damon_nr_regions(t) != nr_regions,
+ "nr_regions mismatch: %u != %u\n",
+ damon_nr_regions(t), nr_regions);
+ }
+}
+#else
+static void damon_verify_ctx(struct damon_ctx *c)
+{
+}
+#endif
+
/*
* kdamond_call() - handle damon_call_control objects.
* @ctx: The &struct damon_ctx of the kdamond.
@@ -2872,6 +3358,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
struct damon_call_control *control, *next;
LIST_HEAD(controls);
+ damon_verify_ctx(ctx);
+
mutex_lock(&ctx->call_controls_lock);
list_splice_tail_init(&ctx->call_controls, &controls);
mutex_unlock(&ctx->call_controls_lock);
@@ -2997,6 +3485,8 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
+ if (ctx->ops.apply_probes)
+ ctx->ops.apply_probes(ctx);
if (time_after_eq(ctx->passed_sample_intervals,
next_aggregation_sis)) {
@@ -3014,6 +3504,14 @@ static int kdamond_fn(void *data)
kdamond_call(ctx, false);
if (ctx->maybe_corrupted)
break;
+ while (ctx->pause) {
+ damos_walk_cancel(ctx);
+ kdamond_usleep(ctx->attrs.sample_interval);
+ /* allow the caller to unset pause via damon_call() */
+ kdamond_call(ctx, false);
+ if (kdamond_need_stop(ctx) || ctx->maybe_corrupted)
+ goto done;
+ }
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
else
@@ -3096,14 +3594,20 @@ done:
return 0;
}
-static int walk_system_ram(struct resource *res, void *arg)
+struct damon_system_ram_range_walk_arg {
+ bool walked;
+ struct resource res;
+};
+
+static int damon_system_ram_walk_fn(struct resource *res, void *arg)
{
- struct resource *a = arg;
+ struct damon_system_ram_range_walk_arg *a = arg;
- if (resource_size(a) < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
+ if (!a->walked) {
+ a->walked = true;
+ a->res.start = res->start;
}
+ a->res.end = res->end;
return 0;
}
@@ -3120,27 +3624,24 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra,
return ra / addr_unit;
}
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool damon_find_biggest_system_ram(unsigned long *start,
+static bool damon_find_system_rams_range(unsigned long *start,
unsigned long *end, unsigned long addr_unit)
-
{
- struct resource res = {};
+ struct damon_system_ram_range_walk_arg arg = {};
- walk_system_ram_res(0, -1, &res, walk_system_ram);
- *start = damon_res_to_core_addr(res.start, addr_unit);
- *end = damon_res_to_core_addr(res.end + 1, addr_unit);
+ walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn);
+ if (!arg.walked)
+ return false;
+ *start = damon_res_to_core_addr(arg.res.start, addr_unit);
+ *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit);
if (*end <= *start)
return false;
return true;
}
/**
- * damon_set_region_biggest_system_ram_default() - Set the region of the given
- * monitoring target as requested, or biggest 'System RAM'.
+ * damon_set_region_system_rams_default() - Set the region of the given
+ * monitoring target as requested, or to cover all 'System RAM' resources.
* @t: The monitoring target to set the region.
* @start: The pointer to the start address of the region.
* @end: The pointer to the end address of the region.
@@ -3148,14 +3649,14 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* @min_region_sz: Minimum region size.
*
* This function sets the region of @t as requested by @start and @end. If the
- * values of @start and @end are zero, however, this function finds the biggest
- * 'System RAM' resource and sets the region to cover the resource. In the
- * latter case, this function saves the start and end addresses of the resource
- * in @start and @end, respectively.
+ * values of @start and @end are zero, however, this function finds 'System
+ * RAM' resources and sets the region to cover all the resources. In the latter
+ * case, this function saves the start address of the first resource and the
+ * end address of the last resource in @start and @end, respectively.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+int damon_set_region_system_rams_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
unsigned long addr_unit, unsigned long min_region_sz)
{
@@ -3165,7 +3666,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
return -EINVAL;
if (!*start && !*end &&
- !damon_find_biggest_system_ram(start, end, addr_unit))
+ !damon_find_system_rams_range(start, end, addr_unit))
return -EINVAL;
addr_range.start = *start;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 8cfe7bd3dc1d..8298c6001fd0 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* the re-reading, DAMON_LRU_SORT will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Desired active to [in]active memory ratio in bp (1/10,000).
@@ -140,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -149,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_LRU_SORT will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -285,6 +286,11 @@ static int damon_lru_sort_apply_parameters(void)
param_ctx->addr_unit = addr_unit;
param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1);
+ if (!is_power_of_2(param_ctx->min_region_sz)) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (!damon_lru_sort_mon_attrs.sample_interval) {
err = -EINVAL;
goto out;
@@ -327,7 +333,7 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
goto out;
- err = damon_set_region_biggest_system_ram_default(param_target,
+ err = damon_set_region_system_rams_default(param_target,
&monitor_region_start,
&monitor_region_end,
param_ctx->addr_unit,
@@ -340,18 +346,51 @@ out:
return err;
}
-static int damon_lru_sort_handle_commit_inputs(void)
+static int damon_lru_sort_commit_inputs_fn(void *arg)
+{
+ return damon_lru_sort_apply_parameters();
+}
+
+static int damon_lru_sort_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_lru_sort_commit_inputs_fn,
+ };
+
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
- if (!commit_inputs)
+ if (!commit_inputs_request)
return 0;
- err = damon_lru_sort_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_lru_sort_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_lru_sort_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -365,7 +404,7 @@ static int damon_lru_sort_damon_call_fn(void *arg)
damon_lru_sort_cold_stat = s->stat;
}
- return damon_lru_sort_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index c3e4c871b0bb..5c93ef2bb8a9 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
damon_max_nr_accesses(&c->attrs);
age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
+ if (age_in_sec)
+ age_in_log = min_t(int, ilog2(age_in_sec) + 1,
+ DAMON_MAX_AGE_IN_LOG);
+ else
+ age_in_log = 0;
+
/* If frequency is 0, higher age means it's colder */
if (freq_subscore == 0)
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 5cdcc5037cbc..d0598f5f2688 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -49,11 +49,11 @@ static void damon_pa_mkold(phys_addr_t paddr)
}
static void __damon_pa_prepare_access_check(struct damon_region *r,
- unsigned long addr_unit)
+ struct damon_ctx *ctx)
{
- r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+ r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end);
- damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit));
+ damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, ctx->addr_unit));
}
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
@@ -63,7 +63,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(r, ctx->addr_unit);
+ __damon_pa_prepare_access_check(r, ctx);
}
}
@@ -120,6 +120,81 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
+static bool damon_pa_filter_match(struct damon_filter *filter,
+ struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+
+ switch (filter->type) {
+ case DAMON_FILTER_TYPE_ANON:
+ if (!folio) {
+ matched = false;
+ break;
+ }
+ matched = folio_test_anon(folio);
+ break;
+ case DAMON_FILTER_TYPE_MEMCG:
+ if (!folio) {
+ matched = false;
+ break;
+ }
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return matched == filter->matching;
+}
+
+static bool damon_pa_filter_pass(phys_addr_t pa, struct folio *folio,
+ struct damon_probe *p)
+{
+ struct damon_filter *f;
+ bool pass = true;
+
+ damon_for_each_filter(f, p) {
+ if (damon_pa_filter_match(f, folio)) {
+ pass = f->allow;
+ break;
+ }
+ pass = !f->allow;
+ }
+ return pass;
+}
+
+static void damon_pa_apply_probes(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ struct damon_probe *p;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t) {
+ int i = 0;
+ phys_addr_t pa;
+ struct folio *folio;
+
+ pa = damon_pa_phys_addr(r->sampling_addr,
+ ctx->addr_unit);
+ folio = damon_get_folio(PHYS_PFN(pa));
+ damon_for_each_probe(p, ctx) {
+ if (damon_pa_filter_pass(pa, folio, p))
+ r->probe_hits[i]++;
+ i++;
+ }
+ if (folio)
+ folio_put(folio);
+ }
+ }
+}
+
/*
* damos_pa_filter_out - Return true if the page should be filtered out.
*/
@@ -371,6 +446,7 @@ static int __init damon_pa_initcall(void)
.update = NULL,
.prepare_access_checks = damon_pa_prepare_access_checks,
.check_accesses = damon_pa_check_accesses,
+ .apply_probes = damon_pa_apply_probes,
.target_valid = NULL,
.apply_scheme = damon_pa_apply_scheme,
.get_scheme_score = damon_pa_scheme_score,
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 96f6dfc28eae..ce4499cf4b8b 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -39,7 +39,6 @@ static bool enabled __read_mostly;
* re-reading, DAMON_RECLAIM will be disabled.
*/
static bool commit_inputs __read_mostly;
-module_param(commit_inputs, bool, 0600);
/*
* Time threshold for cold memory regions identification in microseconds.
@@ -92,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600);
static unsigned long quota_autotune_feedback __read_mostly;
module_param(quota_autotune_feedback, ulong, 0600);
+/*
+ * Auto-tune monitoring intervals.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's
+ * sampling and aggregation intervals. The auto-tuning aims to capture a
+ * meaningful amount of access events in each DAMON-snapshot, while keeping the
+ * sampling interval at least 5 milliseconds and at most 10 seconds.
+ * Setting this as ``N`` disables the auto-tuning.
+ *
+ * Disabled by default.
+ */
+static bool autotune_monitoring_intervals __read_mostly;
+module_param(autotune_monitoring_intervals, bool, 0600);
+
static struct damos_watermarks damon_reclaim_wmarks = {
.metric = DAMOS_WMARK_FREE_MEM_RATE,
.interval = 5000000, /* 5 seconds */
@@ -114,7 +127,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
* Start of the target memory region in physical address.
*
* The start physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_start __read_mostly;
module_param(monitor_region_start, ulong, 0600);
@@ -123,7 +137,8 @@ module_param(monitor_region_start, ulong, 0600);
* End of the target memory region in physical address.
*
* The end physical address of memory region that DAMON_RECLAIM will do work
- * against. By default, biggest System RAM is used as the region.
+ * against. By default, the system's entire physical memory is used as the
+ * region.
*/
static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
@@ -151,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
static struct damon_ctx *ctx;
static struct damon_target *target;
-static struct damos *damon_reclaim_new_scheme(void)
+static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval)
{
struct damos_access_pattern pattern = {
/* Find regions having PAGE_SIZE or larger size */
@@ -161,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void)
.min_nr_accesses = 0,
.max_nr_accesses = 0,
/* for min_age or more micro-seconds */
- .min_age_region = min_age /
- damon_reclaim_mon_attrs.aggr_interval,
+ .min_age_region = min_age / aggr_interval,
.max_age_region = UINT_MAX,
};
@@ -183,6 +197,7 @@ static int damon_reclaim_apply_parameters(void)
{
struct damon_ctx *param_ctx;
struct damon_target *param_target;
+ struct damon_attrs attrs;
struct damos *scheme;
struct damos_quota_goal *goal;
struct damos_filter *filter;
@@ -195,17 +210,31 @@ static int damon_reclaim_apply_parameters(void)
param_ctx->addr_unit = addr_unit;
param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1);
+ if (!is_power_of_2(param_ctx->min_region_sz)) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (!damon_reclaim_mon_attrs.aggr_interval) {
err = -EINVAL;
goto out;
}
- err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
+ attrs = damon_reclaim_mon_attrs;
+ if (autotune_monitoring_intervals) {
+ attrs.sample_interval = 5000;
+ attrs.aggr_interval = 100000;
+ attrs.intervals_goal.access_bp = 40;
+ attrs.intervals_goal.aggrs = 3;
+ attrs.intervals_goal.min_sample_us = 5000;
+ attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000;
+ }
+ err = damon_set_attrs(param_ctx, &attrs);
if (err)
goto out;
err = -ENOMEM;
- scheme = damon_reclaim_new_scheme();
+ scheme = damon_reclaim_new_scheme(attrs.aggr_interval);
if (!scheme)
goto out;
damon_set_schemes(param_ctx, &scheme, 1);
@@ -233,11 +262,9 @@ static int damon_reclaim_apply_parameters(void)
damos_add_filter(scheme, filter);
}
- err = damon_set_region_biggest_system_ram_default(param_target,
- &monitor_region_start,
- &monitor_region_end,
- param_ctx->addr_unit,
- param_ctx->min_region_sz);
+ err = damon_set_region_system_rams_default(param_target,
+ &monitor_region_start, &monitor_region_end,
+ param_ctx->addr_unit, param_ctx->min_region_sz);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
@@ -246,18 +273,51 @@ out:
return err;
}
-static int damon_reclaim_handle_commit_inputs(void)
+static int damon_reclaim_commit_inputs_fn(void *arg)
{
+ return damon_reclaim_apply_parameters();
+}
+
+static int damon_reclaim_commit_inputs_store(const char *val,
+ const struct kernel_param *kp)
+{
+ bool commit_inputs_request;
int err;
+ struct damon_call_control control = {
+ .fn = damon_reclaim_commit_inputs_fn,
+ };
- if (!commit_inputs)
+ if (!val) {
+ commit_inputs_request = true;
+ } else {
+ err = kstrtobool(val, &commit_inputs_request);
+ if (err)
+ return err;
+ }
+
+ if (!commit_inputs_request)
return 0;
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- return err;
+ /*
+ * Skip damon_call() if ctx is not initialized to avoid
+ * NULL pointer dereference.
+ */
+ if (!ctx)
+ return -EINVAL;
+
+ err = damon_call(ctx, &control);
+
+ return err ? err : control.return_code;
}
+static const struct kernel_param_ops commit_inputs_param_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = damon_reclaim_commit_inputs_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600);
+
static int damon_reclaim_damon_call_fn(void *arg)
{
struct damon_ctx *c = arg;
@@ -267,7 +327,7 @@ static int damon_reclaim_damon_call_fn(void *arg)
damon_for_each_scheme(s, c)
damon_reclaim_stat = s->stat;
- return damon_reclaim_handle_commit_inputs();
+ return 0;
}
static struct damon_call_control call_control = {
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 3951b762cbdd..0e14f5bb8f75 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data)
return 0;
}
-struct damon_stat_system_ram_range_walk_arg {
- bool walked;
- struct resource res;
-};
-
-static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg)
-{
- struct damon_stat_system_ram_range_walk_arg *a = arg;
-
- if (!a->walked) {
- a->walked = true;
- a->res.start = res->start;
- }
- a->res.end = res->end;
- return 0;
-}
-
-static unsigned long damon_stat_res_to_core_addr(resource_size_t ra,
- unsigned long addr_unit)
-{
- /*
- * Use div_u64() for avoiding linking errors related with __udivdi3,
- * __aeabi_uldivmod, or similar problems. This should also improve the
- * performance optimization (read div_u64() comment for the detail).
- */
- if (sizeof(ra) == 8 && sizeof(addr_unit) == 4)
- return div_u64(ra, addr_unit);
- return ra / addr_unit;
-}
-
-static int damon_stat_set_monitoring_region(struct damon_target *t,
- unsigned long addr_unit, unsigned long min_region_sz)
-{
- struct damon_addr_range addr_range;
- struct damon_stat_system_ram_range_walk_arg arg = {};
-
- walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn);
- if (!arg.walked)
- return -EINVAL;
- addr_range.start = damon_stat_res_to_core_addr(
- arg.res.start, addr_unit);
- addr_range.end = damon_stat_res_to_core_addr(
- arg.res.end + 1, addr_unit);
- if (addr_range.end <= addr_range.start)
- return -EINVAL;
- return damon_set_regions(t, &addr_range, 1, min_region_sz);
-}
-
static struct damon_ctx *damon_stat_build_ctx(void)
{
struct damon_ctx *ctx;
struct damon_attrs attrs;
struct damon_target *target;
+ unsigned long start = 0, end = 0;
ctx = damon_new_ctx();
if (!ctx)
@@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (!target)
goto free_out;
damon_add_target(ctx, target);
- if (damon_stat_set_monitoring_region(target, ctx->addr_unit,
- ctx->min_region_sz))
+ if (damon_set_region_system_rams_default(target, &start, &end,
+ ctx->addr_unit, ctx->min_region_sz))
goto free_out;
return ctx;
free_out:
@@ -313,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp)
return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N');
}
+static int damon_stat_kdamond_pid_store(
+ const char *val, const struct kernel_param *kp)
+{
+ /*
+ * kdamond_pid is read-only, but kernel command line could write it.
+ * Do nothing here.
+ */
+ return 0;
+}
+
+static int damon_stat_kdamond_pid_load(
+ char *buffer, const struct kernel_param *kp)
+{
+ int pid;
+
+ if (!damon_stat_context) {
+ pid = -1;
+ } else {
+ pid = damon_kdamond_pid(damon_stat_context);
+ if (pid < 1)
+ pid = -1;
+ }
+ return sprintf(buffer, "%d\n", pid);
+}
+
+static const struct kernel_param_ops kdamond_pid_param_ops = {
+ .set = damon_stat_kdamond_pid_store,
+ .get = damon_stat_kdamond_pid_load,
+};
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_STAT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400);
+MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond");
+
static int __init damon_stat_init(void)
{
int err = 0;
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
index 83e24a9b5a0d..bdc6ae2639e4 100644
--- a/mm/damon/sysfs-common.c
+++ b/mm/damon/sysfs-common.c
@@ -104,3 +104,44 @@ const struct kobj_type damon_sysfs_ul_range_ktype = {
.default_groups = damon_sysfs_ul_range_groups,
};
+
+static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
+ char *memcg_path_buf, char *path)
+{
+#ifdef CONFIG_MEMCG
+ cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
+ if (sysfs_streq(memcg_path_buf, path))
+ return true;
+#endif /* CONFIG_MEMCG */
+ return false;
+}
+
+int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id)
+{
+ struct mem_cgroup *memcg;
+ char *path;
+ bool found = false;
+
+ if (!memcg_path)
+ return -EINVAL;
+
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
+ memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
+ /* skip offlined memcg */
+ if (!mem_cgroup_online(memcg))
+ continue;
+ if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
+ *id = mem_cgroup_id(memcg);
+ found = true;
+ mem_cgroup_iter_break(NULL, memcg);
+ break;
+ }
+ }
+
+ kfree(path);
+ return found ? 0 : -EINVAL;
+}
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 2099adee11d0..3079306966a9 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -59,3 +59,5 @@ int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
void damos_sysfs_update_effective_quotas(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
+
+int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index a8014780edae..329cfd0bbe9f 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -11,6 +11,140 @@
#include "sysfs-common.h"
/*
+ * probe directory
+ */
+
+struct damos_sysfs_probe {
+ struct kobject kobj;
+ unsigned char hits;
+};
+
+static struct damos_sysfs_probe *damos_sysfs_probe_alloc(unsigned char hits)
+{
+ struct damos_sysfs_probe *probe;
+
+ probe = kzalloc_obj(*probe);
+ if (!probe)
+ return NULL;
+ probe->hits = hits;
+ return probe;
+}
+
+static ssize_t hits_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damos_sysfs_probe *probe = container_of(kobj,
+ struct damos_sysfs_probe, kobj);
+
+ return sysfs_emit(buf, "%hhu\n", probe->hits);
+}
+
+static void damos_sysfs_probe_release(struct kobject *kobj)
+{
+ struct damos_sysfs_probe *probe = container_of(kobj,
+ struct damos_sysfs_probe, kobj);
+
+ kfree(probe);
+}
+
+static struct kobj_attribute damos_sysfs_probe_hits_attr =
+ __ATTR_RO_MODE(hits, 0400);
+
+static struct attribute *damos_sysfs_probe_attrs[] = {
+ &damos_sysfs_probe_hits_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damos_sysfs_probe);
+
+static const struct kobj_type damos_sysfs_probe_ktype = {
+ .release = damos_sysfs_probe_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damos_sysfs_probe_groups,
+};
+
+/*
+ * probes directory
+ */
+
+struct damos_sysfs_probes {
+ struct kobject kobj;
+ struct damos_sysfs_probe **probes_arr;
+ int nr;
+};
+
+static struct damos_sysfs_probes *damos_sysfs_probes_alloc(void)
+{
+ return kzalloc_obj(struct damos_sysfs_probes);
+}
+
+static void damos_sysfs_probes_rm_dirs(struct damos_sysfs_probes *probes)
+{
+ struct damos_sysfs_probe **probes_arr = probes->probes_arr;
+ int i;
+
+ for (i = 0; i < probes->nr; i++)
+ kobject_put(&probes_arr[i]->kobj);
+ probes->nr = 0;
+ kfree(probes_arr);
+ probes->probes_arr = NULL;
+}
+
+static int damos_sysfs_probes_add_dirs(struct damos_sysfs_probes *probes,
+ struct damon_ctx *ctx, struct damon_region *region)
+{
+ struct damon_probe *probe;
+ struct damos_sysfs_probe **probes_arr;
+ int i = 0;
+
+ damon_for_each_probe(probe, ctx)
+ i++;
+
+ if (!i)
+ return 0;
+
+ probes_arr = kmalloc_objs(*probes_arr, i);
+ if (!probes_arr)
+ return -ENOMEM;
+ probes->probes_arr = probes_arr;
+
+ i = 0;
+ damon_for_each_probe(probe, ctx) {
+ struct damos_sysfs_probe *sys_probe;
+ int err;
+
+ sys_probe = damos_sysfs_probe_alloc(region->probe_hits[i]);
+ if (!sys_probe) {
+ damos_sysfs_probes_rm_dirs(probes);
+ return -ENOMEM;
+ }
+ err = kobject_init_and_add(&sys_probe->kobj,
+ &damos_sysfs_probe_ktype, &probes->kobj, "%d",
+ i);
+ if (err) {
+ kobject_put(&sys_probe->kobj);
+ damos_sysfs_probes_rm_dirs(probes);
+ return err;
+ }
+ probes_arr[i++] = sys_probe;
+ probes->nr++;
+ }
+ return 0;
+}
+
+static void damos_sysfs_probes_release(struct kobject *kobj)
+{
+ struct damos_sysfs_probes *probes = container_of(kobj,
+ struct damos_sysfs_probes, kobj);
+
+ kfree(probes);
+}
+
+static const struct kobj_type damos_sysfs_probes_ktype = {
+ .release = damos_sysfs_probes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
* scheme region directory
*/
@@ -20,6 +154,7 @@ struct damon_sysfs_scheme_region {
unsigned int nr_accesses;
unsigned int age;
unsigned long sz_filter_passed;
+ struct damos_sysfs_probes *probes;
struct list_head list;
};
@@ -34,10 +169,44 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc(
sysfs_region->ar = region->ar;
sysfs_region->nr_accesses = region->nr_accesses_bp / 10000;
sysfs_region->age = region->age;
+ sysfs_region->probes = NULL;
INIT_LIST_HEAD(&sysfs_region->list);
return sysfs_region;
}
+static int damos_sysfs_region_add_dirs(
+ struct damon_sysfs_scheme_region *region,
+ struct damon_ctx *ctx,
+ struct damon_region *dregion)
+{
+ struct damos_sysfs_probes *probes = damos_sysfs_probes_alloc();
+ int err;
+
+ if (!probes)
+ return -ENOMEM;
+ err = kobject_init_and_add(&probes->kobj, &damos_sysfs_probes_ktype,
+ &region->kobj, "probes");
+ if (err)
+ goto fail;
+ err = damos_sysfs_probes_add_dirs(probes, ctx, dregion);
+ if (err)
+ goto fail;
+
+ region->probes = probes;
+ return 0;
+
+fail:
+ kobject_put(&probes->kobj);
+ return err;
+}
+
+static void damos_sysfs_region_rm_dirs(
+ struct damon_sysfs_scheme_region *region)
+{
+ damos_sysfs_probes_rm_dirs(region->probes);
+ kobject_put(&region->probes->kobj);
+}
+
static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -163,6 +332,7 @@ static void damon_sysfs_scheme_regions_rm_dirs(
struct damon_sysfs_scheme_region *r, *next;
list_for_each_entry_safe(r, next, &regions->regions_list, list) {
+ damos_sysfs_region_rm_dirs(r);
list_del(&r->list);
kobject_put(&r->kobj);
regions->nr_regions--;
@@ -1093,6 +1263,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_INACTIVE_MEM_BP,
.name = "inactive_mem_bp",
},
+ {
+ .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP,
+ .name = "node_eligible_mem_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -1508,6 +1682,8 @@ struct damon_sysfs_quotas {
unsigned long reset_interval_ms;
unsigned long effective_sz; /* Effective size quota in bytes */
enum damos_quota_goal_tuner goal_tuner;
+ unsigned int fail_charge_num;
+ unsigned int fail_charge_denom;
};
static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
@@ -1682,6 +1858,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj,
return -EINVAL;
}
+static ssize_t fail_charge_num_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_num);
+}
+
+static ssize_t fail_charge_num_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_num);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t fail_charge_denom_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom);
+}
+
+static ssize_t fail_charge_denom_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtouint(buf, 0, &quotas->fail_charge_denom);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
static void damon_sysfs_quotas_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
@@ -1702,12 +1920,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr =
static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr =
__ATTR_RW_MODE(goal_tuner, 0600);
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr =
+ __ATTR_RW_MODE(fail_charge_num, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr =
+ __ATTR_RW_MODE(fail_charge_denom, 0600);
+
static struct attribute *damon_sysfs_quotas_attrs[] = {
&damon_sysfs_quotas_ms_attr.attr,
&damon_sysfs_quotas_sz_attr.attr,
&damon_sysfs_quotas_reset_interval_ms_attr.attr,
&damon_sysfs_quotas_effective_bytes_attr.attr,
&damon_sysfs_quotas_goal_tuner_attr.attr,
+ &damon_sysfs_quotas_fail_charge_num_attr.attr,
+ &damon_sysfs_quotas_fail_charge_denom_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_quotas);
@@ -2061,6 +2287,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
.name = "nohugepage",
},
{
+ .action = DAMOS_COLLAPSE,
+ .name = "collapse",
+ },
+ {
.action = DAMOS_LRU_PRIO,
.name = "lru_prio",
},
@@ -2561,47 +2791,6 @@ const struct kobj_type damon_sysfs_schemes_ktype = {
.default_groups = damon_sysfs_schemes_groups,
};
-static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
- char *memcg_path_buf, char *path)
-{
-#ifdef CONFIG_MEMCG
- cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
- if (sysfs_streq(memcg_path_buf, path))
- return true;
-#endif /* CONFIG_MEMCG */
- return false;
-}
-
-static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id)
-{
- struct mem_cgroup *memcg;
- char *path;
- bool found = false;
-
- if (!memcg_path)
- return -EINVAL;
-
- path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
- if (!path)
- return -ENOMEM;
-
- for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
- memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
- /* skip offlined memcg */
- if (!mem_cgroup_online(memcg))
- continue;
- if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
- *id = mem_cgroup_id(memcg);
- found = true;
- mem_cgroup_iter_break(NULL, memcg);
- break;
- }
- }
-
- kfree(path);
- return found ? 0 : -EINVAL;
-}
-
static int damon_sysfs_add_scheme_filters(struct damos *scheme,
struct damon_sysfs_scheme_filters *sysfs_filters)
{
@@ -2685,6 +2874,9 @@ static int damos_sysfs_add_quota_score(
}
goal->nid = sysfs_goal->nid;
break;
+ case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
default:
break;
}
@@ -2796,6 +2988,8 @@ static struct damos *damon_sysfs_mk_scheme(
.weight_nr_accesses = sysfs_weights->nr_accesses,
.weight_age = sysfs_weights->age,
.goal_tuner = sysfs_quotas->goal_tuner,
+ .fail_charge_num = sysfs_quotas->fail_charge_num,
+ .fail_charge_denom = sysfs_quotas->fail_charge_denom,
};
struct damos_watermarks wmarks = {
.metric = sysfs_wmarks->metric,
@@ -2930,12 +3124,17 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
if (kobject_init_and_add(&region->kobj,
&damon_sysfs_scheme_region_ktype,
&sysfs_regions->kobj, "%d",
- sysfs_regions->nr_regions++)) {
- kobject_put(&region->kobj);
- return;
- }
+ sysfs_regions->nr_regions))
+ goto out;
+ if (damos_sysfs_region_add_dirs(region, ctx, r))
+ goto out;
+
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
+ return;
+
+out:
+ kobject_put(&region->kobj);
}
int damon_sysfs_schemes_clear_regions(
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index eefa959aa30a..2e95e3bac774 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -748,6 +748,497 @@ static const struct kobj_type damon_sysfs_intervals_ktype = {
};
/*
+ * filter directory
+ */
+
+struct damon_sysfs_filter {
+ struct kobject kobj;
+ enum damon_filter_type type;
+ bool matching;
+ bool allow;
+ char *path;
+};
+
+static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void)
+{
+ return kzalloc_obj(struct damon_sysfs_filter);
+}
+
+struct damon_sysfs_filter_type_name {
+ enum damon_filter_type type;
+ char *name;
+};
+
+static const struct damon_sysfs_filter_type_name
+damon_sysfs_filter_type_names[] = {
+ {
+ .type = DAMON_FILTER_TYPE_ANON,
+ .name = "anon",
+ },
+ {
+ .type = DAMON_FILTER_TYPE_MEMCG,
+ .name = "memcg",
+ },
+};
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) {
+ const struct damon_sysfs_filter_type_name *type_name;
+
+ type_name = &damon_sysfs_filter_type_names[i];
+ if (type_name->type == filter->type)
+ return sysfs_emit(buf, "%s\n", type_name->name);
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ ssize_t ret = -EINVAL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) {
+ const struct damon_sysfs_filter_type_name *type_name;
+
+ type_name = &damon_sysfs_filter_type_names[i];
+ if (sysfs_streq(buf, type_name->name)) {
+ filter->type = type_name->type;
+ ret = count;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t matching_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N');
+}
+
+static ssize_t matching_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ bool matching;
+ int err = kstrtobool(buf, &matching);
+
+ if (err)
+ return err;
+
+ filter->matching = matching;
+ return count;
+}
+
+static ssize_t allow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->allow ? 'Y' : 'N');
+}
+
+static ssize_t allow_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ bool allow;
+ int err = kstrtobool(buf, &allow);
+
+ if (err)
+ return err;
+
+ filter->allow = allow;
+ return count;
+}
+
+static ssize_t path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ int len;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ len = sysfs_emit(buf, "%s\n", filter->path ? filter->path : "");
+ mutex_unlock(&damon_sysfs_lock);
+ return len;
+}
+
+static ssize_t path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+ char *path = kmalloc_objs(*path, size_add(count, 1));
+
+ if (!path)
+ return -ENOMEM;
+ strscpy(path, buf, size_add(count, 1));
+ if (!mutex_trylock(&damon_sysfs_lock)) {
+ kfree(path);
+ return -EBUSY;
+ }
+ kfree(filter->path);
+ filter->path = path;
+ mutex_unlock(&damon_sysfs_lock);
+ return count;
+}
+
+static void damon_sysfs_filter_release(struct kobject *kobj)
+{
+ struct damon_sysfs_filter *filter = container_of(kobj,
+ struct damon_sysfs_filter, kobj);
+
+ kfree(filter->path);
+ kfree(filter);
+}
+
+static struct kobj_attribute damon_sysfs_filter_type_attr =
+ __ATTR_RW_MODE(type, 0600);
+
+static struct kobj_attribute damon_sysfs_filter_matching_attr =
+ __ATTR_RW_MODE(matching, 0600);
+
+static struct kobj_attribute damon_sysfs_filter_allow_attr =
+ __ATTR_RW_MODE(allow, 0600);
+
+static struct kobj_attribute damon_sysfs_filter_path_attr =
+ __ATTR_RW_MODE(path, 0600);
+
+static struct attribute *damon_sysfs_filter_attrs[] = {
+ &damon_sysfs_filter_type_attr.attr,
+ &damon_sysfs_filter_matching_attr.attr,
+ &damon_sysfs_filter_allow_attr.attr,
+ &damon_sysfs_filter_path_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_filter);
+
+static const struct kobj_type damon_sysfs_filter_ktype = {
+ .release = damon_sysfs_filter_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_filter_groups,
+};
+
+/*
+ * filters directory
+ */
+
+struct damon_sysfs_filters {
+ struct kobject kobj;
+ struct damon_sysfs_filter **filters_arr;
+ int nr;
+};
+
+static struct damon_sysfs_filters *damon_sysfs_filters_alloc(void)
+{
+ return kzalloc_obj(struct damon_sysfs_filters);
+}
+
+static void damon_sysfs_filters_rm_dirs(struct damon_sysfs_filters *filters)
+{
+ struct damon_sysfs_filter **filters_arr = filters->filters_arr;
+ int i;
+
+ for (i = 0; i < filters->nr; i++)
+ kobject_put(&filters_arr[i]->kobj);
+ filters->nr = 0;
+ kfree(filters_arr);
+ filters->filters_arr = NULL;
+}
+
+static int damon_sysfs_filters_add_dirs(
+ struct damon_sysfs_filters *filters, int nr_filters)
+{
+ struct damon_sysfs_filter **filters_arr, *filter;
+ int err, i;
+
+ damon_sysfs_filters_rm_dirs(filters);
+ if (!nr_filters)
+ return 0;
+
+ filters_arr = kmalloc_objs(*filters_arr, nr_filters,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!filters_arr)
+ return -ENOMEM;
+ filters->filters_arr = filters_arr;
+
+ for (i = 0; i < nr_filters; i++) {
+ filter = damon_sysfs_filter_alloc();
+ if (!filter) {
+ damon_sysfs_filters_rm_dirs(filters);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&filter->kobj,
+ &damon_sysfs_filter_ktype, &filters->kobj,
+ "%d", i);
+ if (err) {
+ kobject_put(&filter->kobj);
+ damon_sysfs_filters_rm_dirs(filters);
+ return err;
+ }
+
+ filters_arr[i] = filter;
+ filters->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_filters_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_filters *filters = container_of(kobj,
+ struct damon_sysfs_filters, kobj);
+
+ return sysfs_emit(buf, "%d\n", filters->nr);
+}
+
+static ssize_t nr_filters_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_filters *filters;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ filters = container_of(kobj, struct damon_sysfs_filters, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_filters_add_dirs(filters, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_filters_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_filters, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_filters_nr_attr =
+ __ATTR_RW_MODE(nr_filters, 0600);
+
+static struct attribute *damon_sysfs_filters_attrs[] = {
+ &damon_sysfs_filters_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_filters);
+
+static const struct kobj_type damon_sysfs_filters_ktype = {
+ .release = damon_sysfs_filters_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_filters_groups,
+};
+
+/*
+ * probe directory
+ */
+
+struct damon_sysfs_probe {
+ struct kobject kobj;
+ struct damon_sysfs_filters *filters;
+};
+
+static struct damon_sysfs_probe *damon_sysfs_probe_alloc(void)
+{
+ return kzalloc_obj(struct damon_sysfs_probe);
+}
+
+static int damon_sysfs_probe_add_dirs(struct damon_sysfs_probe *attr)
+{
+ struct damon_sysfs_filters *filters;
+ int err;
+
+ filters = damon_sysfs_filters_alloc();
+ if (!filters)
+ return -ENOMEM;
+ attr->filters = filters;
+
+ err = kobject_init_and_add(&filters->kobj, &damon_sysfs_filters_ktype,
+ &attr->kobj, "filters");
+ if (err) {
+ kobject_put(&filters->kobj);
+ attr->filters = NULL;
+ }
+ return err;
+}
+
+static void damon_sysfs_probe_rm_dirs(struct damon_sysfs_probe *attr)
+{
+ if (attr->filters) {
+ damon_sysfs_filters_rm_dirs(attr->filters);
+ kobject_put(&attr->filters->kobj);
+ }
+}
+
+static void damon_sysfs_probe_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_probe, kobj));
+}
+
+static struct attribute *damon_sysfs_probe_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_probe);
+
+static const struct kobj_type damon_sysfs_probe_ktype = {
+ .release = damon_sysfs_probe_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_probe_groups,
+};
+
+/*
+ * probes directory
+ */
+
+struct damon_sysfs_probes {
+ struct kobject kobj;
+ struct damon_sysfs_probe **probes_arr;
+ int nr;
+};
+
+static struct damon_sysfs_probes *damon_sysfs_probes_alloc(void)
+{
+ return kzalloc_obj(struct damon_sysfs_probes);
+}
+
+static void damon_sysfs_probes_rm_dirs(
+ struct damon_sysfs_probes *probes)
+{
+ struct damon_sysfs_probe **probes_arr = probes->probes_arr;
+ int i;
+
+ for (i = 0; i < probes->nr; i++) {
+ damon_sysfs_probe_rm_dirs(probes_arr[i]);
+ kobject_put(&probes_arr[i]->kobj);
+ }
+ probes->nr = 0;
+ kfree(probes_arr);
+ probes->probes_arr = NULL;
+}
+
+static int damon_sysfs_probes_add_dirs(
+ struct damon_sysfs_probes *probes, int nr_probes)
+{
+ struct damon_sysfs_probe **probes_arr, *probe;
+ int err, i;
+
+ damon_sysfs_probes_rm_dirs(probes);
+ if (!nr_probes)
+ return 0;
+
+ probes_arr = kmalloc_objs(*probes_arr, nr_probes,
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!probes_arr)
+ return -ENOMEM;
+ probes->probes_arr = probes_arr;
+
+ for (i = 0; i < nr_probes; i++) {
+ probe = damon_sysfs_probe_alloc();
+ if (!probe) {
+ damon_sysfs_probes_rm_dirs(probes);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&probe->kobj,
+ &damon_sysfs_probe_ktype, &probes->kobj,
+ "%d", i);
+ if (err) {
+ kobject_put(&probe->kobj);
+ damon_sysfs_probes_rm_dirs(probes);
+ return err;
+ }
+
+ err = damon_sysfs_probe_add_dirs(probe);
+ if (err) {
+ kobject_put(&probe->kobj);
+ damon_sysfs_probes_rm_dirs(probes);
+ return err;
+ }
+
+ probes_arr[i] = probe;
+ probes->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_probes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_probes *probes = container_of(kobj,
+ struct damon_sysfs_probes, kobj);
+
+ return sysfs_emit(buf, "%d\n", probes->nr);
+}
+
+static ssize_t nr_probes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_probes *probes;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0 || nr > DAMON_MAX_PROBES)
+ return -EINVAL;
+
+ probes = container_of(kobj, struct damon_sysfs_probes, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_probes_add_dirs(probes, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_probes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_probes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_probes_nr_probes =
+ __ATTR_RW_MODE(nr_probes, 0600);
+
+static struct attribute *damon_sysfs_probes_attrs[] = {
+ &damon_sysfs_probes_nr_probes.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_probes);
+
+static const struct kobj_type damon_sysfs_probes_ktype = {
+ .release = damon_sysfs_probes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_probes_groups,
+};
+
+/*
* monitoring_attrs directory
*/
@@ -755,6 +1246,7 @@ struct damon_sysfs_attrs {
struct kobject kobj;
struct damon_sysfs_intervals *intervals;
struct damon_sysfs_ul_range *nr_regions_range;
+ struct damon_sysfs_probes *probes;
};
static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void)
@@ -771,6 +1263,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
{
struct damon_sysfs_intervals *intervals;
struct damon_sysfs_ul_range *nr_regions_range;
+ struct damon_sysfs_probes *probes;
int err;
intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000);
@@ -799,8 +1292,22 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
if (err)
goto put_nr_regions_intervals_out;
attrs->nr_regions_range = nr_regions_range;
+
+ probes = damon_sysfs_probes_alloc();
+ if (!probes) {
+ err = -ENOMEM;
+ goto put_nr_regions_intervals_out;
+ }
+ err = kobject_init_and_add(&probes->kobj,
+ &damon_sysfs_probes_ktype, &attrs->kobj, "probes");
+ if (err)
+ goto put_probes_out;
+ attrs->probes = probes;
return 0;
+put_probes_out:
+ kobject_put(&probes->kobj);
+ attrs->probes = NULL;
put_nr_regions_intervals_out:
kobject_put(&nr_regions_range->kobj);
attrs->nr_regions_range = NULL;
@@ -817,6 +1324,8 @@ static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
kobject_put(&attrs->nr_regions_range->kobj);
damon_sysfs_intervals_rm_dirs(attrs->intervals);
kobject_put(&attrs->intervals->kobj);
+ damon_sysfs_probes_rm_dirs(attrs->probes);
+ kobject_put(&attrs->probes->kobj);
}
static void damon_sysfs_attrs_release(struct kobject *kobj)
@@ -866,6 +1375,7 @@ struct damon_sysfs_context {
struct damon_sysfs_attrs *attrs;
struct damon_sysfs_targets *targets;
struct damon_sysfs_schemes *schemes;
+ bool pause;
};
static struct damon_sysfs_context *damon_sysfs_context_alloc(
@@ -878,6 +1388,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc(
context->kobj = (struct kobject){};
context->ops_id = ops_id;
context->addr_unit = 1;
+ context->pause = false;
return context;
}
@@ -1053,6 +1564,30 @@ static ssize_t addr_unit_store(struct kobject *kobj,
return count;
}
+static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%c\n", context->pause ? 'Y' : 'N');
+}
+
+static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ bool pause;
+ int err = kstrtobool(buf, &pause);
+
+ if (err)
+ return err;
+ context->pause = pause;
+ return count;
+}
+
+
static void damon_sysfs_context_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
@@ -1067,10 +1602,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr =
static struct kobj_attribute damon_sysfs_context_addr_unit_attr =
__ATTR_RW_MODE(addr_unit, 0600);
+static struct kobj_attribute damon_sysfs_context_pause_attr =
+ __ATTR_RW_MODE(pause, 0600);
+
static struct attribute *damon_sysfs_context_attrs[] = {
&damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
&damon_sysfs_context_addr_unit_attr.attr,
+ &damon_sysfs_context_pause_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_context);
@@ -1360,6 +1899,51 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
return damon_set_attrs(ctx, &attrs);
}
+static int damon_sysfs_set_probes(struct damon_ctx *ctx,
+ struct damon_sysfs_probes *sys_probes)
+{
+ int i;
+
+ for (i = 0; i < sys_probes->nr; i++) {
+ struct damon_sysfs_filters *sys_filters =
+ sys_probes->probes_arr[i]->filters;
+ struct damon_probe *c;
+ int j;
+
+ if (!sys_filters)
+ continue;
+ c = damon_new_probe();
+ if (!c)
+ return -ENOMEM;
+ damon_add_probe(ctx, c);
+
+ for (j = 0; j < sys_filters->nr; j++) {
+ struct damon_sysfs_filter *sys_filter =
+ sys_filters->filters_arr[j];
+ struct damon_filter *filter;
+
+ filter = damon_new_filter(sys_filter->type,
+ sys_filter->matching,
+ sys_filter->allow);
+ if (!filter)
+ return -ENOMEM;
+ if (filter->type == DAMON_FILTER_TYPE_MEMCG) {
+ int err;
+
+ err = damon_sysfs_memcg_path_to_id(
+ sys_filter->path,
+ &filter->memcg_id);
+ if (err) {
+ damon_destroy_filter(filter);
+ return err;
+ }
+ }
+ damon_add_filter(c, filter);
+ }
+ }
+ return 0;
+}
+
static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_regions *sysfs_regions,
unsigned long min_region_sz)
@@ -1470,9 +2054,13 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
if (sys_ctx->ops_id == DAMON_OPS_PADDR)
ctx->min_region_sz = max(
DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1);
+ ctx->pause = sys_ctx->pause;
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
+ err = damon_sysfs_set_probes(ctx, sys_ctx->attrs->probes);
+ if (err)
+ return err;
err = damon_sysfs_add_targets(ctx, sys_ctx->targets);
if (err)
return err;
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 9e5904c2beeb..1cfb8c176b87 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -273,54 +273,70 @@ static void damon_test_merge_regions_of(struct kunit *test)
static void damon_test_split_regions_of(struct kunit *test)
{
+ struct damon_ctx *c;
struct damon_target *t;
struct damon_region *r;
unsigned long sa[] = {0, 300, 500};
unsigned long ea[] = {220, 400, 700};
int i;
+ c = damon_new_ctx();
+ if (!c)
+ kunit_skip(test, "ctx alloc fail");
+
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "target alloc fail");
+ }
r = damon_new_region(0, 22);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "region alloc fail");
}
damon_add_region(r, t);
- damon_split_regions_of(t, 2, 1);
+ damon_split_regions_of(c, t, 2, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "second target alloc fail");
+ }
r = damon_new_region(0, 220);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "second region alloc fail");
}
damon_add_region(r, t);
- damon_split_regions_of(t, 4, 1);
+ damon_split_regions_of(c, t, 4, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
t = damon_new_target();
- if (!t)
+ if (!t) {
+ damon_destroy_ctx(c);
kunit_skip(test, "third target alloc fail");
+ }
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
if (!r) {
damon_free_target(t);
+ damon_destroy_ctx(c);
kunit_skip(test, "region alloc fail");
}
damon_add_region(r, t);
}
- damon_split_regions_of(t, 4, 5);
+ damon_split_regions_of(c, t, 4, 5);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u);
damon_for_each_region(r, t)
KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul);
damon_free_target(t);
+
+ damon_destroy_ctx(c);
}
static void damon_test_ops_registration(struct kunit *test)
@@ -374,41 +390,139 @@ static void damon_test_ops_registration(struct kunit *test)
}
}
-static void damon_test_set_regions(struct kunit *test)
+static void damon_test_set_regions_for(struct kunit *test,
+ struct damon_addr_range *old_ranges, int sz_old_ranges,
+ struct damon_addr_range *new_ranges, int sz_new_ranges,
+ unsigned long min_region_sz,
+ struct damon_addr_range *expect_ranges, int sz_expect_ranges)
{
- struct damon_target *t = damon_new_target();
- struct damon_region *r1, *r2;
- struct damon_addr_range range = {.start = 8, .end = 28};
- unsigned long expects[] = {8, 16, 16, 24, 24, 28};
- int expect_idx = 0;
+ struct damon_target *t;
struct damon_region *r;
+ int i;
+ t = damon_new_target();
if (!t)
kunit_skip(test, "target alloc fail");
- r1 = damon_new_region(4, 16);
- if (!r1) {
- damon_free_target(t);
- kunit_skip(test, "region alloc fail");
- }
- r2 = damon_new_region(24, 32);
- if (!r2) {
- damon_free_target(t);
- damon_free_region(r1);
- kunit_skip(test, "second region alloc fail");
+ for (i = 0; i < sz_old_ranges; i++) {
+ r = damon_new_region(old_ranges[i].start, old_ranges[i].end);
+ if (!r) {
+ damon_destroy_target(t, NULL);
+ kunit_skip(test, "%d-th r alloc fail\n", i);
+ }
+ damon_add_region(r, t);
}
- damon_add_region(r1, t);
- damon_add_region(r2, t);
- damon_set_regions(t, &range, 1, 1);
+ damon_set_regions(t, new_ranges, sz_new_ranges, min_region_sz);
- KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), sz_expect_ranges);
+ if (damon_nr_regions(t) != sz_expect_ranges) {
+ damon_destroy_target(t, NULL);
+ return;
+ }
+ i = 0;
damon_for_each_region(r, t) {
- KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
- KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
+ KUNIT_EXPECT_EQ(test, r->ar.start, expect_ranges[i].start);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expect_ranges[i++].end);
}
+
damon_destroy_target(t, NULL);
}
+static void damon_test_set_regions(struct kunit *test)
+{
+ /* Initial build up on empty target. */
+ damon_test_set_regions_for(test,
+ (struct damon_addr_range[]){}, 0,
+ (struct damon_addr_range[]){
+ {.start = 5, .end = 15},
+ {.start = 15, .end = 25},
+ }, 2,
+ 1,
+ (struct damon_addr_range[]){
+ {.start = 5, .end = 15},
+ {.start = 15, .end = 25},
+ }, 2);
+ /* Un-intersecting regions should be removed. */
+ damon_test_set_regions_for(test,
+ (struct damon_addr_range[]){
+ {.start = 4, .end = 16},
+ {.start = 24, .end = 32},
+ }, 2,
+ (struct damon_addr_range[]){
+ {.start = 18, .end = 23},
+ }, 1,
+ 1,
+ (struct damon_addr_range[]){
+ {.start = 18, .end = 23},
+ }, 1);
+ /*
+ * Holes should be filled up with new regions.
+ *
+ * old: [4, 16) [24, 32)
+ * new: [8, 28)
+ * expect: [8, 16)[16,24),[24, 28)
+ */
+ damon_test_set_regions_for(test,
+ (struct damon_addr_range[]){
+ {.start = 4, .end = 16},
+ {.start = 24, .end = 32},
+ }, 2,
+ (struct damon_addr_range[]){
+ {.start = 8, .end = 28},
+ }, 1,
+ 1,
+ (struct damon_addr_range[]){
+ {.start = 8, .end = 16},
+ {.start = 16, .end = 24},
+ {.start = 24, .end = 28},
+ }, 3);
+ /*
+ * New regions should be able to be appended.
+ *
+ * old: [0, 4)[4, 17)
+ * new: [0, 15) [25, 40)
+ * expect: [0, 4)[4, 15) [25, 40)
+ */
+ damon_test_set_regions_for(test,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 4},
+ {.start = 4, .end = 17},
+ }, 2,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 15},
+ {.start = 25, .end = 40},
+ }, 2,
+ 1,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 4},
+ {.start = 4, .end = 15},
+ {.start = 25, .end = 40},
+ }, 3);
+ /*
+ * New regions should be able to be inserted.
+ *
+ * old: [0, 4) [42, 52)
+ * new: [0, 15) [25, 40) [44, 50)
+ * expect: [0, 15) [25, 40) [44, 50)
+ */
+ damon_test_set_regions_for(test,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 4},
+ {.start = 42, .end = 52},
+ }, 2,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 15},
+ {.start = 25, .end = 40},
+ {.start = 44, .end = 50},
+ }, 3,
+ 1,
+ (struct damon_addr_range[]){
+ {.start = 0, .end = 15},
+ {.start = 25, .end = 40},
+ {.start = 44, .end = 50},
+ }, 3);
+}
+
static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
{
struct damon_attrs attrs = {
@@ -694,6 +808,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 2,
.sz = 3,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST,
+ .fail_charge_num = 2,
+ .fail_charge_denom = 3,
.weight_sz = 4,
.weight_nr_accesses = 5,
.weight_age = 6,
@@ -703,6 +819,8 @@ static void damos_test_commit_quota(struct kunit *test)
.ms = 8,
.sz = 9,
.goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL,
+ .fail_charge_num = 1,
+ .fail_charge_denom = 1024,
.weight_sz = 10,
.weight_nr_accesses = 11,
.weight_age = 12,
@@ -717,6 +835,8 @@ static void damos_test_commit_quota(struct kunit *test)
KUNIT_EXPECT_EQ(test, dst.ms, src.ms);
KUNIT_EXPECT_EQ(test, dst.sz, src.sz);
KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num);
+ KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom);
KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz);
KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses);
KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age);
@@ -1077,6 +1197,10 @@ static void damon_test_commit_ctx(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
src->min_region_sz = 4095;
KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL);
+ src->min_region_sz = 4096;
+ src->pause = true;
+ KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0);
+ KUNIT_EXPECT_TRUE(test, dst->pause);
damon_destroy_ctx(src);
damon_destroy_ctx(dst);
}
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index 98e734d77d51..563fbc7e3f44 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -132,22 +132,35 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
unsigned long *expected, int nr_expected)
{
struct damon_target *t;
+ struct damon_addr_range *ranges;
struct damon_region *r;
int i;
t = damon_new_target();
if (!t)
kunit_skip(test, "target alloc fail");
+
+ ranges = kmalloc_array(nr_regions / 2, sizeof(*ranges), GFP_KERNEL);
+ if (!ranges) {
+ damon_destroy_target(t, NULL);
+ kunit_skip(test, "ranges alloc fail");
+ }
for (i = 0; i < nr_regions / 2; i++) {
- r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
- if (!r) {
- damon_destroy_target(t, NULL);
- kunit_skip(test, "region alloc fail");
- }
- damon_add_region(r, t);
+ ranges[i].start = regions[i * 2];
+ ranges[i].end = regions[i * 2 + 1];
}
+ if (damon_set_regions(t, ranges, nr_regions / 2,
+ DAMON_MIN_REGION_SZ)) {
+ kfree(ranges);
+ damon_destroy_target(t, NULL);
+ kunit_skip(test, "damon_set_regions() fail");
+ }
+ kfree(ranges);
- damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ);
+ if (damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ)) {
+ damon_destroy_target(t, NULL);
+ kunit_skip(test, "second damon_set_regions() fail");
+ }
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b069dbc7e3d2..d27147603564 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -237,6 +237,35 @@ static void damon_va_update(struct damon_ctx *ctx)
}
}
+static void damon_va_walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mm_walk_ops *ops, void *private)
+{
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, start);
+ if (!vma)
+ goto lock_mmap;
+
+ if (end > vma->vm_end) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+
+ if (!(vma->vm_flags & VM_PFNMAP)) {
+ ops->walk_lock = PGWALK_VMA_RDLOCK_VERIFY;
+ walk_page_range_vma(vma, start, end, ops, private);
+ }
+
+ vma_end_read(vma);
+ return;
+
+lock_mmap:
+ mmap_read_lock(mm);
+ ops->walk_lock = PGWALK_RDLOCK;
+ walk_page_range(mm, start, end, ops, private);
+ mmap_read_unlock(mm);
+}
+
static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
@@ -315,17 +344,14 @@ out:
#define damon_mkold_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
-static const struct mm_walk_ops damon_mkold_ops = {
- .pmd_entry = damon_mkold_pmd_entry,
- .hugetlb_entry = damon_mkold_hugetlb_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
{
- mmap_read_lock(mm);
- walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
- mmap_read_unlock(mm);
+ struct mm_walk_ops damon_mkold_ops = {
+ .pmd_entry = damon_mkold_pmd_entry,
+ .hugetlb_entry = damon_mkold_hugetlb_entry,
+ };
+
+ damon_va_walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL);
}
/*
@@ -333,9 +359,10 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
*/
static void __damon_va_prepare_access_check(struct mm_struct *mm,
- struct damon_region *r)
+ struct damon_region *r,
+ struct damon_ctx *ctx)
{
- r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+ r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end);
damon_va_mkold(mm, r->sampling_addr);
}
@@ -351,7 +378,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
- __damon_va_prepare_access_check(mm, r);
+ __damon_va_prepare_access_check(mm, r, ctx);
mmput(mm);
}
}
@@ -444,12 +471,6 @@ out:
#define damon_young_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
-static const struct mm_walk_ops damon_young_ops = {
- .pmd_entry = damon_young_pmd_entry,
- .hugetlb_entry = damon_young_hugetlb_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
unsigned long *folio_sz)
{
@@ -458,9 +479,12 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
.young = false,
};
- mmap_read_lock(mm);
- walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
- mmap_read_unlock(mm);
+ struct mm_walk_ops damon_young_ops = {
+ .pmd_entry = damon_young_pmd_entry,
+ .hugetlb_entry = damon_young_hugetlb_entry,
+ };
+
+ damon_va_walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);
return arg.young;
}
@@ -749,7 +773,6 @@ static unsigned long damos_va_migrate(struct damon_target *target,
struct mm_walk_ops walk_ops = {
.pmd_entry = damos_va_migrate_pmd_entry,
.pte_entry = NULL,
- .walk_lock = PGWALK_RDLOCK,
};
use_target_nid = dests->nr_dests == 0;
@@ -767,9 +790,7 @@ static unsigned long damos_va_migrate(struct damon_target *target,
if (!mm)
goto free_lists;
- mmap_read_lock(mm);
- walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
- mmap_read_unlock(mm);
+ damon_va_walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
mmput(mm);
for (int i = 0; i < nr_dests; i++) {
@@ -861,7 +882,6 @@ static unsigned long damos_va_stat(struct damon_target *target,
struct mm_struct *mm;
struct mm_walk_ops walk_ops = {
.pmd_entry = damos_va_stat_pmd_entry,
- .walk_lock = PGWALK_RDLOCK,
};
priv.scheme = s;
@@ -874,9 +894,7 @@ static unsigned long damos_va_stat(struct damon_target *target,
if (!mm)
return 0;
- mmap_read_lock(mm);
- walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
- mmap_read_unlock(mm);
+ damon_va_walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv);
mmput(mm);
return 0;
}
@@ -903,6 +921,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
+ case DAMOS_COLLAPSE:
+ madv_action = MADV_COLLAPSE;
+ break;
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
return damos_va_migrate(t, r, scheme, sz_filter_passed);
diff --git a/mm/filemap.c b/mm/filemap.c
index 179f2886f8c0..7e467c81d213 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1808,9 +1808,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index);
- unsigned long nr = max_scan;
- while (nr--) {
+ while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
return xas.xa_index;
@@ -1818,7 +1817,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
return 0;
}
- return index + max_scan;
+ /* Return end of the range + 1 when no hole is found */
+ return xas.xa_index + 1;
}
EXPORT_SYMBOL(page_cache_next_miss);
@@ -1849,12 +1849,13 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_prev(&xas);
if (!entry || xa_is_value(entry))
- break;
+ return xas.xa_index;
if (xas.xa_index == ULONG_MAX)
- break;
+ return ULONG_MAX;
}
- return xas.xa_index;
+ /* Return start of the range - 1 when no hole is found */
+ return xas.xa_index - 1;
}
EXPORT_SYMBOL(page_cache_prev_miss);
@@ -2294,8 +2295,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
goto put_folio;
if (!folio_batch_add(fbatch, folio)) {
- nr = folio_nr_pages(folio);
- *start = folio->index + nr;
+ *start = folio_next_index(folio);
goto out;
}
xas_advance(&xas, folio_next_index(folio) - 1);
@@ -2355,8 +2355,7 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
if (xa_is_value(folio))
continue;
if (!folio_batch_add(fbatch, folio)) {
- unsigned long nr = folio_nr_pages(folio);
- *start = folio->index + nr;
+ *start = folio_next_index(folio);
goto out;
}
}
@@ -2414,8 +2413,7 @@ unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
}
}
if (!folio_batch_add(fbatch, folio)) {
- unsigned long nr = folio_nr_pages(folio);
- *start = folio->index + nr;
+ *start = folio_next_index(folio);
goto out;
}
}
@@ -3323,12 +3321,26 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *fpin = NULL;
vm_flags_t vm_flags = vmf->vma->vm_flags;
bool force_thp_readahead = false;
+ unsigned int thp_order = 0;
unsigned short mmap_miss;
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
+
/* Use the readahead code, even if readahead is disabled */
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
- force_thp_readahead = true;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE)) {
+ /*
+ * Cap max THP order at 2MB: this is the common PMD-sized
+ * hugepage size, and it avoids memory pressure from very
+ * large forced readahead when mapping_max_folio_order() is
+ * high (for example, 128MB with 64K base pages on arm64).
+ */
+ if (mapping_large_folio_support(mapping)) {
+ force_thp_readahead = true;
+ thp_order = min_t(unsigned int,
+ mapping_max_folio_order(mapping),
+ get_order(SZ_2M));
+ }
+ }
if (!force_thp_readahead) {
/*
@@ -3348,7 +3360,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
}
- if (!(vm_flags & VM_SEQ_READ)) {
+ if (!(vm_flags & (VM_SEQ_READ | VM_EXEC))) {
/* Avoid banging the cache line if not needed */
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss < MMAP_LOTSAMISS * 10)
@@ -3363,17 +3375,19 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
if (force_thp_readahead) {
+ unsigned long folio_nr_pages = 1UL << thp_order;
+
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
- ra->size = HPAGE_PMD_NR;
+ ractl._index &= ~(folio_nr_pages - 1);
+ ra->size = folio_nr_pages;
/*
- * Fetch two PMD folios, so we get the chance to actually
+ * Fetch two folios so we get the chance to actually
* readahead, unless we've been told not to.
*/
if (!(vm_flags & VM_RAND_READ))
ra->size *= 2;
- ra->async_size = HPAGE_PMD_NR;
- ra->order = HPAGE_PMD_ORDER;
+ ra->async_size = folio_nr_pages;
+ ra->order = thp_order;
page_cache_ra_order(&ractl, ra);
return fpin;
}
@@ -3407,6 +3421,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->start = max(ra->start, vmf->vma->vm_pgoff);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra->order = 0;
@@ -3441,14 +3456,20 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
* Don't touch the mmap_miss counter to avoid decreasing it multiple
* times for a single folio and break the balance with mmap_miss
* increase in do_sync_mmap_readahead().
+ *
+ * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss increment in
+ * do_sync_mmap_readahead(), so skip the decrement here as well to
+ * keep the counter symmetric.
*/
- if (likely(!folio_test_locked(folio))) {
+ if (likely(!folio_test_locked(folio)) &&
+ !(vmf->vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) {
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss)
WRITE_ONCE(ra->mmap_miss, --mmap_miss);
}
if (folio_test_readahead(folio)) {
+ ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1;
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_ra(&ractl, folio, ra->ra_pages);
}
@@ -3758,8 +3779,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned short *mmap_miss,
- pgoff_t file_end)
+ unsigned long *rss, pgoff_t file_end)
{
struct address_space *mapping = folio->mapping;
unsigned int ref_from_caller = 1;
@@ -3792,16 +3812,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
goto skip;
/*
- * If there are too many folios that are recently evicted
- * in a file, they will probably continue to be evicted.
- * In such situation, read-ahead is only a waste of IO.
- * Don't decrease mmap_miss in this scenario to make sure
- * we can stop read-ahead.
- */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
- /*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
@@ -3847,7 +3857,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned long *rss, unsigned short *mmap_miss)
+ unsigned long *rss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3855,10 +3865,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
if (PageHWPoison(page))
goto out;
- /* See comment of filemap_map_folio_range() */
- if (!folio_test_workingset(folio))
- (*mmap_miss)++;
-
/*
* NOTE: If there're PTE markers, we'll leave them to be
* handled in the specific fault path, and it'll prohibit
@@ -3893,7 +3899,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
vm_fault_t ret = 0;
unsigned long rss = 0;
unsigned int nr_pages = 0, folio_type;
- unsigned short mmap_miss = 0, mmap_miss_saved;
/*
* Recalculate end_pgoff based on file_end before calling
@@ -3932,6 +3937,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
folio_type = mm_counter_file(folio);
do {
unsigned long end;
+ vm_fault_t map_ret;
addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
@@ -3939,13 +3945,40 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
end = folio_next_index(folio) - 1;
nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
- if (!folio_test_large(folio))
- ret |= filemap_map_order0_folio(vmf,
- folio, addr, &rss, &mmap_miss);
- else
- ret |= filemap_map_folio_range(vmf, folio,
- xas.xa_index - folio->index, addr,
- nr_pages, &rss, &mmap_miss, file_end);
+ if (!folio_test_large(folio)) {
+ map_ret = filemap_map_order0_folio(vmf, folio, addr,
+ &rss);
+ } else {
+ unsigned long start = xas.xa_index - folio->index;
+
+ map_ret = filemap_map_folio_range(vmf, folio, start,
+ addr, nr_pages, &rss,
+ file_end);
+ }
+ ret |= map_ret;
+
+ /*
+ * If there are too many folios that are recently evicted
+ * in a file, they will probably continue to be evicted.
+ * In such a situation, read-ahead is only a waste of IO.
+ * Don't decrease mmap_miss in this scenario to make sure
+ * we can stop read-ahead.
+ *
+ * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss
+ * increment in do_sync_mmap_readahead(), so skip the
+ * decrement here as well to keep the counter symmetric.
+ */
+ if ((map_ret & VM_FAULT_NOPAGE) &&
+ !(vmf->flags & FAULT_FLAG_TRIED) &&
+ !folio_test_workingset(folio) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) {
+ unsigned short mmap_miss;
+
+ mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(file->f_ra.mmap_miss,
+ mmap_miss - 1);
+ }
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -3955,12 +3988,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
out:
rcu_read_unlock();
- mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
- if (mmap_miss >= mmap_miss_saved)
- WRITE_ONCE(file->f_ra.mmap_miss, 0);
- else
- WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
-
return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
diff --git a/mm/gup.c b/mm/gup.c
index ad9ded39609c..0692119b7904 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!folio)
goto pte_unmap;
- if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
- unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) ||
+ unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d29e85495091..64492dcb9d1e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -14,6 +14,7 @@
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
+#include <linux/list_lru.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
@@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+static struct lock_class_key deferred_split_key;
+static struct list_lru deferred_split_lru;
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc);
@@ -429,61 +432,75 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
return count;
}
+enum defrag_mode {
+ DEFRAG_ALWAYS = 0,
+ DEFRAG_DEFER,
+ DEFRAG_DEFER_MADVISE,
+ DEFRAG_MADVISE,
+ DEFRAG_NEVER,
+};
+
+static const char * const defrag_mode_strings[] = {
+ [DEFRAG_ALWAYS] = "always",
+ [DEFRAG_DEFER] = "defer",
+ [DEFRAG_DEFER_MADVISE] = "defer+madvise",
+ [DEFRAG_MADVISE] = "madvise",
+ [DEFRAG_NEVER] = "never",
+};
+
+static const enum transparent_hugepage_flag defrag_flags[] = {
+ [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+ [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+};
+
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- const char *output;
+ int active = DEFRAG_NEVER;
+ int len = 0;
+ int i;
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- &transparent_hugepage_flags))
- output = "[always] defer defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- &transparent_hugepage_flags))
- output = "always [defer] defer+madvise madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer [defer+madvise] madvise never";
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
- &transparent_hugepage_flags))
- output = "always defer defer+madvise [madvise] never";
- else
- output = "always defer defer+madvise madvise [never]";
+ for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) {
+ if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) {
+ active = i;
+ break;
+ }
+ }
- return sysfs_emit(buf, "%s\n", output);
+ for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) {
+ if (i == active)
+ len += sysfs_emit_at(buf, len, "[%s] ",
+ defrag_mode_strings[i]);
+ else
+ len += sysfs_emit_at(buf, len, "%s ",
+ defrag_mode_strings[i]);
+ }
+
+ /* Replace trailing space with newline */
+ buf[len - 1] = '\n';
+
+ return len;
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (sysfs_streq(buf, "always")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer+madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "defer")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "never")) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else
+ int mode, m;
+
+ mode = sysfs_match_string(defrag_mode_strings, buf);
+ if (mode < 0)
return -EINVAL;
+ for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) {
+ if (m == mode)
+ set_bit(defrag_flags[m], &transparent_hugepage_flags);
+ else
+ clear_bit(defrag_flags[m], &transparent_hugepage_flags);
+ }
+
return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
@@ -918,15 +935,28 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
}
#endif /* CONFIG_SYSFS */
+int folio_memcg_alloc_deferred(struct folio *folio)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ return folio_memcg_list_lru_alloc(folio, &deferred_split_lru, GFP_KERNEL);
+}
+
static int __init thp_shrinker_init(void)
{
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
- SHRINKER_MEMCG_AWARE |
- SHRINKER_NONSLAB,
+ SHRINKER_MEMCG_AWARE,
"thp-deferred_split");
if (!deferred_split_shrinker)
return -ENOMEM;
+ if (list_lru_init_memcg_key(&deferred_split_lru,
+ deferred_split_shrinker,
+ &deferred_split_key)) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
@@ -948,6 +978,7 @@ static int __init thp_shrinker_init(void)
huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
if (!huge_zero_folio_shrinker) {
shrinker_free(deferred_split_shrinker);
+ list_lru_destroy(&deferred_split_lru);
return -ENOMEM;
}
@@ -962,6 +993,7 @@ static void __init thp_shrinker_exit(void)
{
shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
+ list_lru_destroy(&deferred_split_lru);
}
static int __init hugepage_init(void)
@@ -1141,119 +1173,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static struct deferred_split *split_queue_node(int nid)
-{
- struct pglist_data *pgdata = NODE_DATA(nid);
-
- return &pgdata->deferred_split_queue;
-}
-
-#ifdef CONFIG_MEMCG
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
- struct deferred_split *queue)
-{
- if (mem_cgroup_disabled())
- return NULL;
- if (split_queue_node(folio_nid(folio)) == queue)
- return NULL;
- return container_of(queue, struct mem_cgroup, deferred_split_queue);
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
- return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
-}
-#else
-static inline
-struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
- struct deferred_split *queue)
-{
- return NULL;
-}
-
-static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
-{
- return split_queue_node(nid);
-}
-#endif
-
-static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
-{
- struct deferred_split *queue;
-
-retry:
- queue = memcg_split_queue(nid, memcg);
- spin_lock(&queue->split_queue_lock);
- /*
- * There is a period between setting memcg to dying and reparenting
- * deferred split queue, and during this period the THPs in the deferred
- * split queue will be hidden from the shrinker side.
- */
- if (unlikely(memcg_is_dying(memcg))) {
- spin_unlock(&queue->split_queue_lock);
- memcg = parent_mem_cgroup(memcg);
- goto retry;
- }
-
- return queue;
-}
-
-static struct deferred_split *
-split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
-{
- struct deferred_split *queue;
-
-retry:
- queue = memcg_split_queue(nid, memcg);
- spin_lock_irqsave(&queue->split_queue_lock, *flags);
- if (unlikely(memcg_is_dying(memcg))) {
- spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
- memcg = parent_mem_cgroup(memcg);
- goto retry;
- }
-
- return queue;
-}
-
-static struct deferred_split *folio_split_queue_lock(struct folio *folio)
-{
- struct deferred_split *queue;
-
- rcu_read_lock();
- queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
- /*
- * The memcg destruction path is acquiring the split queue lock for
- * reparenting. Once you have it locked, it's safe to drop the rcu lock.
- */
- rcu_read_unlock();
-
- return queue;
-}
-
-static struct deferred_split *
-folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
-{
- struct deferred_split *queue;
-
- rcu_read_lock();
- queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
- rcu_read_unlock();
-
- return queue;
-}
-
-static inline void split_queue_unlock(struct deferred_split *queue)
-{
- spin_unlock(&queue->split_queue_lock);
-}
-
-static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
- unsigned long flags)
-{
- spin_unlock_irqrestore(&queue->split_queue_lock, flags);
-}
-
static inline bool is_transparent_hugepage(const struct folio *folio)
{
if (!folio_test_large(folio))
@@ -1354,6 +1273,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
return NULL;
}
+
+ if (folio_memcg_alloc_deferred(folio)) {
+ folio_put(folio);
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+
folio_throttle_swaprate(folio, gfp);
/*
@@ -2638,6 +2565,8 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
} else if (softleaf_is_device_private_write(entry)) {
entry = make_readable_device_private_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
} else {
newpmd = *pmd;
}
@@ -3890,34 +3819,43 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
struct folio *end_folio = folio_next(folio);
struct folio *new_folio, *next;
int old_order = folio_order(folio);
+ struct list_lru_one *lru;
+ bool dequeue_deferred;
int ret = 0;
- struct deferred_split *ds_queue;
VM_WARN_ON_ONCE(!mapping && end);
- /* Prevent deferred_split_scan() touching ->_refcount */
- ds_queue = folio_split_queue_lock(folio);
+ /*
+ * If this folio can be on the deferred split queue, lock out
+ * the shrinker before freezing the ref. If the shrinker sees
+ * a 0-ref folio, it assumes it beat folio_put() to the list
+ * lock and must clean up the LRU state - the same dequeue we
+ * will do below as part of the split.
+ */
+ dequeue_deferred = folio_test_anon(folio) && old_order > 1;
+ if (dequeue_deferred) {
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ lru = list_lru_lock(&deferred_split_lru,
+ folio_nid(folio), &memcg);
+ }
if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
- if (old_order > 1) {
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- /*
- * Reinitialize page_deferred_list after removing the
- * page from the split_queue, otherwise a subsequent
- * split will see list corruption when checking the
- * page_deferred_list.
- */
- list_del_init(&folio->_deferred_list);
- }
+ if (dequeue_deferred) {
+ __list_lru_del(&deferred_split_lru, lru,
+ &folio->_deferred_list, folio_nid(folio));
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(old_order,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
+ list_lru_unlock(lru);
+ rcu_read_unlock();
}
- split_queue_unlock(ds_queue);
+
if (mapping) {
int nr = folio_nr_pages(folio);
@@ -4018,7 +3956,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
if (ci)
swap_cluster_unlock(ci);
} else {
- split_queue_unlock(ds_queue);
+ if (dequeue_deferred) {
+ list_lru_unlock(lru);
+ rcu_read_unlock();
+ }
return -EAGAIN;
}
@@ -4193,11 +4134,10 @@ fail:
folio_unlock(new_folio);
/*
- * Subpages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
+ * Subpages whose mapping has been zapped may be freed
+ * earlier, but freeing them requires taking the
+ * lru_lock, so we defer put_page() on tail pages until
+ * after the split completes.
*/
free_folio_and_swap_cache(new_folio);
}
@@ -4385,33 +4325,37 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
* queueing THP splits, and that list is (racily observed to be) non-empty.
*
* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
- * zero: because even when split_queue_lock is held, a non-empty _deferred_list
- * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ * zero: because even when the list_lru lock is held, a non-empty
+ * _deferred_list might be in use on deferred_split_scan()'s unlocked
+ * on-stack list.
*
- * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
- * therefore important to unqueue deferred split before changing folio memcg.
+ * The list_lru sublist is determined by folio's memcg: it is therefore
+ * important to unqueue deferred split before changing folio memcg.
*/
bool __folio_unqueue_deferred_split(struct folio *folio)
{
- struct deferred_split *ds_queue;
+ struct mem_cgroup *memcg;
+ struct list_lru_one *lru;
+ int nid = folio_nid(folio);
unsigned long flags;
bool unqueued = false;
WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags);
+ if (__list_lru_del(&deferred_split_lru, lru, &folio->_deferred_list, nid)) {
if (folio_test_partially_mapped(folio)) {
folio_clear_partially_mapped(folio);
mod_mthp_stat(folio_order(folio),
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
- list_del_init(&folio->_deferred_list);
unqueued = true;
}
- split_queue_unlock_irqrestore(ds_queue, flags);
+ list_lru_unlock_irqrestore(lru, &flags);
+ rcu_read_unlock();
return unqueued; /* useful for debug warnings */
}
@@ -4419,7 +4363,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
- struct deferred_split *ds_queue;
+ struct list_lru_one *lru;
+ int nid;
+ struct mem_cgroup *memcg;
unsigned long flags;
/*
@@ -4434,7 +4380,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/*
* Exclude swapcache: originally to avoid a corrupt deferred split
- * queue. Nowadays that is fully prevented by memcg1_swapout();
+ * queue. Nowadays that is fully prevented by __memcg1_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
@@ -4442,7 +4388,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
if (folio_test_swapcache(folio))
return;
- ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
+ nid = folio_nid(folio);
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio);
@@ -4450,36 +4400,23 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
-
}
} else {
/* partially mapped folios cannot become non-partially mapped */
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
}
- if (list_empty(&folio->_deferred_list)) {
- struct mem_cgroup *memcg;
-
- memcg = folio_split_queue_memcg(folio, ds_queue);
- list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
- ds_queue->split_queue_len++;
- if (memcg)
- set_shrinker_bit(memcg, folio_nid(folio),
- shrinker_id(deferred_split_shrinker));
- }
- split_queue_unlock_irqrestore(ds_queue, flags);
+ __list_lru_add(&deferred_split_lru, lru, &folio->_deferred_list, nid, memcg);
+ list_lru_unlock_irqrestore(lru, &flags);
+ rcu_read_unlock();
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct pglist_data *pgdata = NODE_DATA(sc->nid);
- struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+ unsigned long count;
-#ifdef CONFIG_MEMCG
- if (sc->memcg)
- ds_queue = &sc->memcg->deferred_split_queue;
-#endif
- return READ_ONCE(ds_queue->split_queue_len);
+ count = list_lru_shrink_count(&deferred_split_lru, sc);
+ return count ?: SHRINK_EMPTY;
}
static bool thp_underused(struct folio *folio)
@@ -4509,45 +4446,49 @@ static bool thp_underused(struct folio *folio)
return false;
}
+static enum lru_status deferred_split_isolate(struct list_head *item,
+ struct list_lru_one *lru,
+ void *cb_arg)
+{
+ struct folio *folio = container_of(item, struct folio, _deferred_list);
+ struct list_head *freeable = cb_arg;
+
+ if (folio_try_get(folio)) {
+ list_lru_isolate_move(lru, item, freeable);
+ return LRU_REMOVED;
+ }
+
+ /*
+ * We lost the race with folio_put(). Read folio state before
+ * isolating: folio_unqueue_deferred_split() checks list_empty()
+ * locklessly, so once removed the folio can be freed at any time.
+ */
+ if (folio_test_partially_mapped(folio)) {
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
+ list_lru_isolate(lru, item);
+ return LRU_REMOVED;
+}
+
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct deferred_split *ds_queue;
- unsigned long flags;
+ LIST_HEAD(dispose);
struct folio *folio, *next;
- int split = 0, i;
- struct folio_batch fbatch;
-
- folio_batch_init(&fbatch);
+ int split = 0;
+ unsigned long isolated;
-retry:
- ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
- /* Take pin on all head pages to avoid freeing them under us */
- list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
- _deferred_list) {
- if (folio_try_get(folio)) {
- folio_batch_add(&fbatch, folio);
- } else if (folio_test_partially_mapped(folio)) {
- /* We lost race with folio_put() */
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
- if (!--sc->nr_to_scan)
- break;
- if (!folio_batch_space(&fbatch))
- break;
- }
- split_queue_unlock_irqrestore(ds_queue, flags);
+ isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
+ deferred_split_isolate, &dispose);
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
bool did_split = false;
bool underused = false;
- struct deferred_split *fqueue;
- folio = fbatch.folios[i];
+ list_del_init(&folio->_deferred_list);
+
if (!folio_test_partially_mapped(folio)) {
/*
* See try_to_map_unused_to_zeropage(): we cannot
@@ -4576,63 +4517,23 @@ next:
* underused, then consider it used and don't add it back to
* split_queue.
*/
- if (did_split || !folio_test_partially_mapped(folio))
- continue;
+ if (!did_split && folio_test_partially_mapped(folio)) {
requeue:
- /*
- * Add back partially mapped folios, or underused folios that
- * we could not lock this round.
- */
- fqueue = folio_split_queue_lock_irqsave(folio, &flags);
- if (list_empty(&folio->_deferred_list)) {
- list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
- fqueue->split_queue_len++;
+ rcu_read_lock();
+ list_lru_add_irq(&deferred_split_lru,
+ &folio->_deferred_list,
+ folio_nid(folio),
+ folio_memcg(folio));
+ rcu_read_unlock();
}
- split_queue_unlock_irqrestore(fqueue, flags);
- }
- folios_put(&fbatch);
-
- if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
- cond_resched();
- goto retry;
+ folio_put(folio);
}
- /*
- * Stop shrinker if we didn't split any page, but the queue is empty.
- * This can happen if pages were freed under us.
- */
- if (!split && list_empty(&ds_queue->split_queue))
+ if (!split && !isolated)
return SHRINK_STOP;
return split;
}
-#ifdef CONFIG_MEMCG
-void reparent_deferred_split_queue(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct deferred_split *ds_queue = &memcg->deferred_split_queue;
- struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
- int nid;
-
- spin_lock_irq(&ds_queue->split_queue_lock);
- spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
-
- if (!ds_queue->split_queue_len)
- goto unlock;
-
- list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
- parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
- ds_queue->split_queue_len = 0;
-
- for_each_node(nid)
- set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
-
-unlock:
- spin_unlock(&parent_ds_queue->split_queue_lock);
- spin_unlock_irq(&ds_queue->split_queue_lock);
-}
-#endif
-
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c921287489de..571212b80835 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2862,6 +2862,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct hugetlb_cgroup *h_cg_rsvd = NULL;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
idx = hstate_index(h);
@@ -2912,7 +2913,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
- idx, pages_per_huge_page(h), &h_cg);
+ idx, pages_per_huge_page(h), &h_cg_rsvd);
if (ret)
goto out_subpool_put;
}
@@ -2954,7 +2955,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, folio);
+ h_cg_rsvd, folio);
}
spin_unlock_irq(&hugetlb_lock);
@@ -3006,7 +3007,7 @@ out_uncharge_cgroup:
out_uncharge_cgroup_reservation:
if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
- h_cg);
+ h_cg_rsvd);
out_subpool_put:
/*
* put page to subpool iff the quota of subpool's rsv_hpages is used
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index 7693ccefd0c6..39344d6c78d8 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -142,7 +142,7 @@ unsigned int __weak arch_hugetlb_cma_order(void)
void __init hugetlb_cma_reserve(void)
{
- unsigned long size, reserved, per_node, order;
+ unsigned long size, reserved, per_node, order, gigantic_page_size;
bool node_specific_cma_alloc = false;
int nid;
@@ -162,37 +162,36 @@ void __init hugetlb_cma_reserve(void)
* breaking this assumption.
*/
VM_WARN_ON(order <= MAX_PAGE_ORDER);
+ gigantic_page_size = PAGE_SIZE << order;
hugetlb_bootmem_set_nodes();
for (nid = 0; nid < MAX_NUMNODES; nid++) {
- if (hugetlb_cma_size_in_node[nid] == 0)
+ size = hugetlb_cma_size_in_node[nid];
+ if (size == 0)
continue;
if (!node_isset(nid, hugetlb_bootmem_nodes)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- continue;
- }
-
- if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
- nid, (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
+ } else if (!IS_ALIGNED(size, gigantic_page_size)) {
+ pr_warn("hugetlb_cma: cma area of node %d must be a multiple of %lu MiB\n",
+ nid, gigantic_page_size / SZ_1M);
} else {
node_specific_cma_alloc = true;
+ continue;
}
+
+ hugetlb_cma_size -= size;
+ hugetlb_cma_size_in_node[nid] = 0;
}
/* Validate the CMA size again in case some invalid nodes specified. */
if (!hugetlb_cma_size)
return;
- if (hugetlb_cma_size < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
- (PAGE_SIZE << order) / SZ_1M);
+ if (!IS_ALIGNED(hugetlb_cma_size, gigantic_page_size)) {
+ pr_warn("hugetlb_cma: cma area must be a multiple of %lu MiB\n",
+ gigantic_page_size / SZ_1M);
hugetlb_cma_size = 0;
return;
}
@@ -204,7 +203,7 @@ void __init hugetlb_cma_reserve(void)
*/
per_node = DIV_ROUND_UP(hugetlb_cma_size,
nodes_weight(hugetlb_bootmem_nodes));
- per_node = round_up(per_node, PAGE_SIZE << order);
+ per_node = round_up(per_node, gigantic_page_size);
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
}
@@ -223,15 +222,13 @@ void __init hugetlb_cma_reserve(void)
size = min(per_node, hugetlb_cma_size - reserved);
}
- size = round_up(size, PAGE_SIZE << order);
-
snprintf(name, sizeof(name), "hugetlb%d", nid);
/*
* Note that 'order per bit' is based on smallest size that
* may be returned to CMA allocator in the case of
* huge page demotion.
*/
- res = cma_declare_contiguous_multi(size, PAGE_SIZE << order,
+ res = cma_declare_contiguous_multi(size, gigantic_page_size,
HUGETLB_PAGE_ORDER, name,
&hugetlb_cma[nid], nid);
if (res) {
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..181e79f1d6a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,7 +17,6 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/leafops.h>
-#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>
/* Internal core VMA manipulation functions. */
@@ -451,24 +450,16 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
pte_t expected_pte = pte_next_swp_offset(pte);
const pte_t *end_ptep = start_ptep + max_nr;
- const softleaf_t entry = softleaf_from_pte(pte);
pte_t *ptep = start_ptep + 1;
- unsigned short cgroup_id;
VM_WARN_ON(max_nr < 1);
- VM_WARN_ON(!softleaf_is_swap(entry));
+ VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte)));
- cgroup_id = lookup_swap_cgroup_id(entry);
while (ptep < end_ptep) {
- softleaf_t entry;
-
pte = ptep_get(ptep);
if (!pte_same(pte, expected_pte))
break;
- entry = softleaf_from_pte(pte);
- if (lookup_swap_cgroup_id(entry) != cgroup_id)
- break;
expected_pte = pte_next_swp_offset(expected_pte);
ptep++;
}
@@ -861,7 +852,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
/*
* At this point, there is no one trying to add the folio to
* deferred_list. If folio is not in deferred_list, it's safe
- * to check without acquiring the split_queue_lock.
+ * to check without acquiring the list_lru lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
return false;
@@ -1104,9 +1095,17 @@ static inline void init_cma_pageblock(struct page *page)
}
#endif
-
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool claimable);
+enum fallback_result {
+ /* Found suitable migratetype, *mt_out is valid. */
+ FALLBACK_FOUND,
+ /* No fallback found in requested order. */
+ FALLBACK_EMPTY,
+ /* Passed @claimable, but claiming whole block is a bad idea. */
+ FALLBACK_NOCLAIM,
+};
+enum fallback_result
+find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool claimable, int *mt_out);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index e41ba69592ef..b9e167ed5be3 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -874,6 +874,16 @@ static void kmalloc_double_kzfree(struct kunit *test)
char *ptr;
size_t size = 16;
+ /*
+ * With the tag-based KASAN modes, if the memory happens to be
+ * reallocated between the two frees and the new allocation tag happens
+ * to match the old one, the second free will cause a memory corruption.
+ * Resolving https://bugzilla.kernel.org/show_bug.cgi?id=212177 would
+ * help to deal with this. With Generic KASAN, it's effectively
+ * impossible for the memory to get reallocated due to the quarantine.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index c6048f57bae9..de2d0f7d62b1 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
break;
}
- kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp,
policy_name, !!test_cache);
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043..73e262cb30dd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -437,13 +437,16 @@ void __khugepaged_enter(struct mm_struct *mm)
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(collapse_test_exit(mm), mm);
- if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
- return;
slot = mm_slot_alloc(mm_slot_cache);
if (!slot)
return;
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) {
+ mm_slot_free(mm_slot_cache, slot);
+ return;
+ }
+
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -1120,6 +1123,11 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
if (result != SCAN_SUCCEED)
goto out_nolock;
+ if (folio_memcg_alloc_deferred(folio)) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
+
mmap_read_lock(mm);
result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
if (result != SCAN_SUCCEED) {
@@ -2528,8 +2536,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
cc->progress++;
continue;
}
- hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
- hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
+ hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend) {
cc->progress++;
continue;
@@ -2808,6 +2816,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
case SCAN_PAGE_FILLED:
+ case SCAN_PAGE_HAS_PRIVATE:
case SCAN_PAGE_DIRTY_OR_WRITEBACK:
return -EAGAIN;
/*
@@ -2845,8 +2854,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmgrab(mm);
lru_add_drain_all();
- hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = end & HPAGE_PMD_MASK;
+ hstart = ALIGN(start, HPAGE_PMD_SIZE);
+ hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE);
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
enum scan_result result = SCAN_FAIL;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2eff0d6b622b..7c7ba17ce7af 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,6 +92,7 @@
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
+#include <linux/xarray.h>
#include <linux/crc32.h>
#include <asm/sections.h>
@@ -157,6 +158,8 @@ struct kmemleak_object {
struct hlist_head area_list;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
+ /* per-scan dedup count, valid only while in scan-local dedup xarray */
+ unsigned int dup_count;
char comm[TASK_COMM_LEN]; /* executable name */
};
@@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object)
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
*/
-static void print_unreferenced(struct seq_file *seq,
- struct kmemleak_object *object)
+static void __print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object,
+ bool hex_dump)
{
int i;
unsigned long *entries;
@@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq,
object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
- hex_dump_object(seq, object);
+ if (hex_dump)
+ hex_dump_object(seq, object);
warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum);
for (i = 0; i < nr_entries; i++) {
@@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq,
}
}
+static void print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ __print_unreferenced(seq, object, true);
+}
+
/*
* Print the kmemleak_object information. This function is used mainly for
* debugging special cases when kmemleak operations. It must be called with
@@ -1685,6 +1696,103 @@ unlock_put:
}
/*
+ * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it
+ * does not read a tracked allocation that was freed concurrently; the rest
+ * of the report (backtrace, comm, pid) is always emitted since the caller
+ * keeps the kmemleak_object alive (RCU read lock or get_object() reference).
+ */
+static void print_leak_locked(struct kmemleak_object *object, bool hex_dump)
+{
+ raw_spin_lock_irq(&object->lock);
+ __print_unreferenced(NULL, object,
+ hex_dump && (object->flags & OBJECT_ALLOCATED));
+ raw_spin_unlock_irq(&object->lock);
+}
+
+/*
+ * Per-scan dedup table for verbose leak printing. The xarray is keyed by
+ * stackdepot trace_handle and stores a pointer to the representative
+ * kmemleak_object. The per-scan repeat count lives in object->dup_count.
+ *
+ * dedup_record() must run outside object->lock: xa_store() may take
+ * xa_node slab locks at a higher wait-context level, which lockdep
+ * would flag against the raw_spinlock_t object->lock.
+ */
+static void dedup_record(struct xarray *dedup, struct kmemleak_object *object,
+ depot_stack_handle_t trace_handle)
+{
+ struct kmemleak_object *rep;
+ void *old;
+
+ /*
+ * No stack trace to dedup against: early-boot allocation tracked
+ * before kmemleak_init() set up object_cache, or stack_depot_save()
+ * failure under memory pressure.
+ */
+ if (!trace_handle) {
+ print_leak_locked(object, true);
+ return;
+ }
+
+ /* stack is available, now we can de-dup */
+ rep = xa_load(dedup, trace_handle);
+ if (rep) {
+ rep->dup_count++;
+ return;
+ }
+
+ /*
+ * Object is being torn down (use_count already hit zero); the
+ * tracked memory at object->pointer is unsafe to read, so skip.
+ */
+ if (!get_object(object))
+ return;
+
+ object->dup_count = 1;
+ old = xa_store(dedup, trace_handle, object, GFP_ATOMIC);
+ if (xa_is_err(old)) {
+ /* xa_node allocation failed; fall back to inline print. */
+ print_leak_locked(object, true);
+ put_object(object);
+ return;
+ }
+ /*
+ * scan_mutex serialises all writers to the dedup xarray, so xa_store()
+ * after a NULL xa_load() must always overwrite an empty slot.
+ */
+ WARN_ON_ONCE(old);
+}
+
+/*
+ * Drain the dedup table. Re-acquires object->lock and re-checks
+ * OBJECT_ALLOCATED before printing: while get_object() pins the
+ * kmemleak_object metadata, the underlying tracked allocation may have
+ * been freed since the scan walked it (kmemleak_free clears
+ * OBJECT_ALLOCATED under object->lock before the user memory goes away).
+ * The hex dump is skipped for coalesced entries since the bytes would
+ * differ across objects anyway.
+ */
+static void dedup_flush(struct xarray *dedup)
+{
+ struct kmemleak_object *object;
+ unsigned long idx;
+ unsigned int dup;
+ bool coalesced;
+
+ xa_for_each(dedup, idx, object) {
+ dup = object->dup_count;
+ coalesced = dup > 1;
+
+ print_leak_locked(object, !coalesced);
+ if (coalesced)
+ pr_warn(" ... and %u more object(s) with the same backtrace\n",
+ dup - 1);
+ put_object(object);
+ xa_erase(dedup, idx);
+ }
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1694,6 +1802,7 @@ static void kmemleak_scan(void)
struct kmemleak_object *object;
struct zone *zone;
int __maybe_unused i;
+ struct xarray dedup;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1834,10 +1943,18 @@ static void kmemleak_scan(void)
return;
/*
- * Scanning result reporting.
+ * Scanning result reporting. When verbose printing is enabled, dedupe
+ * by stackdepot trace_handle so each unique backtrace is logged once
+ * per scan, annotated with the number of objects that share it. The
+ * per-leak count below still reflects every object, and
+ * /sys/kernel/debug/kmemleak still lists them individually.
*/
+ xa_init(&dedup);
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
+ depot_stack_handle_t trace_handle;
+ bool dedup_print;
+
if (need_resched())
kmemleak_cond_resched(object);
@@ -1849,18 +1966,33 @@ static void kmemleak_scan(void)
if (!color_white(object))
continue;
raw_spin_lock_irq(&object->lock);
+ trace_handle = 0;
+ dedup_print = false;
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
-
- if (kmemleak_verbose)
- print_unreferenced(NULL, object);
-
+ if (kmemleak_verbose) {
+ trace_handle = object->trace_handle;
+ dedup_print = true;
+ }
new_leaks++;
}
raw_spin_unlock_irq(&object->lock);
+
+ /*
+ * Defer the verbose print outside object->lock: xa_store()
+ * may take xa_node slab locks at a higher wait-context level
+ * which lockdep would flag against the raw_spinlock_t
+ * object->lock. rcu_read_lock() keeps the kmemleak_object
+ * alive across the call.
+ */
+ if (dedup_print)
+ dedup_record(&dedup, object, trace_handle);
}
rcu_read_unlock();
+ /* Print each recorded backtrace once and drop the object references. */
+ dedup_flush(&dedup);
+ xa_destroy(&dedup);
if (new_leaks) {
kmemleak_found_leaks = true;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9bf7f524796b..36662d02ff96 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -15,6 +15,28 @@
#include "slab.h"
#include "internal.h"
+static inline void lock_list_lru(struct list_lru_one *l, bool irq,
+ unsigned long *irq_flags)
+{
+ if (irq_flags)
+ spin_lock_irqsave(&l->lock, *irq_flags);
+ else if (irq)
+ spin_lock_irq(&l->lock);
+ else
+ spin_lock(&l->lock);
+}
+
+static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off,
+ unsigned long *irq_flags)
+{
+ if (irq_flags)
+ spin_unlock_irqrestore(&l->lock, *irq_flags);
+ else if (irq_off)
+ spin_unlock_irq(&l->lock);
+ else
+ spin_unlock(&l->lock);
+}
+
#ifdef CONFIG_MEMCG
static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -60,34 +82,23 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
return &lru->node[nid].lru;
}
-static inline bool lock_list_lru(struct list_lru_one *l, bool irq)
-{
- if (irq)
- spin_lock_irq(&l->lock);
- else
- spin_lock(&l->lock);
- if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) {
- if (irq)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
- return false;
- }
- return true;
-}
-
static inline struct list_lru_one *
-lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
- bool irq, bool skip_empty)
+lock_list_lru_of_memcg(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg, bool irq,
+ unsigned long *irq_flags, bool skip_empty)
{
struct list_lru_one *l;
rcu_read_lock();
again:
- l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
- if (likely(l) && lock_list_lru(l, irq)) {
- rcu_read_unlock();
- return l;
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg));
+ if (likely(l)) {
+ lock_list_lru(l, irq, irq_flags);
+ if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) {
+ rcu_read_unlock();
+ return l;
+ }
+ unlock_list_lru(l, irq, irq_flags);
}
/*
* Caller may simply bail out if raced with reparenting or
@@ -97,18 +108,10 @@ again:
rcu_read_unlock();
return NULL;
}
- VM_WARN_ON(!css_is_dying(&memcg->css));
- memcg = parent_mem_cgroup(memcg);
+ VM_WARN_ON(!css_is_dying(&(*memcg)->css));
+ *memcg = parent_mem_cgroup(*memcg);
goto again;
}
-
-static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
-{
- if (irq_off)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
-}
#else
static void list_lru_register(struct list_lru *lru)
{
@@ -135,52 +138,112 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
}
static inline struct list_lru_one *
-lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
- bool irq, bool skip_empty)
+lock_list_lru_of_memcg(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg, bool irq,
+ unsigned long *irq_flags, bool skip_empty)
{
struct list_lru_one *l = &lru->node[nid].lru;
- if (irq)
- spin_lock_irq(&l->lock);
- else
- spin_lock(&l->lock);
+ lock_list_lru(l, irq, irq_flags);
return l;
}
+#endif /* CONFIG_MEMCG */
-static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
+struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg)
{
- if (irq_off)
- spin_unlock_irq(&l->lock);
- else
- spin_unlock(&l->lock);
+ return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/false,
+ /*irq_flags=*/NULL, /*skip_empty=*/false);
}
-#endif /* CONFIG_MEMCG */
-/* The caller must ensure the memcg lifetime. */
-bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
+void list_lru_unlock(struct list_lru_one *l)
{
- struct list_lru_node *nlru = &lru->node[nid];
- struct list_lru_one *l;
+ unlock_list_lru(l, /*irq_off=*/false, /*irq_flags=*/NULL);
+}
- l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
- if (!l)
- return false;
+struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg)
+{
+ return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true,
+ /*irq_flags=*/NULL, /*skip_empty=*/false);
+}
+
+void list_lru_unlock_irq(struct list_lru_one *l)
+{
+ unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/NULL);
+}
+
+struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid,
+ struct mem_cgroup **memcg,
+ unsigned long *flags)
+{
+ return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true,
+ /*irq_flags=*/flags, /*skip_empty=*/false);
+}
+
+void list_lru_unlock_irqrestore(struct list_lru_one *l, unsigned long *flags)
+{
+ unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/flags);
+}
+
+bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l,
+ struct list_head *item, int nid,
+ struct mem_cgroup *memcg)
+{
if (list_empty(item)) {
list_add_tail(item, &l->list);
- /* Set shrinker bit if the first element was added */
+ /*
+ * Set shrinker bit on the memcg that owns the locked
+ * sublist - lock_list_lru_of_memcg() may have walked up
+ * past a dying memcg, and the bit must be set there.
+ */
if (!l->nr_items++)
set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- unlock_list_lru(l, false);
- atomic_long_inc(&nlru->nr_items);
+ atomic_long_inc(&lru->node[nid].nr_items);
return true;
}
- unlock_list_lru(l, false);
return false;
}
EXPORT_SYMBOL_GPL(list_lru_add);
+bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l,
+ struct list_head *item, int nid)
+{
+ if (!list_empty(item)) {
+ list_del_init(item);
+ l->nr_items--;
+ atomic_long_dec(&lru->node[nid].nr_items);
+ return true;
+ }
+ return false;
+}
+
+/* The caller must ensure the memcg lifetime. */
+bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
+ struct mem_cgroup *memcg)
+{
+ struct list_lru_one *l;
+ bool ret;
+
+ l = list_lru_lock(lru, nid, &memcg);
+ ret = __list_lru_add(lru, l, item, nid, memcg);
+ list_lru_unlock(l);
+ return ret;
+}
+
+bool list_lru_add_irq(struct list_lru *lru, struct list_head *item,
+ int nid, struct mem_cgroup *memcg)
+{
+ struct list_lru_one *l;
+ bool ret;
+
+ l = list_lru_lock_irq(lru, nid, &memcg);
+ ret = __list_lru_add(lru, l, item, nid, memcg);
+ list_lru_unlock_irq(l);
+ return ret;
+}
+
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
bool ret;
@@ -202,20 +265,13 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj);
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
struct mem_cgroup *memcg)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
- l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
- if (!l)
- return false;
- if (!list_empty(item)) {
- list_del_init(item);
- l->nr_items--;
- unlock_list_lru(l, false);
- atomic_long_dec(&nlru->nr_items);
- return true;
- }
- unlock_list_lru(l, false);
- return false;
+ bool ret;
+
+ l = list_lru_lock(lru, nid, &memcg);
+ ret = __list_lru_del(lru, l, item, nid);
+ list_lru_unlock(l);
+ return ret;
}
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
@@ -288,7 +344,8 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
unsigned long isolated = 0;
restart:
- l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true);
+ l = lock_list_lru_of_memcg(lru, nid, &memcg, /*irq=*/irq_off,
+ /*irq_flags=*/NULL, /*skip_empty=*/true);
if (!l)
return isolated;
list_for_each_safe(item, n, &l->list) {
@@ -329,7 +386,7 @@ restart:
BUG();
}
}
- unlock_list_lru(l, irq_off);
+ unlock_list_lru(l, irq_off, NULL);
out:
return isolated;
}
@@ -514,17 +571,14 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
return idx < 0 || xa_load(&lru->xa, idx);
}
-int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
- gfp_t gfp)
+static int __memcg_list_lru_alloc(struct mem_cgroup *memcg,
+ struct list_lru *lru, gfp_t gfp)
{
unsigned long flags;
struct list_lru_memcg *mlru = NULL;
struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
- if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
- return 0;
-
gfp &= GFP_RECLAIM_MASK;
/*
* Because the list_lru can be reparented to the parent cgroup's
@@ -565,6 +619,38 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
return xas_error(&xas);
}
+
+int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
+ gfp_t gfp)
+{
+ if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
+ return 0;
+ return __memcg_list_lru_alloc(memcg, lru, gfp);
+}
+
+int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru,
+ gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ int res;
+
+ if (!list_lru_memcg_aware(lru))
+ return 0;
+
+ /* Fast path when list_lru heads already exist */
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ res = memcg_list_lru_allocated(memcg, lru);
+ rcu_read_unlock();
+ if (likely(res))
+ return 0;
+
+ /* Allocation may block, pin the memcg */
+ memcg = get_mem_cgroup_from_folio(folio);
+ res = __memcg_list_lru_alloc(memcg, lru, gfp);
+ mem_cgroup_put(memcg);
+ return res;
+}
#else
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..cd9bb077072c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
tlb_finish_mmu(madv_behavior->tlb);
}
-static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+/**
+ * check_input_range() - Check if the requested range is valid.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ *
+ * Return: 0 if the input range is valid, otherwise -EINVAL.
+ */
+static int check_input_range(unsigned long start, size_t len_in)
{
size_t len;
- if (!madvise_behavior_valid(behavior))
- return false;
-
if (!PAGE_ALIGNED(start))
- return false;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return false;
+ return -EINVAL;
if (start + len < start)
- return false;
-
- return true;
-}
+ return -EINVAL;
-/*
- * madvise_should_skip() - Return if the request is invalid or nothing.
- * @start: Start address of madvise-requested address range.
- * @len_in: Length of madvise-requested address range.
- * @behavior: Requested madvise behavior.
- * @err: Pointer to store an error code from the check.
- *
- * If the specified behaviour is invalid or nothing would occur, we skip the
- * operation. This function returns true in the cases, otherwise false. In
- * the former case we store an error on @err.
- */
-static bool madvise_should_skip(unsigned long start, size_t len_in,
- int behavior, int *err)
-{
- if (!is_valid_madvise(start, len_in, behavior)) {
- *err = -EINVAL;
- return true;
- }
- if (start + PAGE_ALIGN(len_in) == start) {
- *err = 0;
- return true;
- }
- return false;
+ return 0;
}
static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
@@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
.tlb = &tlb,
};
- if (madvise_should_skip(start, len_in, behavior, &error))
+ if (!madvise_behavior_valid(behavior))
+ return -EINVAL;
+
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
return error;
+
error = madvise_lock(&madv_behavior);
if (error)
return error;
@@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
size_t len_in = iter_iov_len(iter);
int error;
- if (madvise_should_skip(start, len_in, behavior, &error))
+ error = check_input_range(start, len_in);
+ if (error || !len_in)
ret = error;
else
ret = madvise_do_behavior(start, len_in, &madv_behavior);
@@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
+ if (!madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
/*
* We need only perform this check if we are attempting to manipulate a
* remote process's address space.
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 433bba9dfe71..765069211567 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -5,7 +5,6 @@
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
-#include <linux/swap_cgroup.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
@@ -14,6 +13,7 @@
#include "internal.h"
#include "swap.h"
+#include "swap_table.h"
#include "memcontrol-v1.h"
/*
@@ -603,19 +603,26 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
local_irq_restore(flags);
}
+#ifdef CONFIG_SWAP
/**
- * memcg1_swapout - transfer a memsw charge to swap
+ * __memcg1_swapout - transfer a memsw charge to swap
* @folio: folio whose memsw charge to transfer
- * @entry: swap entry to move the charge to
+ * @ci: the locked swap cluster holding the swap entries
+ *
+ * Transfer the memsw charge of @folio to the swap entry stored in
+ * folio->swap.
*
- * Transfer the memsw charge of @folio to @entry.
+ * Context: folio must be isolated, unmapped, locked and is just about to
+ * be freed, and caller must disable IRQs and hold the swap cluster lock.
*/
-void memcg1_swapout(struct folio *folio, swp_entry_t entry)
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
{
struct mem_cgroup *memcg, *swap_memcg;
struct obj_cgroup *objcg;
unsigned int nr_entries;
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
@@ -641,7 +648,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry);
+ __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
+ mem_cgroup_private_id(swap_memcg));
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -656,8 +664,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
}
/*
- * Interrupts should be disabled here because the caller holds the
- * i_pages lock which is taken with interrupts-off. It is
+ * The caller must hold the swap cluster lock with IRQ off. It is
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
@@ -671,18 +678,24 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
obj_cgroup_put(objcg);
}
-/*
- * memcg1_swapin - uncharge swap slot
- * @entry: the first swap entry for which the pages are charged
- * @nr_pages: number of pages which will be uncharged
+/**
+ * memcg1_swapin - uncharge swap slot on swapin
+ * @folio: folio being swapped in
*
- * Call this function after successfully adding the charged page to swapcache.
+ * Call this function after successfully adding the charged
+ * folio to swapcache.
*
- * Note: This function assumes the page for which swap slot is being uncharged
- * is order 0 page.
+ * Context: The folio has to be in swap cache and locked.
*/
-void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+void memcg1_swapin(struct folio *folio)
{
+ struct swap_cluster_info *ci;
+ unsigned long nr_pages;
+ unsigned short id;
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+
/*
* Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap
@@ -695,15 +708,22 @@ void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(entry, nr_pages);
- }
+ if (!do_memsw_account())
+ return;
+
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ nr_pages = folio_nr_pages(folio);
+ ci = swap_cluster_get_and_lock(folio);
+ id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
+ nr_pages);
+ swap_cluster_unlock(ci);
+ mem_cgroup_uncharge_swap(id, nr_pages);
}
+#endif
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1a4fd2504bcd..56cd4af08232 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,7 +54,6 @@
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
-#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
@@ -64,6 +63,7 @@
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
#include "internal.h"
+#include "swap_table.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
@@ -150,15 +150,15 @@ static void obj_cgroup_release(struct percpu_ref *ref)
* However, it can be PAGE_SIZE or (x * PAGE_SIZE).
*
* The following sequence can lead to it:
- * 1) CPU0: objcg == stock->cached_objcg
+ * 1) CPU0: objcg cached in one of stock->cached[i]
* 2) CPU1: we do a small allocation (e.g. 92 bytes),
* PAGE_SIZE bytes are charged
* 3) CPU1: a process from another memcg is allocating something,
* the stock if flushed,
* objcg->nr_charged_bytes = PAGE_SIZE - 92
- * 5) CPU0: we do release this object,
- * 92 bytes are added to stock->nr_bytes
- * 6) CPU0: stock is flushed,
+ * 4) CPU0: we do release this object,
+ * 92 bytes are added to stock->nr_bytes[i]
+ * 5) CPU0: stock is flushed,
* 92 bytes are added to objcg->nr_charged_bytes
*
* In the result, nr_charged_bytes == PAGE_SIZE.
@@ -2018,24 +2018,49 @@ static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
.lock = INIT_LOCAL_TRYLOCK(lock),
};
+/*
+ * NR_OBJ_STOCK is sized so the entire hot path of obj_stock_pcp
+ * (lock, accounting metadata, nr_bytes[] and cached[]) fits within a
+ * single 64-byte cache line on non-debug 64-bit builds. With 5 slots:
+ * lock(1) + index(1) + node_id(2) + slab stats(4) + nr_bytes(10)
+ * + pad(6) + cached(40) == 64 bytes.
+ * A CPU can thus consume/refill/account against five different objcgs
+ * (typically per-node variants of the same memcg) while incurring at
+ * most one cache miss on the stock.
+ */
+#define NR_OBJ_STOCK 5
struct obj_stock_pcp {
local_trylock_t lock;
- unsigned int nr_bytes;
- struct obj_cgroup *cached_objcg;
- struct pglist_data *cached_pgdat;
- int nr_slab_reclaimable_b;
- int nr_slab_unreclaimable_b;
+ int8_t index;
+ int16_t node_id;
+ int16_t nr_slab_reclaimable_b;
+ int16_t nr_slab_unreclaimable_b;
+#if PAGE_SHIFT > 16
+ /*
+ * On rare archs with 256KiB base page size (hexagon and powerpc 44x)
+ * keep nr_bytes to unsigned int as uint16_t cannot represent the full
+ * sub-page remainder. Such archs are not a cacheline optimization target.
+ */
+ unsigned int nr_bytes[NR_OBJ_STOCK];
+#else
+ uint16_t nr_bytes[NR_OBJ_STOCK];
+#endif
+ struct obj_cgroup *cached[NR_OBJ_STOCK];
struct work_struct work;
unsigned long flags;
+ uint8_t drain_idx;
};
static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
.lock = INIT_LOCAL_TRYLOCK(lock),
+ .index = -1,
+ .node_id = NUMA_NO_NODE,
};
static DEFINE_MUTEX(percpu_charge_mutex);
+static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i);
static void drain_obj_stock(struct obj_stock_pcp *stock);
static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
@@ -3155,54 +3180,73 @@ static void unlock_stock(struct obj_stock_pcp *stock)
local_unlock(&obj_stock.lock);
}
-/* Call after __refill_obj_stock() to ensure stock->cached_objg == objcg */
+/* Call after __refill_obj_stock() so a slot for objcg exists in the stock */
static void __account_obj_stock(struct obj_cgroup *objcg,
struct obj_stock_pcp *stock, int nr,
struct pglist_data *pgdat, enum node_stat_item idx)
{
- int *bytes;
+ int16_t *bytes;
+ int i;
- if (!stock || READ_ONCE(stock->cached_objcg) != objcg)
+ /*
+ * MAX_NUMNODES is currently <= 1024 on all archs, but make sure it stays
+ * below S16_MAX; otherwise the node_id type in struct obj_stock_pcp
+ * needs to be widened.
+ */
+ BUILD_BUG_ON(MAX_NUMNODES >= S16_MAX);
+
+ if (!stock)
+ goto direct;
+
+ for (i = 0; i < NR_OBJ_STOCK; ++i) {
+ if (READ_ONCE(stock->cached[i]) == objcg)
+ break;
+ }
+ if (i == NR_OBJ_STOCK)
goto direct;
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat changes.
+ * accumulating over a page of vmstat data or when the objcg slot or
+ * pgdat the stats belong to changes.
*/
- if (stock->cached_pgdat != pgdat) {
- /* Flush the existing cached vmstat data */
- struct pglist_data *oldpg = stock->cached_pgdat;
+ if (stock->index < 0) {
+ stock->index = i;
+ stock->node_id = pgdat->node_id;
+ } else if (stock->index != i || stock->node_id != pgdat->node_id) {
+ struct obj_cgroup *old = READ_ONCE(stock->cached[stock->index]);
+ struct pglist_data *oldpg = NODE_DATA(stock->node_id);
if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
- stock->cached_pgdat = pgdat;
+ stock->index = i;
+ stock->node_id = pgdat->node_id;
}
bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
: &stock->nr_slab_unreclaimable_b;
+
/*
- * Even for large object >= PAGE_SIZE, the vmstat data will still be
- * cached locally at least once before pushing it out.
+ * Fold @nr into the cached value and decide whether to keep it cached
+ * or flush it directly. Cache the combined value when it fits in the
+ * int16_t storage and either the cache was empty (so even a value
+ * above PAGE_SIZE gets a chance to be canceled by a paired delta) or
+ * the combined value is within the PAGE_SIZE flush threshold.
*/
- if (!*bytes) {
+ nr += *bytes;
+ if (abs(nr) <= S16_MAX && (!*bytes || abs(nr) <= PAGE_SIZE)) {
*bytes = nr;
nr = 0;
} else {
- *bytes += nr;
- if (abs(*bytes) > PAGE_SIZE) {
- nr = *bytes;
- *bytes = 0;
- } else {
- nr = 0;
- }
+ *bytes = 0;
}
direct:
if (nr)
@@ -3213,10 +3257,16 @@ static bool __consume_obj_stock(struct obj_cgroup *objcg,
struct obj_stock_pcp *stock,
unsigned int nr_bytes)
{
- if (objcg == READ_ONCE(stock->cached_objcg) &&
- stock->nr_bytes >= nr_bytes) {
- stock->nr_bytes -= nr_bytes;
- return true;
+ int i;
+
+ for (i = 0; i < NR_OBJ_STOCK; ++i) {
+ if (READ_ONCE(stock->cached[i]) != objcg)
+ continue;
+ if (stock->nr_bytes[i] >= nr_bytes) {
+ stock->nr_bytes[i] -= nr_bytes;
+ return true;
+ }
+ return false;
}
return false;
@@ -3237,16 +3287,42 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
return ret;
}
-static void drain_obj_stock(struct obj_stock_pcp *stock)
+/* Flush the cached slab stats (if any) back to their owning objcg/pgdat. */
+static void drain_obj_stock_stats(struct obj_stock_pcp *stock)
{
- struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
+ struct obj_cgroup *old;
+ struct pglist_data *oldpg;
+
+ if (stock->index < 0)
+ return;
+
+ old = READ_ONCE(stock->cached[stock->index]);
+ oldpg = NODE_DATA(stock->node_id);
+
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->index = -1;
+ stock->node_id = NUMA_NO_NODE;
+}
+
+static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i)
+{
+ struct obj_cgroup *old = READ_ONCE(stock->cached[i]);
if (!old)
return;
- if (stock->nr_bytes) {
- unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
- unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+ if (stock->nr_bytes[i]) {
+ unsigned int nr_pages = stock->nr_bytes[i] >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes[i] & (PAGE_SIZE - 1);
if (nr_pages) {
struct mem_cgroup *memcg;
@@ -3272,44 +3348,43 @@ static void drain_obj_stock(struct obj_stock_pcp *stock)
* so it might be changed in the future.
*/
atomic_add(nr_bytes, &old->nr_charged_bytes);
- stock->nr_bytes = 0;
+ stock->nr_bytes[i] = 0;
}
- /*
- * Flush the vmstat data in current stock
- */
- if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
- if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
- NR_SLAB_RECLAIMABLE_B,
- stock->nr_slab_reclaimable_b);
- stock->nr_slab_reclaimable_b = 0;
- }
- if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
- NR_SLAB_UNRECLAIMABLE_B,
- stock->nr_slab_unreclaimable_b);
- stock->nr_slab_unreclaimable_b = 0;
- }
- stock->cached_pgdat = NULL;
- }
+ /* Flush vmstat data when its owning slot is being drained. */
+ if (stock->index == i)
+ drain_obj_stock_stats(stock);
- WRITE_ONCE(stock->cached_objcg, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
obj_cgroup_put(old);
}
+static void drain_obj_stock(struct obj_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_OBJ_STOCK; ++i)
+ drain_obj_stock_slot(stock, i);
+}
+
static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
- struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
+ struct obj_cgroup *objcg;
struct mem_cgroup *memcg;
bool flush = false;
+ int i;
rcu_read_lock();
- if (objcg) {
+ for (i = 0; i < NR_OBJ_STOCK; ++i) {
+ objcg = READ_ONCE(stock->cached[i]);
+ if (!objcg)
+ continue;
memcg = obj_cgroup_memcg(objcg);
- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) {
flush = true;
+ break;
+ }
}
rcu_read_unlock();
@@ -3322,6 +3397,8 @@ static void __refill_obj_stock(struct obj_cgroup *objcg,
bool allow_uncharge)
{
unsigned int nr_pages = 0;
+ unsigned int stock_nr_bytes;
+ int i, slot = -1, empty_slot = -1;
if (!stock) {
nr_pages = nr_bytes >> PAGE_SHIFT;
@@ -3330,21 +3407,52 @@ static void __refill_obj_stock(struct obj_cgroup *objcg,
goto out;
}
- if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- drain_obj_stock(stock);
+ for (i = 0; i < NR_OBJ_STOCK; ++i) {
+ struct obj_cgroup *cached = READ_ONCE(stock->cached[i]);
+
+ if (!cached) {
+ if (empty_slot == -1)
+ empty_slot = i;
+ continue;
+ }
+ if (cached == objcg) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (slot == -1) {
+ slot = empty_slot;
+ if (slot == -1) {
+ slot = stock->drain_idx++;
+ if (stock->drain_idx == NR_OBJ_STOCK)
+ stock->drain_idx = 0;
+ drain_obj_stock_slot(stock, slot);
+ }
obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ /*
+ * Keep the xchg result in the unsigned int local; storing
+ * it directly into stock->nr_bytes[slot] (uint16_t) would
+ * silently truncate values > U16_MAX and bypass the flush
+ * guard below, leaking page-counter charges.
+ */
+ stock_nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
+ WRITE_ONCE(stock->cached[slot], objcg);
allow_uncharge = true; /* Allow uncharge when objcg changes */
+ } else {
+ stock_nr_bytes = stock->nr_bytes[slot];
}
- stock->nr_bytes += nr_bytes;
- if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
- nr_pages = stock->nr_bytes >> PAGE_SHIFT;
- stock->nr_bytes &= (PAGE_SIZE - 1);
+ stock_nr_bytes += nr_bytes;
+
+ if ((allow_uncharge && (stock_nr_bytes > PAGE_SIZE)) ||
+ stock_nr_bytes > U16_MAX) {
+ nr_pages = stock_nr_bytes >> PAGE_SHIFT;
+ stock_nr_bytes &= (PAGE_SIZE - 1);
}
+ stock->nr_bytes[slot] = stock_nr_bytes;
out:
if (nr_pages)
@@ -4005,11 +4113,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg1_alloc_events(memcg))
goto fail;
+ pstatc_pcpu = parent ? parent->vmstats_percpu : NULL;
for_each_possible_cpu(cpu) {
- if (parent)
- pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
+ statc->parent_pcpu = pstatc_pcpu;
statc->vmstats = memcg->vmstats;
}
@@ -4037,11 +4144,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
- INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
- memcg->deferred_split_queue.split_queue_len = 0;
-#endif
lru_gen_init_memcg(memcg);
return memcg;
fail:
@@ -4192,11 +4294,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
zswap_memcg_offline_cleanup(memcg);
memcg_offline_kmem(memcg);
- reparent_deferred_split_queue(memcg);
/*
- * The reparenting of objcg must be after the reparenting of the
- * list_lru and deferred_split_queue above, which ensures that they will
- * not mistakenly get the parent list_lru and deferred_split_queue.
+ * The reparenting of objcg must be after the reparenting of
+ * the list_lru in memcg_offline_kmem(), which ensures that
+ * they will not mistakenly get the parent list_lru.
*/
memcg_reparent_objcgs(memcg);
reparent_shrinker_deferred(memcg);
@@ -5080,27 +5181,25 @@ out:
/**
* mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
- * @folio: folio to charge.
+ * @folio: the folio to charge
+ * @id: memory cgroup id
* @mm: mm context of the victim
* @gfp: reclaim mode
- * @entry: swap entry for which the folio is allocated
*
* This function charges a folio allocated for swapin. Please call this before
* adding the folio to the swapcache.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
- gfp_t gfp, swp_entry_t entry)
+int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id,
+ struct mm_struct *mm, gfp_t gfp)
{
struct mem_cgroup *memcg;
- unsigned short id;
int ret;
if (mem_cgroup_disabled())
return 0;
- id = lookup_swap_cgroup_id(entry);
rcu_read_lock();
memcg = mem_cgroup_from_private_id(id);
if (!memcg || !css_tryget_online(&memcg->css))
@@ -5474,15 +5573,15 @@ int __init mem_cgroup_init(void)
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
- * @entry: swap entry to charge
*
- * Try to charge @folio's memcg for the swap space at @entry.
+ * Try to charge @folio's memcg for the swap space at folio->swap.
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct folio *folio)
{
unsigned int nr_pages = folio_nr_pages(folio);
+ struct swap_cluster_info *ci;
struct page_counter *counter;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
@@ -5497,7 +5596,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- if (!entry.val) {
+ if (!folio_test_swapcache(folio)) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
rcu_read_unlock();
return 0;
@@ -5516,22 +5615,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
}
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry);
+ ci = swap_cluster_get_and_lock(folio);
+ __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+ mem_cgroup_private_id(memcg));
+ swap_cluster_unlock(ci);
return 0;
}
/**
* __mem_cgroup_uncharge_swap - uncharge swap space
- * @entry: swap entry to uncharge
+ * @id: cgroup id to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
- unsigned short id;
- id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_private_id(id);
if (memcg) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d47aef256a32..51508a55c405 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -172,23 +172,11 @@ static int __page_handle_poison(struct page *page)
{
int ret;
- /*
- * zone_pcp_disable() can't be used here. It will
- * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
- * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
- * optimization is enabled. This will break current lock dependency
- * chain and leads to deadlock.
- * Disabling pcp before dissolving the page was a deterministic
- * approach because we made sure that those pages cannot end up in any
- * PCP list. Draining PCP lists expels those pages to the buddy system,
- * but nothing guarantees that those pages do not get back to a PCP
- * queue if we need to refill those.
- */
+ zone_pcp_disable(page_zone(page));
ret = dissolve_free_hugetlb_folio(page_folio(page));
- if (!ret) {
- drain_all_pages(page_zone(page));
+ if (!ret)
ret = take_page_off_buddy(page);
- }
+ zone_pcp_enable(page_zone(page));
return ret;
}
@@ -459,7 +447,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
* Only do anything when FORCEKILL is set, otherwise just free the
* list (this is used for clean pages which do not need killing)
*/
-static void kill_procs(struct list_head *to_kill, int forcekill,
+static void kill_procs(struct list_head *to_kill, bool forcekill,
unsigned long pfn, int flags)
{
struct to_kill *tk, *next;
@@ -1418,7 +1406,7 @@ try_again:
* We raced with (possibly temporary) unhandlable
* page, retry.
*/
- if (pass++ < 3) {
+ if (pass++ < GET_PAGE_MAX_RETRY_NUM) {
shake_page(p);
goto try_again;
}
@@ -1582,7 +1570,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
{
LIST_HEAD(tokill);
bool unmap_success;
- int forcekill;
+ bool forcekill;
bool mlocked = folio_test_mlocked(folio);
/*
@@ -1703,7 +1691,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
unmap_mapping_range(mapping, start, size, 0);
}
- kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags);
+ kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags);
}
/*
@@ -2027,13 +2015,14 @@ out_unlock:
* So some of prechecks for hwpoison (pinning, and testing/setting
* PageHWPoison) should be done in single hugetlb_lock range.
* Returns:
- * 0 - not hugetlb, or recovered
+ * 0 - recovered
+ * -ENOENT - not a hugetlb page
* -EBUSY - not recovered
* -EOPNOTSUPP - hwpoison_filter'ed
* -EHWPOISON - folio or exact page already poisoned
* -EFAULT - kill_accessing_process finds current->mm null
*/
-static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags)
{
int res, rv;
struct page *p = pfn_to_page(pfn);
@@ -2041,13 +2030,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
unsigned long page_flags;
bool migratable_cleared = false;
- *hugetlb = 1;
retry:
res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
switch (res) {
case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */
- *hugetlb = 0;
- return 0;
+ return -ENOENT;
case MF_HUGETLB_RETRY:
if (!(flags & MF_NO_RETRY)) {
flags |= MF_NO_RETRY;
@@ -2108,9 +2095,9 @@ retry:
}
#else
-static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags)
{
- return 0;
+ return -ENOENT;
}
static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
@@ -2348,7 +2335,6 @@ int memory_failure(unsigned long pfn, int flags)
int res = 0;
unsigned long page_flags;
bool retry = true;
- int hugetlb = 0;
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -2387,8 +2373,11 @@ int memory_failure(unsigned long pfn, int flags)
}
try_again:
- res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
- if (hugetlb)
+ res = try_memory_failure_hugetlb(pfn, flags);
+ /*
+ * -ENOENT means the page we found is not hugetlb, so proceed with normal page handling
+ */
+ if (res != -ENOENT)
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index 86a973119bd4..56be920c56d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
* Handle the case of a page which we actually need to copy to a new page,
* either due to COW or unsharing.
*
- * Called with mmap_lock locked and the old page referenced, but
- * without the ptl held.
+ * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK)
+ * and the old page referenced, but without the ptl held.
*
* High level logic flow:
*
@@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
* though the page will change only once the write actually happens. This
* avoids a few races, and potentially makes it more efficient.
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. We return with
+ * the same lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
@@ -4609,35 +4609,13 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
-static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio;
- softleaf_t entry;
-
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
- if (!folio)
- return NULL;
-
- entry = softleaf_from_pte(vmf->orig_pte);
- if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- GFP_KERNEL, entry)) {
- folio_put(folio);
- return NULL;
- }
-
- return folio;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * Check if the PTEs within a range are contiguous swap entries
- * and have consistent swapcache, zeromap.
+ * Check if the PTEs within a range are contiguous swap entries.
*/
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
unsigned long addr;
- softleaf_t entry;
int idx;
pte_t pte;
@@ -4647,20 +4625,13 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
return false;
- entry = softleaf_from_pte(pte);
- if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
- return false;
-
/*
* swap_read_folio() can't handle the case a large folio is hybridly
* from different backends. And they are likely corner cases. Similar
* things might be added once zswap support large folios.
*/
- if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
- return false;
- if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
+ if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
return false;
-
return true;
}
@@ -4687,16 +4658,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
return orders;
}
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long orders;
- struct folio *folio;
unsigned long addr;
softleaf_t entry;
spinlock_t *ptl;
pte_t *pte;
- gfp_t gfp;
int order;
/*
@@ -4704,7 +4673,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* maintain the uffd semantics.
*/
if (unlikely(userfaultfd_armed(vma)))
- goto fallback;
+ return 0;
/*
* A large swapped out folio could be partially or fully in zswap. We
@@ -4712,7 +4681,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* folio.
*/
if (!zswap_never_enabled())
- goto fallback;
+ return 0;
entry = softleaf_from_pte(vmf->orig_pte);
/*
@@ -4726,12 +4695,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
vmf->address, orders);
if (!orders)
- goto fallback;
+ return 0;
pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address & PMD_MASK, &ptl);
if (unlikely(!pte))
- goto fallback;
+ return 0;
/*
* For do_swap_page, find the highest order where the aligned range is
@@ -4747,29 +4716,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
pte_unmap_unlock(pte, ptl);
- /* Try allocating the highest of the remaining orders. */
- gfp = vma_thp_gfp_mask(vma);
- while (orders) {
- addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
- gfp, entry))
- return folio;
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
- folio_put(folio);
- }
- count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
- order = next_order(&orders, order);
- }
-
-fallback:
- return __alloc_swap_folio(vmf);
+ return orders;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
{
- return __alloc_swap_folio(vmf);
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -4785,12 +4737,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
- * We return with the mmap_lock locked or unlocked in the same cases
- * as does filemap_fault().
+ * When returning, the lock may have been released in the same cases
+ * as done by filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
@@ -4875,23 +4827,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (folio)
swap_update_readahead(folio, vma, vmf->address);
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = alloc_swap_folio(vmf);
- if (folio) {
- /*
- * folio is charged, so swapin can only fail due
- * to raced swapin and return NULL.
- */
- swapcache = swapin_folio(entry, folio);
- if (swapcache != folio)
- folio_put(folio);
- folio = swapcache;
- }
- } else {
+ /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+ folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+ thp_swapin_suitable_orders(vmf) | BIT(0),
+ vmf, NULL, 0);
+ else
folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
- }
- if (!folio) {
+ if (IS_ERR_OR_NULL(folio)) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
@@ -4901,6 +4845,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (likely(vmf->pte &&
pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
+ folio = NULL;
goto unlock;
}
@@ -5270,24 +5215,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr);
- if (folio) {
- if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
- count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- folio_put(folio);
- goto next;
- }
- folio_throttle_swaprate(folio, gfp);
- /*
- * When a folio is not zeroed during allocation
- * (__GFP_ZERO not used) or user folios require special
- * handling, folio_zero_user() is used to make sure
- * that the page corresponding to the faulting address
- * will be hot in the cache after zeroing.
- */
- if (user_alloc_needs_zeroing())
- folio_zero_user(folio, vmf->address);
- return folio;
+ if (!folio)
+ goto next;
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ folio_put(folio);
+ goto next;
}
+ if (order > 1 && folio_memcg_alloc_deferred(folio)) {
+ folio_put(folio);
+ goto fallback;
+ }
+ folio_throttle_swaprate(folio, gfp);
+ /*
+ * When a folio is not zeroed during allocation
+ * (__GFP_ZERO not used) or user folios require special
+ * handling, folio_zero_user() is used to make sure
+ * that the page corresponding to the faulting address
+ * will be hot in the cache after zeroing.
+ */
+ if (user_alloc_needs_zeroing())
+ folio_zero_user(folio, vmf->address);
+ return folio;
next:
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
order = next_order(&orders, order);
@@ -5330,9 +5279,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte,
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_lock still held, but pte unmapped and unlocked.
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked.
+ * We return with the lock still held, but pte unmapped and unlocked.
+ * If VM_FAULT_RETRY is returned, the lock may have been released.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
@@ -5440,9 +5390,10 @@ oom:
}
/*
- * The mmap_lock must have been held on entry, and may have been
- * released depending on flags and vma->vm_ops->fault() return value.
- * See filemap_fault() and __lock_page_retry().
+ * Either the VMA lock or the mmap_lock must have been held on entry
+ * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on
+ * flags and vma->vm_ops->fault() return value.
+ * See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
@@ -5451,18 +5402,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
vm_fault_t ret;
/*
- * Preallocate pte before we take page_lock because this might lead to
- * deadlocks for memcg reclaim which waits for pages under writeback:
- * lock_page(A)
- * SetPageWriteback(A)
- * unlock_page(A)
- * lock_page(B)
- * lock_page(B)
+ * Preallocate pte before we take folio lock because this might lead to
+ * deadlocks for memcg reclaim which waits for folios under writeback:
+ * folio_lock(A)
+ * folio_set_writeback(A)
+ * folio_unlock(A)
+ * folio_lock(B)
+ * folio_lock(B)
* pte_alloc_one
* shrink_folio_list
- * wait_on_page_writeback(A)
- * SetPageWriteback(B)
- * unlock_page(B)
+ * folio_wait_writeback(A)
+ * folio_set_writeback(B)
+ * folio_unlock(B)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
@@ -5480,7 +5431,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
if (unlikely(PageHWPoison(vmf->page))) {
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
- if (page_mapped(vmf->page))
+ if (folio_mapped(folio))
unmap_mapping_folio(folio);
/* Retry if a clean folio was removed from the cache. */
if (mapping_evict_folio(folio->mapping, folio))
@@ -6003,11 +5954,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
/*
- * We enter with non-exclusive mmap_lock (to exclude vma changes,
- * but allow concurrent faults).
- * The mmap_lock may have been released depending on flags and our
+ * We enter with either the VMA lock or the mmap_lock held (see
+ * FAULT_FLAG_VMA_LOCK).
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
- * If mmap_lock is released, vma may become invalid (for example
+ * If the lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
@@ -6374,10 +6325,11 @@ static void fix_spurious_fault(struct vm_fault *vmf,
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
- * concurrent faults).
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our return value.
+ * The mmap_lock or VMA lock may have been released depending on flags
+ * and our return value.
* See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
@@ -6458,8 +6410,8 @@ unlock:
/*
* On entry, we hold either the VMA lock or the mmap_lock
- * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
- * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in
+ * the result, the lock is not held on exit. See filemap_fault()
* and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
@@ -6691,9 +6643,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
/*
* By the time we get here, we already hold either the VMA lock or the
- * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
+ * mmap_lock (see FAULT_FLAG_VMA_LOCK).
*
- * The mmap_lock may have been released depending on flags and our
+ * The lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 40c7915dabe0..7ac19fab2263 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
+ * @pgmap: device page map or %NULL if not ZONE_DEVICE
*
* Generic helper function to remove section mappings and sysfs entries
* for the section of the memory we are removing. Caller needs to make
@@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
* calling offline_pages().
*/
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
@@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- sparse_remove_section(pfn, cur_nr_pages, altmap);
+ sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap);
}
}
@@ -1402,6 +1403,12 @@ bool mhp_supports_memmap_on_memory(void)
}
EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
+static void altmap_free(struct vmem_altmap *altmap)
+{
+ WARN_ONCE(altmap->alloc, "Altmap not fully unmapped");
+ kfree(altmap);
+}
+
static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
@@ -1416,22 +1423,17 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
struct vmem_altmap *altmap = NULL;
struct memory_block *mem;
- mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start)));
+ mem = memory_block_get(phys_to_block_id(cur_start));
if (WARN_ON_ONCE(!mem))
continue;
altmap = mem->altmap;
mem->altmap = NULL;
- /* drop the ref. we got via find_memory_block() */
- put_device(&mem->dev);
+ memory_block_put(mem);
remove_memory_block_devices(cur_start, memblock_size);
-
- arch_remove_memory(cur_start, memblock_size, altmap);
-
- /* Verify that all vmemmap pages have actually been freed. */
- WARN(altmap->alloc, "Altmap not fully unmapped");
- kfree(altmap);
+ arch_remove_memory(cur_start, memblock_size, altmap, NULL);
+ altmap_free(altmap);
}
}
@@ -1462,7 +1464,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
/* call arch's memory hotadd */
ret = arch_add_memory(nid, cur_start, memblock_size, &params);
if (ret < 0) {
- kfree(params.altmap);
+ altmap_free(params.altmap);
goto out;
}
@@ -1470,8 +1472,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
ret = create_memory_block_devices(cur_start, memblock_size, nid,
params.altmap, group);
if (ret) {
- arch_remove_memory(cur_start, memblock_size, NULL);
- kfree(params.altmap);
+ arch_remove_memory(cur_start, memblock_size, params.altmap, NULL);
+ altmap_free(params.altmap);
goto out;
}
}
@@ -1556,7 +1558,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
/* create memory block devices after memory was added */
ret = create_memory_block_devices(start, size, nid, NULL, group);
if (ret) {
- arch_remove_memory(start, size, params.altmap);
+ arch_remove_memory(start, size, params.altmap, NULL);
goto error;
}
}
@@ -2268,7 +2270,7 @@ static int try_remove_memory(u64 start, u64 size)
* No altmaps present, do the removal directly
*/
remove_memory_block_devices(start, size);
- arch_remove_memory(start, size, NULL);
+ arch_remove_memory(start, size, NULL, NULL);
} else {
/* all memblocks in the range have altmaps */
remove_memory_blocks_and_altmaps(start, size);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4e4421b22b59..36699fabd3c2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2865,7 +2865,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
case MPOL_WEIGHTED_INTERLEAVE:
- return !!nodes_equal(a->nodes, b->nodes);
+ return nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
default:
diff --git a/mm/memremap.c b/mm/memremap.c
index 053842d45cb1..81766d822400 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), NULL);
+ PHYS_PFN(range_len(range)), NULL, pgmap);
} else {
arch_remove_memory(range->start, range_len(range),
- pgmap_altmap(pgmap));
+ pgmap_altmap(pgmap), pgmap);
kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index 8a64291ab5b4..d9b23909d716 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
* This is safe because nobody is using it except us.
*/
enum {
- PAGE_WAS_MAPPED = BIT(0),
- PAGE_WAS_MLOCKED = BIT(1),
- PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+ FOLIO_WAS_MAPPED = BIT(0),
+ FOLIO_WAS_MLOCKED = BIT(1),
+ FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED,
};
static void __migrate_folio_record(struct folio *dst,
- int old_page_state,
- struct anon_vma *anon_vma)
+ int old_folio_state, struct anon_vma *anon_vma)
{
- dst->private = (void *)anon_vma + old_page_state;
+ dst->private = (void *)anon_vma + old_folio_state;
}
static void __migrate_folio_extract(struct folio *dst,
- int *old_page_state,
- struct anon_vma **anon_vmap)
+ int *old_folio_state, struct anon_vma **anon_vmap)
{
unsigned long private = (unsigned long)dst->private;
- *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
- *old_page_state = private & PAGE_OLD_STATES;
+ *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES);
+ *old_folio_state = private & FOLIO_OLD_STATES;
dst->private = NULL;
}
@@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
{
struct folio *dst;
int rc = -EAGAIN;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool locked = false;
bool dst_locked = false;
@@ -1253,12 +1251,12 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
}
locked = true;
if (folio_test_mlocked(src))
- old_page_state |= PAGE_WAS_MLOCKED;
+ old_folio_state |= FOLIO_WAS_MLOCKED;
if (folio_test_writeback(src)) {
/*
* Only in the case of a full synchronous migration is it
- * necessary to wait for PageWriteback. In the async case,
+ * necessary to wait for writeback. In the async case,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
*/
@@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
dst_locked = true;
if (unlikely(page_has_movable_ops(&src->page))) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
- old_page_state |= PAGE_WAS_MAPPED;
+ old_folio_state |= FOLIO_WAS_MAPPED;
}
if (!folio_mapped(src)) {
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return 0;
}
@@ -1344,7 +1342,7 @@ out:
if (rc == -EAGAIN)
ret = NULL;
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, locked, ret);
migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
@@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct list_head *ret)
{
int rc;
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
bool src_deferred_split = false;
bool src_partially_mapped = false;
struct list_head *prev;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
prev = dst->lru.prev;
list_del(&dst->lru);
@@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
* isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst);
- if (old_page_state & PAGE_WAS_MLOCKED)
+ if (old_folio_state & FOLIO_WAS_MLOCKED)
lru_add_drain();
- if (old_page_state & PAGE_WAS_MAPPED)
+ if (old_folio_state & FOLIO_WAS_MAPPED)
remove_migration_ptes(src, dst, 0);
out_unlock_both:
@@ -1439,11 +1437,11 @@ out:
*/
if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
- __migrate_folio_record(dst, old_page_state, anon_vma);
+ __migrate_folio_record(dst, old_folio_state, anon_vma);
return rc;
}
- migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+ migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
@@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios,
dst = list_first_entry(dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, src_folios, lru) {
- int old_page_state = 0;
+ int old_folio_state = 0;
struct anon_vma *anon_vma = NULL;
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ __migrate_folio_extract(dst, &old_folio_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED,
anon_vma, true, ret_folios);
list_del(&dst->lru);
migrate_folio_undo_dst(dst, true, put_new_folio, private);
@@ -2557,24 +2555,29 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
}
task = find_get_task_by_vpid(pid);
- if (!task) {
+ if (!task)
return ERR_PTR(-ESRCH);
- }
+ if (down_read_killable(&task->signal->exec_update_lock)) {
+ mm = ERR_PTR(-EINTR);
+ goto out;
+ }
/*
* Check if this process has the right to modify the specified
* process. Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
mm = ERR_PTR(-EPERM);
- goto out;
+ goto unlock;
}
mm = ERR_PTR(security_task_movememory(task));
if (IS_ERR(mm))
- goto out;
+ goto unlock;
*mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
+unlock:
+ up_read(&task->signal->exec_update_lock);
out:
put_task_struct(task);
if (!mm)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 19cd14b34114..554754eb26ff 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
bool flush = false;
unsigned long i;
- VM_WARN_ON_FOLIO(!folio, folio);
- VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+ VM_WARN_ON_ONCE(!folio);
if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
return -EINVAL;
@@ -859,11 +858,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
if (userfaultfd_missing(vma))
goto unlock_abort;
- if (!pmd_none(*pmdp)) {
- if (!is_huge_zero_pmd(*pmdp))
- goto unlock_abort;
+ if (is_huge_zero_pmd(*pmdp))
flush = true;
- } else if (!pmd_none(*pmdp))
+ else if (!pmd_none(*pmdp))
goto unlock_abort;
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index dc5d93125cdd..65623f95bec3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -674,6 +674,20 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_ZONE_DEVICE
+static __meminit void pageblock_migratetype_init_range(unsigned long pfn,
+ unsigned long nr_pages, int migratetype)
+{
+ const unsigned long end = pfn + nr_pages;
+
+ for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) {
+ init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+ }
+}
+#endif
+
/*
* Initialize a reserved page unconditionally, finding its zone first.
*/
@@ -1012,21 +1026,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
page->zone_device_data = NULL;
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (pageblock_aligned(pfn)) {
- init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
- cond_resched();
- }
-
- /*
* ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
* directly to the driver page allocator which will set the page count
* to 1 when allocating the page.
@@ -1056,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* of how the sparse_vmemmap internals handle compound pages in the lack
* of an altmap. See vmemmap_populate_compound_pages().
*/
-static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+static inline unsigned long compound_nr_pages(unsigned long pfn,
+ struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- if (!vmemmap_can_optimize(altmap, pgmap))
+ /*
+ * If DAX memory is hot-plugged into an unoccupied subsection
+ * of an early section, the unoptimized boot memmap is reused.
+ * See section_activate().
+ */
+ if (early_section(__pfn_to_section(pfn)) ||
+ !vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
@@ -1122,13 +1128,18 @@ void __ref memmap_init_zone_device(struct zone *zone,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+
if (pfns_per_compound == 1)
continue;
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- compound_nr_pages(altmap, pgmap));
+ compound_nr_pages(pfn, altmap, pgmap));
}
+ pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE);
+
pr_debug("%s initialised %lu pages in %ums\n", __func__,
nr_pages, jiffies_to_msecs(jiffies - start));
}
@@ -1362,19 +1373,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void pgdat_init_split_queue(struct pglist_data *pgdat)
-{
- struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
-
- spin_lock_init(&ds_queue->split_queue_lock);
- INIT_LIST_HEAD(&ds_queue->split_queue);
- ds_queue->split_queue_len = 0;
-}
-#else
-static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
-#endif
-
#ifdef CONFIG_COMPACTION
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
@@ -1390,8 +1388,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_resize_init(pgdat);
pgdat_kswapd_lock_init(pgdat);
-
- pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
@@ -1418,11 +1414,14 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
static void __meminit zone_init_free_lists(struct zone *zone)
{
- unsigned int order, t;
- for_each_migratetype_order(order, t) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ struct list_head *list;
+ unsigned int order;
+
+ for_each_free_list(list, zone, order)
+ INIT_LIST_HEAD(list);
+
+ for (order = 0; order < NR_PAGE_ORDERS; order++)
zone->free_area[order].nr_free = 0;
- }
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
diff --git a/mm/mmap.c b/mm/mmap.c
index 5754d1c36462..2311ae7c2ff4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
break;
case MAP_DROPPABLE:
if (VM_DROPPABLE == VM_NONE)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
/*
* A locked or stack area makes no sense to be droppable.
*
diff --git a/mm/mseal.c b/mm/mseal.c
index e2093ae3d25c..9781647483d1 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -8,6 +8,7 @@
*/
#include <linux/mempolicy.h>
+#include <linux/minmax.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
@@ -65,8 +66,8 @@ static int mseal_apply(struct mm_struct *mm,
prev = vma;
for_each_vma_range(vmi, vma, end) {
- const unsigned long curr_start = MAX(vma->vm_start, start);
- const unsigned long curr_end = MIN(vma->vm_end, end);
+ const unsigned long curr_start = max(vma->vm_start, start);
+ const unsigned long curr_end = min(vma->vm_end, end);
if (!vma_test(vma, VMA_SEALED_BIT)) {
vma_flags_t vma_flags = vma->flags;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d49c254174da..f7db8f049bd2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ typedef int __bitwise fpi_t;
/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+/* free_pages_prepare() has already been called for page(s) being freed. */
+#define FPI_PREPARED ((__force fpi_t)BIT(3))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -282,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+/*
+ * When page allocations stall for longer than a threshold,
+ * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning
+ * will be printed during this duration for the entire system.
+ */
+#define ALLOC_STALL_WARN_MSECS (10 * 1000UL)
+static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES;
+
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order,
int alloc_flags);
@@ -353,7 +364,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
#else
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
- BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > PAGEBLOCK_MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
bitmap = get_pageblock_bitmap(page, pfn);
@@ -423,10 +434,10 @@ bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
* Use get_pfnblock_migratetype() if caller already has both @page and @pfn
* to save a call to page_to_pfn().
*/
-__always_inline enum migratetype
+enum migratetype
get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
- unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+ unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK;
unsigned long flags;
flags = __get_pfnblock_flags_mask(page, pfn, mask);
@@ -435,7 +446,7 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
if (flags & BIT(PB_migrate_isolate))
return MIGRATE_ISOLATE;
#endif
- return flags & MIGRATETYPE_MASK;
+ return flags & PAGEBLOCK_MIGRATETYPE_MASK;
}
/**
@@ -523,11 +534,11 @@ static void set_pageblock_migratetype(struct page *page,
}
VM_WARN_ONCE(get_pageblock_isolate(page),
"Use clear_pageblock_isolate() to unisolate pageblock");
- /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+ /* PAGEBLOCK_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
__set_pfnblock_flags_mask(page, page_to_pfn(page),
(unsigned long)migratetype,
- MIGRATETYPE_AND_ISO_MASK);
+ PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK);
}
void __meminit init_pageblock_migratetype(struct page *page,
@@ -553,7 +564,7 @@ void __meminit init_pageblock_migratetype(struct page *page,
flags |= BIT(PB_migrate_isolate);
#endif
__set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
- MIGRATETYPE_AND_ISO_MASK);
+ PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK);
}
#ifdef CONFIG_DEBUG_VM
@@ -639,19 +650,12 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ bool movable = migratetype == MIGRATE_MOVABLE;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool movable;
- if (order > PAGE_ALLOC_COSTLY_ORDER) {
- VM_BUG_ON(!is_pmd_order(order));
-
- movable = migratetype == MIGRATE_MOVABLE;
-
- return NR_LOWORDER_PCP_LISTS + movable;
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return NR_LOWORDER_PCP_LISTS + movable;
}
-#else
- VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-#endif
return (MIGRATE_PCPTYPES * order) + migratetype;
}
@@ -660,12 +664,10 @@ static inline int pindex_to_order(unsigned int pindex)
{
int order = pindex / MIGRATE_PCPTYPES;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pindex >= NR_LOWORDER_PCP_LISTS)
- order = HPAGE_PMD_ORDER;
-#else
- VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-#endif
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pindex >= NR_LOWORDER_PCP_LISTS)
+ order = HPAGE_PMD_ORDER;
+ }
return order;
}
@@ -1211,14 +1213,18 @@ static inline bool should_skip_kasan_poison(struct page *page)
return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
-static void kernel_init_pages(struct page *page, int numpages)
+static void clear_highpages_kasan_tagged(struct page *page, int numpages)
{
- int i;
-
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++)
- clear_highpage_kasan_tagged(page + i);
+ if (!IS_ENABLED(CONFIG_HIGHMEM)) {
+ clear_pages(kasan_reset_tag(page_address(page)), numpages);
+ } else {
+ int i;
+
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
+ }
kasan_enable_current();
}
@@ -1303,8 +1309,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
#endif /* CONFIG_MEM_ALLOC_PROFILING */
-__always_inline bool __free_pages_prepare(struct page *page,
- unsigned int order, fpi_t fpi_flags)
+static __always_inline bool __free_pages_prepare(struct page *page,
+ unsigned int order, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page);
@@ -1312,6 +1318,9 @@ __always_inline bool __free_pages_prepare(struct page *page,
bool compound = PageCompound(page);
struct folio *folio = page_folio(page);
+ if (fpi_flags & FPI_PREPARED)
+ return true;
+
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
@@ -1423,7 +1432,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1451,7 +1460,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp,
int pindex)
{
- unsigned long flags;
unsigned int order;
struct page *page;
@@ -1464,7 +1472,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (count > 0) {
struct list_head *list;
@@ -1496,8 +1504,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
}
/* Split a multi-block free page into its individual pageblocks. */
@@ -1848,7 +1854,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
/* If memory is still not initialized, initialize it now. */
if (init)
- kernel_init_pages(page, 1 << order);
+ clear_highpages_kasan_tagged(page, 1 << order);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -2125,15 +2131,15 @@ static bool __move_freepages_block_isolate(struct zone *zone,
}
move:
- /* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+ /* Use PAGEBLOCK_MIGRATETYPE_MASK to get non-isolate migratetype */
if (isolate) {
from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
- MIGRATETYPE_MASK);
+ PAGEBLOCK_MIGRATETYPE_MASK);
to_mt = MIGRATE_ISOLATE;
} else {
from_mt = MIGRATE_ISOLATE;
to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
- MIGRATETYPE_MASK);
+ PAGEBLOCK_MIGRATETYPE_MASK);
}
__move_freepages_block(zone, start_pfn, from_mt, to_mt);
@@ -2244,25 +2250,29 @@ static bool should_try_claim_block(unsigned int order, int start_mt)
* we would do this whole-block claiming. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool claimable)
+enum fallback_result
+find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool claimable, int *mt_out)
{
int i;
if (claimable && !should_try_claim_block(order, migratetype))
- return -2;
+ return FALLBACK_NOCLAIM;
if (area->nr_free == 0)
- return -1;
+ return FALLBACK_EMPTY;
for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
int fallback_mt = fallbacks[migratetype][i];
- if (!free_area_empty(area, fallback_mt))
- return fallback_mt;
+ if (!free_area_empty(area, fallback_mt)) {
+ if (mt_out)
+ *mt_out = fallback_mt;
+ return FALLBACK_FOUND;
+ }
}
- return -1;
+ return FALLBACK_EMPTY;
}
/*
@@ -2372,16 +2382,16 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
*/
for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
--current_order) {
+ enum fallback_result result;
+
area = &(zone->free_area[current_order]);
- fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, true);
+ result = find_suitable_fallback(area, current_order,
+ start_migratetype, true, &fallback_mt);
- /* No block in that order */
- if (fallback_mt == -1)
+ if (result == FALLBACK_EMPTY)
continue;
- /* Advanced into orders too low to claim, abort */
- if (fallback_mt == -2)
+ if (result == FALLBACK_NOCLAIM)
break;
page = get_page_from_free_area(area, fallback_mt);
@@ -2411,10 +2421,12 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
int fallback_mt;
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
+ enum fallback_result result;
+
area = &(zone->free_area[current_order]);
- fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false);
- if (fallback_mt == -1)
+ result = find_suitable_fallback(area, current_order, start_migratetype,
+ false, &fallback_mt);
+ if (result == FALLBACK_EMPTY)
continue;
page = get_page_from_free_area(area, fallback_mt);
@@ -3424,7 +3436,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
struct zone *zone)
{
int mt;
- unsigned long max_managed, flags;
+ unsigned long max_managed;
/*
* The number reserved as: minimum is 1 pageblock, maximum is
@@ -3438,29 +3450,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order,
if (zone->nr_reserved_highatomic >= max_managed)
return;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
+ return;
/* Yoink! */
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
if (!migratetype_is_mergeable(mt))
- goto out_unlock;
+ return;
if (order < pageblock_order) {
if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
+ return;
zone->nr_reserved_highatomic += pageblock_nr_pages;
} else {
change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
zone->nr_reserved_highatomic += 1 << order;
}
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
}
/*
@@ -3476,7 +3485,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
@@ -3493,7 +3501,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
pageblock_nr_pages)
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
unsigned long size;
@@ -3540,12 +3548,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* so this should not fail on zone boundaries.
*/
WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (ret > 0)
return ret;
- }
}
- spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
@@ -4156,7 +4161,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
psi_memstall_leave(&pflags);
delayacct_compact_end();
- if (*compact_result == COMPACT_SKIPPED)
+ if (*compact_result == COMPACT_SKIPPED ||
+ *compact_result == COMPACT_DEFERRED)
return NULL;
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -4193,7 +4199,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order,
+ int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4215,7 +4222,8 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
* migration targets. Continue if reclaim can help.
*/
if (compact_result == COMPACT_SKIPPED) {
- ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags,
+ gfp_mask);
goto out;
}
@@ -4268,7 +4276,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order,
+ int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4678,6 +4687,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
return false;
}
+/*
+ * check_alloc_stall_warn - report a page allocation that keeps stalling
+ * @gfp_mask: GFP flags of the allocation; __GFP_NOWARN suppresses the report
+ * @nodemask: nodemask of the allocation, printed in the report
+ * @order: order of the allocation
+ * @alloc_start_time: jiffies value taken when the allocation started
+ *
+ * Once the allocation has been running for at least ALLOC_STALL_WARN_MSECS,
+ * print a warning followed by a stack dump and memory information. Reports
+ * are rate limited through alloc_stall_warn_jiffies: after one report, the
+ * next is allowed only ALLOC_STALL_WARN_MSECS later.
+ */
+static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask,
+		unsigned int order, unsigned long alloc_start_time)
+{
+	static DEFINE_SPINLOCK(alloc_stall_lock);
+	unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time);
+
+	if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS))
+		return;
+	/* Lockless peek at the rate limit; rechecked under the lock below. */
+	if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies)))
+		return;
+	if (gfp_mask & __GFP_NOWARN)
+		return;
+
+	/* Do not wait for the lock: if it is contended, skip the report. */
+	if (!spin_trylock(&alloc_stall_lock))
+		return;
+
+	/* Check again, this time under the lock */
+	if (time_is_after_jiffies(alloc_stall_warn_jiffies)) {
+		spin_unlock(&alloc_stall_lock);
+		return;
+	}
+
+	WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS));
+	spin_unlock(&alloc_stall_lock);
+
+	/* The lock only covers the rate limit; printing runs unlocked. */
+	/* @order is unsigned int, hence %u. */
+	pr_warn("%s: page allocation stall for %lu secs: order:%u, mode:%#x(%pGg) nodemask=%*pbl",
+		current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask,
+		nodemask_pr_args(nodemask));
+	cpuset_print_current_mems_allowed();
+	pr_cont("\n");
+	dump_stack();
+	warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -4698,6 +4741,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
bool compact_first = false;
bool can_retry_reserves = true;
+ unsigned long alloc_start_time = jiffies;
if (unlikely(nofail)) {
/*
@@ -4806,13 +4850,27 @@ retry:
}
/* Caller is not willing to reclaim, we can't balance anything */
- if (!can_direct_reclaim)
+ if (!can_direct_reclaim) {
+ /*
+ * Reclaim/compaction cannot run, so defrag_mode's strategy
+ * of enforcing ALLOC_NOFRAGMENT cannot be fulfilled. Allow
+ * fallbacks rather than failing the allocation outright.
+ */
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT) &&
+ (gfp_mask & __GFP_KSWAPD_RECLAIM)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
goto nopage;
+ }
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
+ /* If allocation has taken excessively long, warn about it */
+ check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time);
+
/* Try direct reclaim and then allocating */
if (!compact_first) {
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
@@ -4886,9 +4944,9 @@ retry:
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 && can_compact &&
- should_compact_retry(ac, order, alloc_flags,
- compact_result, &compact_priority,
- &compaction_retries))
+ should_compact_retry(gfp_mask, ac, order, alloc_flags,
+ compact_result, &compact_priority,
+ &compaction_retries))
goto retry;
/* Reclaim/compaction failed to prevent the fallback */
@@ -5044,7 +5102,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct per_cpu_pages *pcp;
struct list_head *pcp_list;
struct alloc_context ac;
- gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
int nr_populated = 0, nr_account = 0;
@@ -5085,10 +5142,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
gfp &= gfp_allowed_mask;
- alloc_gfp = gfp;
- if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags))
goto out;
- gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
z = ac.preferred_zoneref;
@@ -5180,6 +5235,34 @@ failed:
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
+ * free_pages_bulk - Free an array of order-0 pages
+ * @page_array: Array of pages to free
+ * @nr_pages: The number of pages in the array
+ *
+ * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous
+ * run are released with a single __free_contig_range() call.
+ *
+ * This assumes page_array is sorted in ascending PFN order. Without that,
+ * the function still frees all pages, but contiguous runs may not be
+ * detected and the freeing pattern can degrade to freeing one page at a
+ * time.
+ *
+ * Context: Sleepable process context only; calls cond_resched()
+ */
+void free_pages_bulk(struct page **page_array, unsigned long nr_pages)
+{
+	struct page **cursor = page_array;
+	unsigned long remaining = nr_pages;
+
+	while (remaining > 0) {
+		/* Length of the PFN-contiguous run that starts at *cursor. */
+		unsigned long run = num_pages_contiguous(cursor, remaining);
+
+		__free_contig_range(page_to_pfn(cursor[0]), run);
+
+		remaining -= run;
+		cursor += run;
+		/* Reschedule point after each run. */
+		cond_resched();
+	}
+}
+
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
@@ -6758,6 +6841,105 @@ void __init page_alloc_sysctl_init(void)
register_sysctl_init("vm", page_alloc_sysctl_table);
}
+/*
+ * Free a run of physically contiguous order-0 pages in as few blocks as
+ * possible. Each block is aligned to its own size, does not extend past the
+ * end of the run and is at most MAX_PAGE_ORDER.
+ */
+static void free_prepared_contig_range(struct page *page,
+				       unsigned long nr_pages)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long remaining = nr_pages;
+
+	while (remaining) {
+		unsigned int order = MAX_PAGE_ORDER;
+		unsigned long chunk;
+
+		/* The alignment of the PFN limits the order; PFN 0 fits any. */
+		if (pfn)
+			order = min_t(unsigned int, order, __ffs(pfn));
+		/* Don't exceed the number of pages left to free. */
+		order = min_t(unsigned int, order, ilog2(remaining));
+
+		/*
+		 * Free the chunk as a single block. Our caller has already
+		 * called free_pages_prepare() for each order-0 page.
+		 */
+		__free_frozen_pages(page, order, FPI_PREPARED);
+
+		chunk = 1UL << order;
+		pfn += chunk;
+		page += chunk;
+		remaining -= chunk;
+	}
+}
+
+/*
+ * Release the order-0 pages in [pfn, pfn + nr_pages), batching neighbours
+ * that can be freed together.
+ *
+ * Unless @is_frozen is set, a reference is dropped on each page first and the
+ * page is only freed if put_page_testzero() succeeds. Every page to be freed
+ * goes through free_pages_prepare() on its own; consecutive prepared pages
+ * with the same memdesc_section() value are then passed to
+ * free_prepared_contig_range() as one run. A run is flushed when a page
+ * cannot be freed, when the section value changes, or when the range ends.
+ */
+static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages,
+				       bool is_frozen)
+{
+	struct page *run_head = NULL;
+	unsigned long run_first = 0;
+	unsigned long run_sec;
+	unsigned long idx;
+
+	for (idx = 0; idx < nr_pages; idx++) {
+		/*
+		 * Contiguous PFNs might not have contiguous "struct pages"
+		 * in some kernel configs, and stepping a page pointer across
+		 * a section boundary is undefined: look each PFN up instead.
+		 */
+		struct page *page = pfn_to_page(pfn + idx);
+		bool can_free;
+
+		VM_WARN_ON_ONCE(PageHead(page));
+		VM_WARN_ON_ONCE(PageTail(page));
+
+		can_free = (is_frozen || put_page_testzero(page)) &&
+			   free_pages_prepare(page, 0);
+
+		/* Flush the open run if this page cannot extend it. */
+		if (run_head && (!can_free ||
+				 memdesc_section(page->flags) != run_sec)) {
+			free_prepared_contig_range(run_head, idx - run_first);
+			run_head = NULL;
+		}
+
+		if (!can_free)
+			continue;
+
+		/* No run is open at this point: this page starts a new one. */
+		if (!run_head) {
+			run_head = page;
+			run_first = idx;
+			run_sec = memdesc_section(page->flags);
+		}
+	}
+
+	/* Flush the run that is still open at the end of the range. */
+	if (run_head)
+		free_prepared_contig_range(run_head, nr_pages - run_first);
+}
+
+/**
+ * __free_contig_range - Free contiguous range of order-0 pages.
+ * @pfn: Page frame number of the first page in the range.
+ * @nr_pages: Number of pages to free.
+ *
+ * For each order-0 struct page in the physically contiguous range, put a
+ * reference. Free any page whose reference count falls to zero. The
+ * implementation is functionally equivalent to, but significantly faster than
+ * calling __free_page() for each struct page in a loop.
+ *
+ * Memory allocated with alloc_pages(order>=1) then subsequently split to
+ * order-0 with split_page() is an example of appropriate contiguous pages that
+ * can be freed with this API.
+ *
+ * Implemented by calling __free_contig_range_common() with is_frozen set to
+ * false.
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
+void __free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
@@ -6895,8 +7077,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
{
- for (; nr_pages--; pfn++)
- free_frozen_pages(pfn_to_page(pfn), 0);
+ __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true);
}
/**
@@ -7304,8 +7485,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
return;
- for (; nr_pages--; pfn++)
- __free_page(pfn_to_page(pfn));
+ __free_contig_range(pfn, nr_pages);
}
EXPORT_SYMBOL(free_contig_range);
#endif /* CONFIG_CONTIG_ALLOC */
@@ -7363,7 +7543,7 @@ void zone_pcp_reset(struct zone *zone)
unsigned long __offline_isolated_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long already_offline = 0, flags;
+ unsigned long already_offline = 0;
unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
@@ -7371,7 +7551,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
while (pfn < end_pfn) {
page = pfn_to_page(pfn);
/*
@@ -7401,7 +7581,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn,
del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
pfn += (1 << order);
}
- spin_unlock_irqrestore(&zone->lock, flags);
return end_pfn - start_pfn - already_offline;
}
@@ -7473,11 +7652,9 @@ bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
int page_order = buddy_order(page_head);
@@ -7492,14 +7669,12 @@ bool take_page_off_buddy(struct page *page)
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
- ret = true;
- break;
+ return true;
}
if (page_count(page_head) > 0)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
/*
@@ -7508,23 +7683,19 @@ bool take_page_off_buddy(struct page *page)
bool put_page_back_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
- unsigned long flags;
- bool ret = false;
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (put_page_testzero(page)) {
unsigned long pfn = page_to_pfn(page);
int migratetype = get_pfnblock_migratetype(page, pfn);
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
- if (TestClearPageHWPoison(page)) {
- ret = true;
- }
+ if (TestClearPageHWPoison(page))
+ return true;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
+ return false;
}
#endif
@@ -7774,8 +7945,8 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned
* @order: allocation order size
*
* Allocates pages of a given order from the given node. This is safe to
- * call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * call from any context where RCU is watching (from atomic, NMI, and also
+ * reentrant allocator -> tracepoint -> alloc_pages_nolock_noprof).
* Allocation is best effort and to be expected to fail easily so nobody should
* rely on the success. Failures are not reported via warn_alloc().
* See always fail conditions below.
diff --git a/mm/page_io.c b/mm/page_io.c
index a59b73f8bdd9..60977c970cdf 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,6 +26,7 @@
#include <linux/delayacct.h>
#include <linux/zswap.h>
#include "swap.h"
+#include "swap_table.h"
static void __end_swap_bio_write(struct bio *bio)
{
@@ -204,15 +205,20 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
- struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
int nr_pages = folio_nr_pages(folio);
+ struct swap_cluster_info *ci;
swp_entry_t entry;
unsigned int i;
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+
+ ci = swap_cluster_get_and_lock(folio);
for (i = 0; i < folio_nr_pages(folio); i++) {
entry = page_swap_entry(folio_page(folio, i));
- set_bit(swp_offset(entry), sis->zeromap);
+ __swap_table_set_zero(ci, swp_cluster_offset(entry));
}
+ swap_cluster_unlock(ci);
count_vm_events(SWPOUT_ZERO, nr_pages);
if (objcg) {
@@ -223,14 +229,19 @@ static void swap_zeromap_folio_set(struct folio *folio)
static void swap_zeromap_folio_clear(struct folio *folio)
{
- struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
+ struct swap_cluster_info *ci;
swp_entry_t entry;
unsigned int i;
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+
+ ci = swap_cluster_get_and_lock(folio);
for (i = 0; i < folio_nr_pages(folio); i++) {
entry = page_swap_entry(folio_page(folio, i));
- clear_bit(swp_offset(entry), sis->zeromap);
+ __swap_table_clear_zero(ci, swp_cluster_offset(entry));
}
+ swap_cluster_unlock(ci);
}
/*
@@ -255,10 +266,9 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
}
/*
- * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages.
- * The bits in zeromap are protected by the locked swapcache folio
- * and atomic updates are used to protect against read-modify-write
- * corruption due to other zero swap entries seeing concurrent updates.
+ * Use the swap table zero mark to avoid doing IO for zero-filled
+ * pages. The zero mark is protected by the cluster lock, which is
+ * acquired internally by swap_zeromap_folio_set/clear.
*/
if (is_folio_zero_filled(folio)) {
swap_zeromap_folio_set(folio);
@@ -326,8 +336,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
struct swap_iocb {
struct kiocb iocb;
- struct bio_vec bvec[SWAP_CLUSTER_MAX];
- int pages;
+ struct bio_vec bvecs[SWAP_CLUSTER_MAX];
+ int nr_bvecs;
int len;
};
static mempool_t *sio_pool;
@@ -348,7 +358,7 @@ int sio_pool_init(void)
static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec[0].bv_page;
+ struct page *page = sio->bvecs[0].bv_page;
int p;
if (ret != sio->len) {
@@ -362,15 +372,15 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
*/
pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
ret, swap_dev_pos(page_swap_entry(page)));
- for (p = 0; p < sio->pages; p++) {
- page = sio->bvec[p].bv_page;
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ page = sio->bvecs[p].bv_page;
set_page_dirty(page);
ClearPageReclaim(page);
}
}
- for (p = 0; p < sio->pages; p++)
- end_page_writeback(sio->bvec[p].bv_page);
+ for (p = 0; p < sio->nr_bvecs; p++)
+ end_page_writeback(sio->bvecs[p].bv_page);
mempool_free(sio, sio_pool);
}
@@ -397,13 +407,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
init_sync_kiocb(&sio->iocb, swap_file);
sio->iocb.ki_complete = sio_write_complete;
sio->iocb.ki_pos = pos;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) {
swap_write_unplug(sio);
sio = NULL;
}
@@ -477,7 +487,7 @@ void swap_write_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -489,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
int p;
if (ret == sio->len) {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = bvec_folio(&sio->bvec[p]);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = bvec_folio(&sio->bvecs[p]);
count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
@@ -499,8 +509,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
}
count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
} else {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = bvec_folio(&sio->bvec[p]);
+ for (p = 0; p < sio->nr_bvecs; p++) {
+ struct folio *folio = bvec_folio(&sio->bvecs[p]);
folio_unlock(folio);
}
@@ -509,19 +519,52 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
+/*
+ * Count how many consecutive swap entries, starting at @entry and looking at
+ * no more than @max_nr of them, have the same zeromap status as @entry
+ * itself. The status of @entry is stored in *@is_zerop when @is_zerop is not
+ * NULL.
+ *
+ * Context: Caller must ensure the cluster containing the entries
+ * that will be checked won't be freed.
+ */
+static int swap_zeromap_batch(swp_entry_t entry, int max_nr,
+			      bool *is_zerop)
+{
+	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+	unsigned int base = swp_cluster_offset(entry);
+	bool first_is_zero;
+	int nr = 1;
+
+	VM_WARN_ON_ONCE(base + max_nr > SWAPFILE_CLUSTER);
+
+	rcu_read_lock();
+	first_is_zero = __swap_table_test_zero(ci, base);
+	/* Extend the batch while the following entries match the first one. */
+	while (nr < max_nr &&
+	       __swap_table_test_zero(ci, base + nr) == first_is_zero)
+		nr++;
+	rcu_read_unlock();
+
+	if (is_zerop)
+		*is_zerop = first_is_zero;
+
+	return nr;
+}
+
static bool swap_read_folio_zeromap(struct folio *folio)
{
int nr_pages = folio_nr_pages(folio);
struct obj_cgroup *objcg;
bool is_zeromap;
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+
/*
* Swapping in a large folio that is partially in the zeromap is not
* currently handled. Return true without marking the folio uptodate so
* that an IO error is emitted (e.g. do_swap_page() will sigbus).
+ * Folio lock stabilizes the cluster and map, so the check is safe.
*/
if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages,
- &is_zeromap) != nr_pages))
+ &is_zeromap) != nr_pages))
return true;
if (!is_zeromap)
@@ -559,13 +602,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
init_sync_kiocb(&sio->iocb, sis->swap_file);
sio->iocb.ki_pos = pos;
sio->iocb.ki_complete = sio_read_complete;
- sio->pages = 0;
+ sio->nr_bvecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ sio->nr_bvecs += 1;
+ if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) {
swap_read_unplug(sio);
sio = NULL;
}
@@ -666,7 +709,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
- iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
+ iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_read_complete(&sio->iocb, ret);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c48ff5c00244..7a9d631945a3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
{
struct zone *zone = page_zone(page);
struct page *unmovable;
- unsigned long flags;
unsigned long check_unmovable_start, check_unmovable_end;
if (PageUnaccepted(page))
accept_page(page);
- spin_lock_irqsave(&zone->lock, flags);
-
- /*
- * We assume the caller intended to SET migrate type to isolate.
- * If it is already set, then someone else must have raced and
- * set it before us.
- */
- if (is_migrate_isolate_page(page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return -EBUSY;
- }
-
- /*
- * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
- * We just check MOVABLE pages.
- *
- * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
- * to avoid redundant checks.
- */
- check_unmovable_start = max(page_to_pfn(page), start_pfn);
- check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
- end_pfn);
-
- unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
- mode);
- if (!unmovable) {
- if (!pageblock_isolate_and_move_free_pages(zone, page)) {
- spin_unlock_irqrestore(&zone->lock, flags);
+ scoped_guard(spinlock_irqsave, &zone->lock) {
+ /*
+ * We assume the caller intended to SET migrate type to
+ * isolate. If it is already set, then someone else must have
+ * raced and set it before us.
+ */
+ if (is_migrate_isolate_page(page))
return -EBUSY;
+
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by
+ * itself. We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's
+ * pageblock to avoid redundant checks.
+ */
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+ end_pfn);
+
+ unmovable = has_unmovable_pages(check_unmovable_start,
+ check_unmovable_end, mode);
+ if (!unmovable) {
+ if (!pageblock_isolate_and_move_free_pages(zone, page))
+ return -EBUSY;
+ zone->nr_isolate_pageblock++;
+ return 0;
}
- zone->nr_isolate_pageblock++;
- spin_unlock_irqrestore(&zone->lock, flags);
- return 0;
}
-
- spin_unlock_irqrestore(&zone->lock, flags);
if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
/*
* printk() with zone->lock held will likely trigger a
@@ -223,15 +215,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
static void unset_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
bool isolated_page = false;
unsigned int order;
struct page *buddy;
zone = page_zone(page);
- spin_lock_irqsave(&zone->lock, flags);
+ guard(spinlock_irqsave)(&zone->lock);
if (!is_migrate_isolate_page(page))
- goto out;
+ return;
/*
* Because freepage with more than pageblock_order on isolated
@@ -279,8 +270,6 @@ static void unset_migratetype_isolate(struct page *page)
__putback_isolated_page(page, order, get_pageblock_migratetype(page));
}
zone->nr_isolate_pageblock--;
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8178e0be557f..2dddcb6510aa 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
- &page->flags);
+ &page->flags.f);
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4d52fdb3056..2ccbabfb2cc1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -41,7 +41,7 @@ again:
if (!pvmw->pte)
return false;
- ptent = ptep_get(pvmw->pte);
+ ptent = ptep_get_lockless(pvmw->pte);
if (pte_none(ptent)) {
return false;
@@ -183,6 +183,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
struct mm_struct *mm = vma->vm_mm;
unsigned long end;
spinlock_t *ptl;
+ pte_t pteval;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -310,7 +311,11 @@ next_pte:
goto restart;
}
pvmw->pte++;
- } while (pte_none(ptep_get(pvmw->pte)));
+ if (!pvmw->ptl)
+ pteval = ptep_get_lockless(pvmw->pte);
+ else
+ pteval = ptep_get(pvmw->pte);
+ } while (pte_none(pteval));
if (!pvmw->ptl) {
spin_lock(ptl);
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 4b3d6ec43703..8cbe039bf847 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -77,13 +77,13 @@ struct pcpu_chunk {
int end_offset; /* additional area required to
have the region end page
aligned */
+ int nr_pages; /* # of pages served by this chunk */
+ int nr_populated; /* # of populated pages */
+ int nr_empty_pop_pages; /* # of empty populated pages */
#ifdef NEED_PCPUOBJ_EXT
struct pcpuobj_ext *obj_exts; /* vector of object cgroups */
#endif
- int nr_pages; /* # of pages served by this chunk */
- int nr_populated; /* # of populated pages */
- int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */
};
diff --git a/mm/readahead.c b/mm/readahead.c
index 7b05082c89ea..38ce16e3fcbd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -146,6 +146,17 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
+/**
+ * read_pages() - Start IO for a contiguous range of allocated folios in the
+ * page cache.
+ * @rac: Readahead control.
+ *
+ * When read_pages() returns, it is guaranteed that all of the folios will have
+ * been processed or removed so that ``readahead_count(rac) == 0``. However,
+ * that does not imply that ``readahead_index(rac)`` will be updated to point
+ * to the end of the originally requested range because, for example, the
+ * filesystem may expand the range upwards.
+ */
static void read_pages(struct readahead_control *rac)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
@@ -270,7 +281,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
read_pages(ractl);
ractl->_index += min_nrpages;
- i = ractl->_index + ractl->_nr_pages - index;
+ i = ractl->_index - index;
continue;
}
@@ -286,7 +297,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
break;
read_pages(ractl);
ractl->_index += min_nrpages;
- i = ractl->_index + ractl->_nr_pages - index;
+ i = ractl->_index - index;
continue;
}
if (i == mark)
@@ -324,11 +335,16 @@ static void do_page_cache_ra(struct readahead_control *ractl,
return;
end_index = (isize - 1) >> PAGE_SHIFT;
+ if (end_index > ractl->_max_index)
+ end_index = ractl->_max_index;
if (index > end_index)
return;
/* Don't read past the page containing the last byte of the file */
- if (nr_to_read > end_index - index)
+ if (nr_to_read > end_index - index) {
nr_to_read = end_index - index + 1;
+ /* We've reached the end, so don't set a readahead marker. */
+ lookahead_size = 0;
+ }
filemap_invalidate_lock_shared(mapping);
page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
@@ -471,8 +487,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
unsigned int min_order = mapping_min_folio_order(mapping);
- pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
- pgoff_t mark = index + ra->size - ra->async_size;
+ pgoff_t limit;
+ pgoff_t mark;
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
@@ -484,7 +500,15 @@ void page_cache_ra_order(struct readahead_control *ractl,
goto fallback;
}
- limit = min(limit, index + ra->size - 1);
+ limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ limit = min(limit, ractl->_max_index);
+ if (limit > index + ra->size - 1) {
+ limit = index + ra->size - 1;
+ mark = index + ra->size - ra->async_size;
+ } else {
+ /* We've reached the end, so don't set a readahead marker. */
+ mark = ULONG_MAX;
+ }
new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
diff --git a/mm/rmap.c b/mm/rmap.c
index 99e1b3dc390b..1c77d5dc06e9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -571,7 +571,7 @@ void __init anon_vma_init(void)
* In case it was remapped to a different anon_vma, the new anon_vma will be a
* child of the old anon_vma, and the anon_vma lifetime rules will therefore
* ensure that any anon_vma obtained from the page will still be valid for as
- * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ * long as we observe folio_mapped() [ hence all those folio_mapped() tests ].
*
* All users of this function must be very careful when walking the anon_vma
* chain and verify that the page in question is indeed mapped in it
@@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_unmap() may return before page_mapped() has become false,
+ * try_to_unmap() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
- * try_to_migrate() may return before page_mapped() has become false,
+ * try_to_migrate() may return before folio_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
@@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
/*
* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
- * because that depends on page_mapped(); but not all its usages
+ * because that depends on folio_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 7b1ea9fb598f..b51f83c970bb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, vm_fault_t *fault_type);
+ struct vm_fault *vmf, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -1789,30 +1789,6 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
return folio;
}
-/*
- * Make sure huge_gfp is always more limited than limit_gfp.
- * Some of the flags set permissions, while others set limitations.
- */
-static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
-{
- gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
- gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
- gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
- gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
-
- /* Allow allocations only from the originally specified zones. */
- result |= zoneflags;
-
- /*
- * Minimize the result gfp by taking the union with the deny flags,
- * and the intersection of the allow flags.
- */
- result |= (limit_gfp & denyflags);
- result |= (huge_gfp & limit_gfp) & allowflags;
-
- return result;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool shmem_hpage_pmd_enabled(void)
{
@@ -2039,68 +2015,32 @@ unlock:
}
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
- struct vm_area_struct *vma, pgoff_t index,
+ struct vm_fault *vmf, pgoff_t index,
swp_entry_t entry, int order, gfp_t gfp)
{
+ pgoff_t ilx;
+ struct folio *folio;
+ struct mempolicy *mpol;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct folio *new, *swapcache;
- int nr_pages = 1 << order;
- gfp_t alloc_gfp = gfp;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- if (WARN_ON_ONCE(order))
- return ERR_PTR(-EINVAL);
- } else if (order) {
- /*
- * If uffd is active for the vma, we need per-page fault
- * fidelity to maintain the uffd semantics, then fallback
- * to swapin order-0 folio, as well as for zswap case.
- * Any existing sub folio in the swap cache also blocks
- * mTHP swapin.
- */
- if ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled() ||
- non_swapcache_batch(entry, nr_pages) != nr_pages)
- goto fallback;
- alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
- }
-retry:
- new = shmem_alloc_folio(alloc_gfp, order, info, index);
- if (!new) {
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+ if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
+ !zswap_never_enabled())
+ order = 0;
- if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- alloc_gfp, entry)) {
- folio_put(new);
- new = ERR_PTR(-ENOMEM);
- goto fallback;
- }
+again:
+ mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+ folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx);
+ mpol_cond_put(mpol);
- swapcache = swapin_folio(entry, new);
- if (swapcache != new) {
- folio_put(new);
- if (!swapcache) {
- /*
- * The new folio is charged already, swapin can
- * only fail due to another raced swapin.
- */
- new = ERR_PTR(-EEXIST);
- goto fallback;
- }
+ if (!IS_ERR(folio))
+ return folio;
+
+ if (order) {
+ order = 0;
+ goto again;
}
- return swapcache;
-fallback:
- /* Order 0 swapin failed, nothing to fallback to, abort */
- if (!order)
- return new;
- entry.val += index - round_down(index, nr_pages);
- alloc_gfp = gfp;
- nr_pages = 1;
- order = 0;
- goto retry;
+
+ return folio;
}
/*
@@ -2139,7 +2079,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
if (nr_pages > 1) {
gfp_t huge_gfp = vma_thp_gfp_mask(vma);
- gfp = limit_gfp_mask(huge_gfp, gfp);
+ gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp);
}
#endif
@@ -2287,11 +2227,12 @@ unlock:
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
+ gfp_t gfp, struct vm_fault *vmf,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+ struct mm_struct *fault_mm = vmf ? vmf->vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swap;
softleaf_t index_entry;
@@ -2332,20 +2273,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
/* Direct swapin skipping swap cache & readahead */
- folio = shmem_swap_alloc_folio(inode, vma, index,
- index_entry, order, gfp);
- if (IS_ERR(folio)) {
- error = PTR_ERR(folio);
- folio = NULL;
- goto failed;
- }
+ folio = shmem_swap_alloc_folio(inode, vmf, index,
+ swap, order, gfp);
} else {
/* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
+ }
+ if (IS_ERR_OR_NULL(folio)) {
+ if (IS_ERR(folio))
+ error = PTR_ERR(folio);
+ else
error = -ENOMEM;
- goto failed;
- }
+ folio = NULL;
+ goto failed;
}
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
@@ -2493,7 +2433,7 @@ repeat:
if (xa_is_value(folio)) {
error = shmem_swapin_folio(inode, index, &folio,
- sgp, gfp, vma, fault_type);
+ sgp, gfp, vmf, fault_type);
if (error == -EEXIST)
goto repeat;
@@ -2546,7 +2486,7 @@ repeat:
gfp_t huge_gfp;
huge_gfp = vma_thp_gfp_mask(vma);
- huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ huge_gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp);
folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
inode, index, fault_mm, orders);
if (!IS_ERR(folio)) {
@@ -3100,10 +3040,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
cache_no_acl(inode);
if (sbinfo->noswap)
mapping_set_unevictable(inode->i_mapping);
-
- /* Don't consider 'deny' for emergencies and 'force' for testing */
- if (sbinfo->huge)
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@@ -5510,24 +5447,74 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
static DEFINE_SPINLOCK(huge_shmem_orders_lock);
+enum huge_mode {
+ HUGE_SHMEM_ENABLED_ALWAYS = 0,
+ HUGE_SHMEM_ENABLED_INHERIT,
+ HUGE_SHMEM_ENABLED_WITHIN_SIZE,
+ HUGE_SHMEM_ENABLED_ADVISE,
+ HUGE_SHMEM_ENABLED_NEVER,
+};
+
+static const char * const huge_mode_strings[] = {
+ [HUGE_SHMEM_ENABLED_ALWAYS] = "always",
+ [HUGE_SHMEM_ENABLED_INHERIT] = "inherit",
+ [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = "within_size",
+ [HUGE_SHMEM_ENABLED_ADVISE] = "advise",
+ [HUGE_SHMEM_ENABLED_NEVER] = "never",
+};
+
+static unsigned long * const huge_mode_orders[] = {
+ [HUGE_SHMEM_ENABLED_ALWAYS] = &huge_shmem_orders_always,
+ [HUGE_SHMEM_ENABLED_INHERIT] = &huge_shmem_orders_inherit,
+ [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = &huge_shmem_orders_within_size,
+ [HUGE_SHMEM_ENABLED_ADVISE] = &huge_shmem_orders_madvise,
+};
+
static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
int order = to_thpsize(kobj)->order;
- const char *output;
-
- if (test_bit(order, &huge_shmem_orders_always))
- output = "[always] inherit within_size advise never";
- else if (test_bit(order, &huge_shmem_orders_inherit))
- output = "always [inherit] within_size advise never";
- else if (test_bit(order, &huge_shmem_orders_within_size))
- output = "always inherit [within_size] advise never";
- else if (test_bit(order, &huge_shmem_orders_madvise))
- output = "always inherit within_size [advise] never";
- else
- output = "always inherit within_size advise [never]";
+ int active = HUGE_SHMEM_ENABLED_NEVER;
+ int len = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(huge_mode_orders); i++) {
+ if (test_bit(order, huge_mode_orders[i])) {
+ active = i;
+ break;
+ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(huge_mode_strings); i++) {
+ if (i == active)
+ len += sysfs_emit_at(buf, len, "[%s] ",
+ huge_mode_strings[i]);
+ else
+ len += sysfs_emit_at(buf, len, "%s ",
+ huge_mode_strings[i]);
+ }
+
+ /* Replace trailing space with newline */
+ buf[len - 1] = '\n';
+
+ return len;
+}
+
+static bool set_shmem_enabled_mode(int order, enum huge_mode mode)
+{
+ bool changed = false;
+ enum huge_mode idx;
+
+ spin_lock(&huge_shmem_orders_lock);
+ for (idx = 0; idx < ARRAY_SIZE(huge_mode_orders); idx++) {
+ if (idx == mode)
+ changed |= !__test_and_set_bit(order, huge_mode_orders[idx]);
+ else
+ changed |= __test_and_clear_bit(order, huge_mode_orders[idx]);
+ }
+ spin_unlock(&huge_shmem_orders_lock);
- return sysfs_emit(buf, "%s\n", output);
+ return changed;
}
static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
@@ -5535,58 +5522,31 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
const char *buf, size_t count)
{
int order = to_thpsize(kobj)->order;
- ssize_t ret = count;
-
- if (sysfs_streq(buf, "always")) {
- spin_lock(&huge_shmem_orders_lock);
- clear_bit(order, &huge_shmem_orders_inherit);
- clear_bit(order, &huge_shmem_orders_madvise);
- clear_bit(order, &huge_shmem_orders_within_size);
- set_bit(order, &huge_shmem_orders_always);
- spin_unlock(&huge_shmem_orders_lock);
- } else if (sysfs_streq(buf, "inherit")) {
- /* Do not override huge allocation policy with non-PMD sized mTHP */
- if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order))
- return -EINVAL;
+ int mode;
- spin_lock(&huge_shmem_orders_lock);
- clear_bit(order, &huge_shmem_orders_always);
- clear_bit(order, &huge_shmem_orders_madvise);
- clear_bit(order, &huge_shmem_orders_within_size);
- set_bit(order, &huge_shmem_orders_inherit);
- spin_unlock(&huge_shmem_orders_lock);
- } else if (sysfs_streq(buf, "within_size")) {
- spin_lock(&huge_shmem_orders_lock);
- clear_bit(order, &huge_shmem_orders_always);
- clear_bit(order, &huge_shmem_orders_inherit);
- clear_bit(order, &huge_shmem_orders_madvise);
- set_bit(order, &huge_shmem_orders_within_size);
- spin_unlock(&huge_shmem_orders_lock);
- } else if (sysfs_streq(buf, "advise")) {
- spin_lock(&huge_shmem_orders_lock);
- clear_bit(order, &huge_shmem_orders_always);
- clear_bit(order, &huge_shmem_orders_inherit);
- clear_bit(order, &huge_shmem_orders_within_size);
- set_bit(order, &huge_shmem_orders_madvise);
- spin_unlock(&huge_shmem_orders_lock);
- } else if (sysfs_streq(buf, "never")) {
- spin_lock(&huge_shmem_orders_lock);
- clear_bit(order, &huge_shmem_orders_always);
- clear_bit(order, &huge_shmem_orders_inherit);
- clear_bit(order, &huge_shmem_orders_within_size);
- clear_bit(order, &huge_shmem_orders_madvise);
- spin_unlock(&huge_shmem_orders_lock);
- } else {
- ret = -EINVAL;
- }
+ mode = sysfs_match_string(huge_mode_strings, buf);
+ if (mode < 0)
+ return mode;
- if (ret > 0) {
- int err = start_stop_khugepaged();
+ /* Do not override huge allocation policy with non-PMD sized mTHP */
+ if (mode == HUGE_SHMEM_ENABLED_INHERIT &&
+ shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order))
+ return -EINVAL;
+ if (set_shmem_enabled_mode(order, mode)) {
+ int err = start_stop_khugepaged();
if (err)
- ret = err;
+ return err;
+ } else {
+ /*
+ * Recalculate watermarks even when the mode hasn't changed
+ * to preserve the legacy behavior, as this is always called
+ * inside start_stop_khugepaged().
+ */
+ set_recommended_min_free_kbytes();
}
- return ret;
+
+ return count;
}
struct kobj_attribute thpsize_shmem_enabled_attr =
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 76b3f750cf65..7082d01c8c9d 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -197,12 +197,13 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
- struct shrinker_info_unit *unit;
rcu_read_lock();
info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
- unit = info->unit[shrinker_id_to_index(shrinker_id)];
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
+ struct shrinker_info_unit *unit;
+
+ unit = info->unit[shrinker_id_to_index(shrinker_id)];
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
@@ -215,29 +216,26 @@ static DEFINE_IDR(shrinker_idr);
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
- int id, ret = -ENOMEM;
+ int id;
if (mem_cgroup_disabled())
return -ENOSYS;
if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB))
return -ENOSYS;
- mutex_lock(&shrinker_mutex);
+ guard(mutex)(&shrinker_mutex);
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
- goto unlock;
+ return id;
if (id >= shrinker_nr_max) {
if (expand_shrinker_info(id)) {
idr_remove(&shrinker_idr, id);
- goto unlock;
+ return -ENOMEM;
}
}
shrinker->id = id;
- ret = 0;
-unlock:
- mutex_unlock(&shrinker_mutex);
- return ret;
+ return 0;
}
static void shrinker_memcg_remove(struct shrinker *shrinker)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 6eadb9d116e4..99e2be39671b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
- void *ptr;
-
if (altmap)
return altmap_alloc_block_buf(size, altmap);
- ptr = sparse_buffer_alloc(size);
- if (!ptr)
- ptr = vmemmap_alloc_block(size, node);
- return ptr;
+ return vmemmap_alloc_block(size, node);
}
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
@@ -151,7 +146,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
unsigned long ptpfn, unsigned long flags)
{
@@ -195,7 +190,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
return p;
}
-pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
@@ -208,7 +203,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
+static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
@@ -221,7 +216,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
return pud;
}
-p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
@@ -234,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
return p4d;
}
-pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
if (pgd_none(*pgd)) {
@@ -391,12 +386,17 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
unsigned long addr, unsigned long next)
{
+ WARN_ON_ONCE(!pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL));
}
int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
unsigned long addr, unsigned long next)
{
- return 0;
+ if (!pmd_leaf(pmdp_get(pmd)))
+ return 0;
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+
+ return 1;
}
int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
@@ -652,26 +652,61 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
+static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ const unsigned int order = pgmap ? pgmap->vmemmap_shift : 0;
+ const unsigned long pages_per_compound = 1UL << order;
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
+ VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
+
+ if (!vmemmap_can_optimize(altmap, pgmap))
+ return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
+
+ if (order < PFN_SECTION_SHIFT) {
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound));
+ return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound;
+ }
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION));
+
+ if (IS_ALIGNED(pfn, pages_per_compound))
+ return VMEMMAP_RESERVE_NR;
+
+ return 0;
+}
+
static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
- return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
+ struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap,
+ pgmap);
+
+ memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
+
+ return page;
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
+ memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
vmemmap_free(start, end, altmap);
}
+
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long pfn = page_to_pfn(memmap);
+ memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
+ NULL, NULL));
vmemmap_free(start, end, NULL);
}
@@ -737,7 +772,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
* usage map, but still need to free the vmemmap range.
*/
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
@@ -774,14 +809,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early) {
- memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
- depopulate_section_memmap(pfn, nr_pages, altmap);
- } else if (memmap) {
- memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
- PAGE_SIZE)));
+ if (!section_is_early)
+ depopulate_section_memmap(pfn, nr_pages, altmap, pgmap);
+ else if (memmap)
free_map_bootmem(memmap);
- }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -823,10 +854,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
if (!memmap) {
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
return ERR_PTR(-ENOMEM);
}
- memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
@@ -885,13 +915,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
}
void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- section_deactivate(pfn, nr_pages, altmap);
+ section_deactivate(pfn, nr_pages, altmap, pgmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab1..16ac6df3c89f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -201,13 +201,11 @@ static void __init memblocks_present(void)
int i, nid;
#ifdef CONFIG_SPARSEMEM_EXTREME
- if (unlikely(!mem_section)) {
- unsigned long size, align;
+ unsigned long size, align;
- size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
- align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc_or_panic(size, align);
- }
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_alloc_or_panic(size, align);
#endif
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
@@ -241,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
- struct page *map = sparse_buffer_alloc(size);
+ struct page *map;
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- if (map)
- return map;
-
map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
@@ -256,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-static void *sparsemap_buf __meminitdata;
-static void *sparsemap_buf_end __meminitdata;
-
-static inline void __meminit sparse_buffer_free(unsigned long size)
-{
- WARN_ON(!sparsemap_buf || size == 0);
- memblock_free(sparsemap_buf, size);
-}
-
-static void __init sparse_buffer_init(unsigned long size, int nid)
-{
- phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
- /*
- * Pre-allocated buffer is mainly used by __populate_section_memmap
- * and we want it to be properly aligned to the section size - this is
- * especially the case for VMEMMAP which maps memmap to PMDs
- */
- sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
- sparsemap_buf_end = sparsemap_buf + size;
-}
-
-static void __init sparse_buffer_fini(void)
-{
- unsigned long size = sparsemap_buf_end - sparsemap_buf;
-
- if (sparsemap_buf && size > 0)
- sparse_buffer_free(size);
- sparsemap_buf = NULL;
-}
-
-void * __meminit sparse_buffer_alloc(unsigned long size)
-{
- void *ptr = NULL;
-
- if (sparsemap_buf) {
- ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
- if (ptr + size > sparsemap_buf_end)
- ptr = NULL;
- else {
- /* Free redundant aligned space */
- if ((unsigned long)(ptr - sparsemap_buf) > 0)
- sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
- sparsemap_buf = ptr + size;
- }
- }
- return ptr;
-}
-
void __weak __meminit vmemmap_populate_print_last(void)
{
}
@@ -362,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
goto failed;
}
- sparse_buffer_init(map_count * section_map_size(), nid);
-
sparse_vmemmap_init_nid_early(nid);
for_each_present_section_nr(pnum_begin, pnum) {
@@ -381,7 +325,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
__func__, nid);
pnum_begin = pnum;
sparse_usage_fini();
- sparse_buffer_fini();
goto failed;
}
memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
@@ -390,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
}
}
sparse_usage_fini();
- sparse_buffer_fini();
return;
failed:
/*
diff --git a/mm/swap.c b/mm/swap.c
index 5cc44f0de987..588f50d8f1a8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -160,13 +160,41 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
+ struct folio_batch free_fbatch;
+ bool is_lru_add = (move_fn == lru_add);
+
+ /*
+ * If we're adding to the LRU, preemptively filter dead folios. Use
+ * this dedicated folio batch for temp storage and deferred cleanup.
+ */
+ if (is_lru_add)
+ folio_batch_init(&free_fbatch);
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
/* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ if (!is_lru_add && !folio_test_clear_lru(folio))
+ continue;
+
+ /*
+ * Filter dead folios by moving them from the add batch to the temp
+ * batch for freeing after this loop.
+ *
+ * We're bypassing normal cleanup. Clear flags that are not
+ * applicable to dead folios.
+ *
+ * Since the folio may be part of a huge page, unqueue from
+ * deferred split list to avoid a dangling list entry.
+ */
+ if (is_lru_add && folio_ref_freeze(folio, 1)) {
+ __folio_clear_active(folio);
+ __folio_clear_unevictable(folio);
+ folio_unqueue_deferred_split(folio);
+ fbatch->folios[i] = NULL;
+ folio_batch_add(&free_fbatch, folio);
continue;
+ }
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec)
lruvec_unlock_irqrestore(lruvec, flags);
+
+ /* Cleanup filtered dead folios. */
+ if (is_lru_add) {
+ mem_cgroup_uncharge_folios(&free_fbatch);
+ free_unref_folios(&free_fbatch);
+ }
+
folios_put(fbatch);
}
@@ -509,10 +544,20 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- /* see the comment in lru_gen_folio_seq() */
+ /*
+ * For refaulted workingset folios, set PG_active so they
+ * can be added to active generations.
+ * For prefaulted file folios, folio_mark_accessed() sets
+ * PG_referenced so lru_gen_folio_seq() places them into
+ * the second oldest generation.
+ */
if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
- lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
- folio_set_active(folio);
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) {
+ if (folio_test_workingset(folio))
+ folio_set_active(folio);
+ else if (!folio_test_referenced(folio))
+ folio_mark_accessed(folio);
+ }
folio_batch_add_and_move(folio, lru_add);
}
@@ -964,6 +1009,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
struct folio *folio = folios->folios[i];
unsigned int nr_refs = refs ? refs[i] : 1;
+ /* Folio batch entry may have been preemptively removed during drain. */
+ if (!folio)
+ continue;
+
if (is_huge_zero_folio(folio))
continue;
diff --git a/mm/swap.h b/mm/swap.h
index a77016f2423b..77d2d14eda42 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,11 +3,29 @@
#define _MM_SWAP_H
#include <linux/atomic.h> /* for atomic_long_t */
+#include <linux/mm.h> /* for PAGE_SHIFT */
struct mempolicy;
struct swap_iocb;
+struct swap_memcg_table;
extern int page_cluster;
+#if defined(MAX_POSSIBLE_PHYSMEM_BITS)
+#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
+#elif defined(MAX_PHYSMEM_BITS)
+#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#else
+#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT)
+#endif
+
+/* Swap table marker, 0x1 means shadow, 0x2 means PFN (SWP_TB_PFN_MARK) */
+#define SWAP_CACHE_PFN_MARK_BITS 2
+/* At least 2 bits are needed to distinguish SWP_TB_COUNT_MAX, 1 and 0 */
+#define SWAP_COUNT_MIN_BITS 2
+/* If there are enough bits besides PFN and marker, store zero flag inline */
+#define SWAP_TABLE_HAS_ZEROFLAG ((BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - \
+ SWAP_CACHE_PFN_BITS) > SWAP_COUNT_MIN_BITS)
+
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
#define swap_entry_order(order) (order)
@@ -38,6 +56,12 @@ struct swap_cluster_info {
u8 order;
atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
unsigned int *extend_table; /* For large swap count, protected by ci->lock */
+#ifdef CONFIG_MEMCG
+ struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */
+#endif
+#if !SWAP_TABLE_HAS_ZEROFLAG
+ unsigned long *zero_bitmap;
+#endif
struct list_head list;
};
@@ -280,9 +304,9 @@ bool swap_cache_has_folio(swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
void swap_cache_del_folio(struct folio *folio);
-struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
- struct mempolicy *mpol, pgoff_t ilx,
- bool *alloced);
+struct folio *swap_cache_alloc_folio(swp_entry_t target_entry, gfp_t gfp_mask,
+ unsigned long orders, struct vm_fault *vmf,
+ struct mempolicy *mpol, pgoff_t ilx);
/* Below helpers require the caller to lock and pass in the swap cluster. */
void __swap_cache_add_folio(struct swap_cluster_info *ci,
struct folio *folio, swp_entry_t entry);
@@ -300,7 +324,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr);
@@ -309,49 +334,6 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
return __swap_entry_to_info(folio->swap)->flags;
}
-/*
- * Return the count of contiguous swap entries that share the same
- * zeromap status as the starting entry. If is_zeromap is not NULL,
- * it will return the zeromap status of the starting entry.
- */
-static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
- bool *is_zeromap)
-{
- struct swap_info_struct *sis = __swap_entry_to_info(entry);
- unsigned long start = swp_offset(entry);
- unsigned long end = start + max_nr;
- bool first_bit;
-
- first_bit = test_bit(start, sis->zeromap);
- if (is_zeromap)
- *is_zeromap = first_bit;
-
- if (max_nr <= 1)
- return max_nr;
- if (first_bit)
- return find_next_zero_bit(sis->zeromap, end, start) - start;
- else
- return find_next_bit(sis->zeromap, end, start) - start;
-}
-
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- int i;
-
- /*
- * While allocating a large folio and doing mTHP swapin, we need to
- * ensure all entries are not cached, otherwise, the mTHP folio will
- * be in conflict with the folio in swap cache.
- */
- for (i = 0; i < max_nr; i++) {
- if (swap_cache_has_folio(entry))
- return i;
- entry.val++;
- }
-
- return i;
-}
-
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline struct swap_cluster_info *swap_cluster_lock(
@@ -433,7 +415,9 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+static inline struct folio *swapin_sync(
+ swp_entry_t entry, gfp_t flag, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
{
return NULL;
}
@@ -488,15 +472,5 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
return 0;
}
-static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
- bool *has_zeromap)
-{
- return 0;
-}
-
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- return 0;
-}
#endif /* CONFIG_SWAP */
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
deleted file mode 100644
index de779fed8c21..000000000000
--- a/mm/swap_cgroup.c
+++ /dev/null
@@ -1,172 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/swap_cgroup.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-
-#include <linux/swapops.h> /* depends on mm.h include */
-
-static DEFINE_MUTEX(swap_cgroup_mutex);
-
-/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */
-#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short))
-#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
-#define ID_MASK (BIT(ID_SHIFT) - 1)
-struct swap_cgroup {
- atomic_t ids;
-};
-
-struct swap_cgroup_ctrl {
- struct swap_cgroup *map;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-
-static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
- pgoff_t offset)
-{
- unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
- unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);
-
- BUILD_BUG_ON(!is_power_of_2(ID_PER_SC));
- BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t));
-
- return (old_ids >> shift) & ID_MASK;
-}
-
-static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
- pgoff_t offset,
- unsigned short new_id)
-{
- unsigned short old_id;
- struct swap_cgroup *sc = &map[offset / ID_PER_SC];
- unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
- unsigned int new_ids, old_ids = atomic_read(&sc->ids);
-
- do {
- old_id = (old_ids >> shift) & ID_MASK;
- new_ids = (old_ids & ~(ID_MASK << shift));
- new_ids |= ((unsigned int)new_id) << shift;
- } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
-
- return old_id;
-}
-
-/**
- * swap_cgroup_record - record mem_cgroup for a set of swap entries.
- * These entries must belong to one single folio, and that folio
- * must be being charged for swap space (swap out), and these
- * entries must not have been charged
- *
- * @folio: the folio that the swap entry belongs to
- * @id: mem_cgroup ID to be recorded
- * @ent: the first swap entry to be recorded
- */
-void swap_cgroup_record(struct folio *folio, unsigned short id,
- swp_entry_t ent)
-{
- unsigned int nr_ents = folio_nr_pages(folio);
- struct swap_cgroup *map;
- pgoff_t offset, end;
- unsigned short old;
-
- offset = swp_offset(ent);
- end = offset + nr_ents;
- map = swap_cgroup_ctrl[swp_type(ent)].map;
-
- do {
- old = __swap_cgroup_id_xchg(map, offset, id);
- VM_BUG_ON(old);
- } while (++offset != end);
-}
-
-/**
- * swap_cgroup_clear - clear mem_cgroup for a set of swap entries.
- * These entries must be being uncharged from swap. They either
- * belongs to one single folio in the swap cache (swap in for
- * cgroup v1), or no longer have any users (slot freeing).
- *
- * @ent: the first swap entry to be recorded into
- * @nr_ents: number of swap entries to be recorded
- *
- * Returns the existing old value.
- */
-unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
-{
- pgoff_t offset, end;
- struct swap_cgroup *map;
- unsigned short old, iter = 0;
-
- offset = swp_offset(ent);
- end = offset + nr_ents;
- map = swap_cgroup_ctrl[swp_type(ent)].map;
-
- do {
- old = __swap_cgroup_id_xchg(map, offset, 0);
- if (!iter)
- iter = old;
- VM_BUG_ON(iter != old);
- } while (++offset != end);
-
- return old;
-}
-
-/**
- * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
- * @ent: swap entry to be looked up.
- *
- * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
- */
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
- struct swap_cgroup_ctrl *ctrl;
-
- if (mem_cgroup_disabled())
- return 0;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
-}
-
-int swap_cgroup_swapon(int type, unsigned long max_pages)
-{
- struct swap_cgroup *map;
- struct swap_cgroup_ctrl *ctrl;
-
- if (mem_cgroup_disabled())
- return 0;
-
- BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
- sizeof(struct swap_cgroup));
- map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) *
- sizeof(struct swap_cgroup));
- if (!map)
- goto nomem;
-
- ctrl = &swap_cgroup_ctrl[type];
- mutex_lock(&swap_cgroup_mutex);
- ctrl->map = map;
- mutex_unlock(&swap_cgroup_mutex);
-
- return 0;
-nomem:
- pr_info("couldn't allocate enough memory for swap_cgroup\n");
- pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
- return -ENOMEM;
-}
-
-void swap_cgroup_swapoff(int type)
-{
- struct swap_cgroup *map;
- struct swap_cgroup_ctrl *ctrl;
-
- if (mem_cgroup_disabled())
- return;
-
- mutex_lock(&swap_cgroup_mutex);
- ctrl = &swap_cgroup_ctrl[type];
- map = ctrl->map;
- ctrl->map = NULL;
- mutex_unlock(&swap_cgroup_mutex);
-
- vfree(map);
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..9c3a5cf99778 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -137,8 +137,71 @@ void *swap_cache_get_shadow(swp_entry_t entry)
return NULL;
}
-void __swap_cache_add_folio(struct swap_cluster_info *ci,
- struct folio *folio, swp_entry_t entry)
+/**
+ * __swap_cache_add_check - Check if a range is suitable for adding a folio.
+ * @ci: The locked swap cluster
+ * @targ_entry: The target swap entry to check, will be rounded down by @nr
+ * @nr: Number of slots to check, must be a power of 2
+ * @shadowp: Returns the shadow value of the target slot, if it has one
+ * @memcg_id: Returns the memory cgroup id, NULL to ignore cgroup check
+ *
+ * Check if all slots covered by given range have a swap count >= 1.
+ * Retrieves the shadow if there is one. If @memcg_id is not NULL, also
+ * checks if all slots belong to the same cgroup and returns the cgroup
+ * private id.
+ *
+ * Context: Caller must lock the cluster.
+ * Return: 0 if success, error code if failed.
+ */
+static int __swap_cache_add_check(struct swap_cluster_info *ci,
+ swp_entry_t targ_entry,
+ unsigned long nr, void **shadowp,
+ unsigned short *memcg_id)
+{
+ unsigned int ci_off, ci_end;
+ unsigned long old_tb;
+ bool is_zero;
+
+ lockdep_assert_held(&ci->lock);
+
+ /*
+ * If the target slot is not swapped out or already cached, return
+ * -ENOENT or -EEXIST. If the batch is not suitable, could be a
+ * race with concurrent free or cache add, return -EBUSY.
+ */
+ if (unlikely(!ci->table))
+ return -ENOENT;
+ ci_off = swp_cluster_offset(targ_entry);
+ old_tb = __swap_table_get(ci, ci_off);
+ if (swp_tb_is_folio(old_tb))
+ return -EEXIST;
+ if (!__swp_tb_get_count(old_tb))
+ return -ENOENT;
+ if (shadowp && swp_tb_is_shadow(old_tb))
+ *shadowp = swp_tb_to_shadow(old_tb);
+ if (memcg_id)
+ *memcg_id = __swap_cgroup_get(ci, ci_off);
+
+ if (nr == 1)
+ return 0;
+
+ is_zero = __swap_table_test_zero(ci, ci_off);
+ ci_off = round_down(ci_off, nr);
+ ci_end = ci_off + nr;
+ do {
+ old_tb = __swap_table_get(ci, ci_off);
+ if (unlikely(swp_tb_is_folio(old_tb) ||
+ !__swp_tb_get_count(old_tb) ||
+ is_zero != __swap_table_test_zero(ci, ci_off) ||
+ (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off))))
+ return -EBUSY;
+ } while (++ci_off < ci_end);
+
+ return 0;
+}
+
+static void __swap_cache_do_add_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry)
{
unsigned int ci_off = swp_cluster_offset(entry), ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
@@ -153,88 +216,42 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci,
do {
old_tb = __swap_table_get(ci, ci_off);
VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb));
- __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
+ __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_tb)));
} while (++ci_off < ci_end);
folio_ref_add(folio, nr_pages);
folio_set_swapcache(folio);
folio->swap = entry;
-
- node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
- lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
}
/**
- * swap_cache_add_folio - Add a folio into the swap cache.
+ * __swap_cache_add_folio - Add a folio to the swap cache and update stats.
+ * @ci: The locked swap cluster.
* @folio: The folio to be added.
* @entry: The swap entry corresponding to the folio.
- * @gfp: gfp_mask for XArray node allocation.
- * @shadowp: If a shadow is found, return the shadow.
*
- * Context: Caller must ensure @entry is valid and protect the swap device
- * with reference count or locks.
+ * Unconditionally add a folio to the swap cache. The caller must ensure
+ * all slots are usable and have no conflicts. This assigns entry to
+ * @folio->swap, increases folio refcount by the number of pages, and
+ * updates swap cache stats.
+ *
+ * Context: Caller must ensure the folio is locked and lock the cluster
+ * that holds the entries.
*/
-static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
- void **shadowp)
+void __swap_cache_add_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry)
{
- int err;
- void *shadow = NULL;
- unsigned long old_tb;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned int ci_start, ci_off, ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
- si = __swap_entry_to_info(entry);
- ci_start = swp_cluster_offset(entry);
- ci_end = ci_start + nr_pages;
- ci_off = ci_start;
- ci = swap_cluster_lock(si, swp_offset(entry));
- if (unlikely(!ci->table)) {
- err = -ENOENT;
- goto failed;
- }
- do {
- old_tb = __swap_table_get(ci, ci_off);
- if (unlikely(swp_tb_is_folio(old_tb))) {
- err = -EEXIST;
- goto failed;
- }
- if (unlikely(!__swp_tb_get_count(old_tb))) {
- err = -ENOENT;
- goto failed;
- }
- if (swp_tb_is_shadow(old_tb))
- shadow = swp_tb_to_shadow(old_tb);
- } while (++ci_off < ci_end);
- __swap_cache_add_folio(ci, folio, entry);
- swap_cluster_unlock(ci);
- if (shadowp)
- *shadowp = shadow;
- return 0;
-
-failed:
- swap_cluster_unlock(ci);
- return err;
+ __swap_cache_do_add_folio(ci, folio, entry);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
}
-/**
- * __swap_cache_del_folio - Removes a folio from the swap cache.
- * @ci: The locked swap cluster.
- * @folio: The folio.
- * @entry: The first swap entry that the folio corresponds to.
- * @shadow: shadow value to be filled in the swap cache.
- *
- * Removes a folio from the swap cache and fills a shadow in place.
- * This won't put the folio's refcount. The caller has to do that.
- *
- * Context: Caller must ensure the folio is locked and in the swap cache
- * using the index of @entry, and lock the cluster that holds the entries.
- */
-void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
- swp_entry_t entry, void *shadow)
+static void __swap_cache_do_del_folio(struct swap_cluster_info *ci,
+ struct folio *folio,
+ swp_entry_t entry, void *shadow)
{
- int count;
unsigned long old_tb;
struct swap_info_struct *si;
unsigned int ci_start, ci_off, ci_end;
@@ -254,19 +271,17 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
old_tb = __swap_table_get(ci, ci_off);
WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
swp_tb_to_folio(old_tb) != folio);
- count = __swp_tb_get_count(old_tb);
- if (count)
+ if (__swp_tb_get_count(old_tb))
folio_swapped = true;
else
need_free = true;
- /* If shadow is NULL, we sets an empty shadow. */
- __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count));
+ /* If shadow is NULL, we set an empty shadow. */
+ __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow,
+ __swp_tb_get_flags(old_tb)));
} while (++ci_off < ci_end);
folio->swap.val = 0;
folio_clear_swapcache(folio);
- node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
- lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
if (!folio_swapped) {
__swap_cluster_free_entries(si, ci, ci_start, nr_pages);
@@ -280,6 +295,29 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
}
/**
+ * __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
+ * @folio: The folio.
+ * @entry: The first swap entry that the folio corresponds to.
+ * @shadow: shadow value to be filled in the swap cache.
+ *
+ * Removes a folio from the swap cache and fills a shadow in place.
+ * This won't put the folio's refcount. The caller has to do that.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
+ */
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
+ swp_entry_t entry, void *shadow)
+{
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ __swap_cache_do_del_folio(ci, folio, entry, shadow);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
+}
+
+/**
* swap_cache_del_folio - Removes a folio from the swap cache.
* @folio: The folio.
*
@@ -333,7 +371,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
do {
old_tb = __swap_table_get(ci, ci_off);
WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
- __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
+ __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_tb)));
} while (++ci_off < ci_end);
/*
@@ -351,6 +389,153 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
}
/*
+ * Try to allocate a folio of given order in the swap cache.
+ *
+ * This helper resolves the potential races of swap allocation
+ * and prepares a folio to be used for swap IO. On failure, one of the
+ * following error codes is returned:
+ *
+ * -ENOMEM / -EBUSY: Order is too large or in conflict with sub slot,
+ * caller should shrink the order and retry
+ * -ENOENT / -EEXIST: Target swap entry is unavailable or cached, the caller
+ * should abort or try to use the cached folio instead
+ */
+static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
+ swp_entry_t targ_entry, gfp_t gfp,
+ unsigned int order, struct vm_fault *vmf,
+ struct mempolicy *mpol, pgoff_t ilx)
+{
+ int err;
+ swp_entry_t entry;
+ struct folio *folio;
+ void *shadow = NULL;
+ unsigned short memcg_id;
+ unsigned long address, nr_pages = 1UL << order;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+
+ VM_WARN_ON_ONCE(nr_pages > SWAPFILE_CLUSTER);
+ entry.val = round_down(targ_entry.val, nr_pages);
+
+ /* Check if the slot and range are available, skip allocation if not */
+ spin_lock(&ci->lock);
+ err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL, NULL);
+ spin_unlock(&ci->lock);
+ if (unlikely(err))
+ return ERR_PTR(err);
+
+ /*
+ * Limit THP gfp. The limitation is a no-op for typical
+ * GFP_HIGHUSER_MOVABLE but matters for shmem.
+ */
+ if (order)
+ gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+
+ if (mpol || !vmf) {
+ folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
+ } else {
+ address = round_down(vmf->address, PAGE_SIZE << order);
+ folio = vma_alloc_folio(gfp, order, vmf->vma, address);
+ }
+ if (unlikely(!folio))
+ return ERR_PTR(-ENOMEM);
+
+ /* Double check the range is still not in conflict */
+ spin_lock(&ci->lock);
+ err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow, &memcg_id);
+ if (unlikely(err)) {
+ spin_unlock(&ci->lock);
+ folio_put(folio);
+ return ERR_PTR(err);
+ }
+
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __swap_cache_do_add_folio(ci, folio, entry);
+ spin_unlock(&ci->lock);
+
+ if (mem_cgroup_swapin_charge_folio(folio, memcg_id,
+ vmf ? vmf->vma->vm_mm : NULL, gfp)) {
+ spin_lock(&ci->lock);
+ __swap_cache_do_del_folio(ci, folio, entry, shadow);
+ spin_unlock(&ci->lock);
+ folio_unlock(folio);
+ /* nr_pages refs from swap cache, 1 from allocation */
+ folio_put_refs(folio, nr_pages + 1);
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (order > 1 && folio_memcg_alloc_deferred(folio)) {
+ spin_lock(&ci->lock);
+ __swap_cache_do_del_folio(ci, folio, entry, shadow);
+ spin_unlock(&ci->lock);
+ folio_unlock(folio);
+ /* nr_pages refs from swap cache, 1 from allocation */
+ folio_put_refs(folio, nr_pages + 1);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* memsw uncharges swap when folio is added to swap cache */
+ memcg1_swapin(folio);
+ if (shadow)
+ workingset_refault(folio, shadow);
+
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
+
+ /* Caller will initiate read into locked folio */
+ folio_add_lru(folio);
+ return folio;
+}
+
+/**
+ * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
+ * @targ_entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: allocation orders, must be non zero
+ * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
+ *
+ * Allocate a folio in the swap cache for one swap slot, typically before
+ * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
+ * @targ_entry must have a non-zero swap count (swapped out).
+ *
+ * Context: Caller must protect the swap device with reference count or locks.
+ * Return: Returns the folio if allocation succeeded and folio is in the swap
+ * cache. Returns error code if failed due to race, OOM or invalid arguments.
+ */
+struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp,
+ unsigned long orders, struct vm_fault *vmf,
+ struct mempolicy *mpol, pgoff_t ilx)
+{
+ int order, err;
+ struct folio *ret;
+ struct swap_cluster_info *ci;
+
+ ci = __swap_entry_to_cluster(targ_entry);
+ order = highest_order(orders);
+
+ /* orders must be non-zero, and must not exceed cluster size. */
+ if (WARN_ON_ONCE(!orders || (1UL << order) > SWAPFILE_CLUSTER))
+ return ERR_PTR(-EINVAL);
+
+ do {
+ ret = __swap_cache_alloc(ci, targ_entry, gfp, order,
+ vmf, mpol, ilx);
+ if (!IS_ERR(ret))
+ break;
+ err = PTR_ERR(ret);
+ if (!order || (err && err != -EBUSY && err != -ENOMEM))
+ break;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
+ order = next_order(&orders, order);
+ } while (orders);
+
+ return ret;
+}
+
+/*
* If we are the only user, then try to free up the swap cache.
*
* Its ok to check the swapcache flag without the folio lock
@@ -448,140 +633,64 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
}
}
-/**
- * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
- * @entry: swap entry to be bound to the folio.
- * @folio: folio to be added.
- * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
- * @charged: if the folio is already charged.
- *
- * Update the swap_map and add folio as swap cache, typically before swapin.
- * All swap slots covered by the folio must have a non-zero swap count.
- *
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: Returns the folio being added on success. Returns the existing folio
- * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
- */
-static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
- struct folio *folio,
- gfp_t gfp, bool charged)
+static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
+ struct mempolicy *mpol, pgoff_t ilx,
+ struct swap_iocb **plug, bool readahead)
{
- struct folio *swapcache = NULL;
- void *shadow;
- int ret;
-
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
- goto failed;
+ struct folio *folio;
- for (;;) {
- ret = swap_cache_add_folio(folio, entry, &shadow);
- if (!ret)
- break;
+ do {
+ folio = swap_cache_get_folio(entry);
+ if (folio)
+ return folio;
+ folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx);
+ } while (PTR_ERR(folio) == -EEXIST);
- /*
- * Large order allocation needs special handling on
- * race: if a smaller folio exists in cache, swapin needs
- * to fallback to order 0, and doing a swap cache lookup
- * might return a folio that is irrelevant to the faulting
- * entry because @entry is aligned down. Just return NULL.
- */
- if (ret != -EEXIST || folio_test_large(folio))
- goto failed;
+ if (IS_ERR_OR_NULL(folio))
+ return NULL;
- swapcache = swap_cache_get_folio(entry);
- if (swapcache)
- goto failed;
+ swap_read_folio(folio, plug);
+ if (readahead) {
+ folio_set_readahead(folio);
+ count_vm_event(SWAP_RA);
}
- memcg1_swapin(entry, folio_nr_pages(folio));
- if (shadow)
- workingset_refault(folio, shadow);
-
- /* Caller will initiate read into locked folio */
- folio_add_lru(folio);
return folio;
-
-failed:
- folio_unlock(folio);
- return swapcache;
}
/**
- * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
- * @entry: the swapped out swap entry to be binded to the folio.
- * @gfp_mask: memory allocation flags
+ * swapin_sync - swap-in one or multiple entries skipping readahead.
+ * @entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: allocation orders
+ * @vmf: fault information
* @mpol: NUMA memory allocation policy to be applied
* @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
- * @new_page_allocated: sets true if allocation happened, false otherwise
*
- * Allocate a folio in the swap cache for one swap slot, typically before
- * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
- * @entry must have a non-zero swap count (swapped out).
- * Currently only supports order 0.
+ * This allocates a folio suitable for given @orders, or returns the
+ * existing folio in the swap cache for @entry. This initiates the IO, too,
+ * if needed. @entry is rounded down if @orders allow large allocation.
*
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: Returns the existing folio if @entry is cached already. Returns
- * NULL if failed due to -ENOMEM or @entry have a swap count < 1.
+ * Context: Caller must ensure @entry is valid and pin the swap device with refcount.
+ * Return: Returns the folio on success, error code if failed.
*/
-struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx,
- bool *new_page_allocated)
+struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders,
+ struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
- struct folio *result = NULL;
- *new_page_allocated = false;
- /* Check the swap cache again for readahead path. */
- folio = swap_cache_get_folio(entry);
- if (folio)
- return folio;
-
- /* Skip allocation for unused and bad swap slot for readahead. */
- if (!swap_entry_swapped(si, entry))
- return NULL;
-
- /* Allocate a new folio to be added into the swap cache. */
- folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
- if (!folio)
- return NULL;
- /* Try add the new folio, returns existing folio or NULL on failure. */
- result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
- if (result == folio)
- *new_page_allocated = true;
- else
- folio_put(folio);
- return result;
-}
+ do {
+ folio = swap_cache_get_folio(entry);
+ if (folio)
+ return folio;
+ folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
+ } while (PTR_ERR(folio) == -EEXIST);
-/**
- * swapin_folio - swap-in one or multiple entries skipping readahead.
- * @entry: starting swap entry to swap in
- * @folio: a new allocated and charged folio
- *
- * Reads @entry into @folio, @folio will be added to the swap cache.
- * If @folio is a large folio, the @entry will be rounded down to align
- * with the folio size.
- *
- * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
- */
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
-{
- struct folio *swapcache;
- pgoff_t offset = swp_offset(entry);
- unsigned long nr_pages = folio_nr_pages(folio);
+ if (IS_ERR(folio))
+ return folio;
- entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
- swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
- if (swapcache == folio)
- swap_read_folio(folio, NULL);
- return swapcache;
+ swap_read_folio(folio, NULL);
+ return folio;
}
/*
@@ -595,7 +704,6 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct swap_iocb **plug)
{
struct swap_info_struct *si;
- bool page_allocated;
struct mempolicy *mpol;
pgoff_t ilx;
struct folio *folio;
@@ -605,13 +713,9 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return NULL;
mpol = get_vma_policy(vma, addr, 0, &ilx);
- folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated);
+ folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx, plug, false);
mpol_cond_put(mpol);
- if (page_allocated)
- swap_read_folio(folio, plug);
-
put_swap_device(si);
return folio;
}
@@ -696,7 +800,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* are fairly likely to have been swapped out from the same node.
*/
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx)
+ struct mempolicy *mpol, pgoff_t ilx)
{
struct folio *folio;
unsigned long entry_offset = swp_offset(entry);
@@ -706,7 +810,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct swap_info_struct *si = __swap_entry_to_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
- bool page_allocated;
+ swp_entry_t ra_entry;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -723,18 +827,11 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
- folio = swap_cache_alloc_folio(
- swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
- &page_allocated);
+ ra_entry = swp_entry(swp_type(entry), offset);
+ folio = swap_cache_read_folio(ra_entry, gfp_mask, mpol, ilx,
+ &splug, offset != entry_offset);
if (!folio)
continue;
- if (page_allocated) {
- swap_read_folio(folio, &splug);
- if (offset != entry_offset) {
- folio_set_readahead(folio);
- count_vm_event(SWAP_RA);
- }
- }
folio_put(folio);
}
blk_finish_plug(&plug);
@@ -742,11 +839,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated);
- if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
- return folio;
+ return swap_cache_read_folio(entry, gfp_mask, mpol, ilx, NULL, false);
}
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
@@ -812,8 +905,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
pte_t *pte = NULL, pentry;
int win;
unsigned long start, end, addr;
- pgoff_t ilx;
- bool page_allocated;
+ pgoff_t ilx = targ_ilx;
win = swap_vma_ra_win(vmf, &start, &end);
if (win == 1)
@@ -847,19 +939,12 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!si)
continue;
}
- folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
- &page_allocated);
+ folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx,
+ &splug, addr != vmf->address);
if (si)
put_swap_device(si);
if (!folio)
continue;
- if (page_allocated) {
- swap_read_folio(folio, &splug);
- if (addr != vmf->address) {
- folio_set_readahead(folio);
- count_vm_event(SWAP_RA);
- }
- }
folio_put(folio);
}
if (pte)
@@ -869,10 +954,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
lru_add_drain();
skip:
/* The folio was likely read above, so no need for plugging here */
- folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
- &page_allocated);
- if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
+ folio = swap_cache_read_folio(targ_entry, gfp_mask, mpol, targ_ilx,
+ NULL, false);
return folio;
}
diff --git a/mm/swap_table.h b/mm/swap_table.h
index 8415ffbe2b9c..e6613e62f8d0 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -11,6 +11,11 @@ struct swap_table {
atomic_long_t entries[SWAPFILE_CLUSTER];
};
+/* For storing memcg private id */
+struct swap_memcg_table {
+ unsigned short id[SWAPFILE_CLUSTER];
+};
+
#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
/*
@@ -21,12 +26,14 @@ struct swap_table {
* Swap table entry type and bits layouts:
*
* NULL: |---------------- 0 ---------------| - Free slot
- * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot
- * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot
+ * Shadow: |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot
+ * PFN: |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot
* Pointer: |----------- Pointer ----------|100| - (Unused)
* Bad: |------------- 1 -------------|1000| - Bad slot
*
- * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long.
+ * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, Z is the `SWP_TB_ZERO_FLAG` bit,
+ * and together they form the `SWP_TB_FLAGS_BITS` wide flags field.
+ * Each entry is an atomic long.
*
* Usages:
*
@@ -49,14 +56,6 @@ struct swap_table {
* - Bad: Swap slot is reserved, protects swap header or holes on swap devices.
*/
-#if defined(MAX_POSSIBLE_PHYSMEM_BITS)
-#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
-#elif defined(MAX_PHYSMEM_BITS)
-#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#else
-#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT)
-#endif
-
/* NULL Entry, all 0 */
#define SWP_TB_NULL 0UL
@@ -64,22 +63,26 @@ struct swap_table {
#define SWP_TB_SHADOW_MARK 0b1UL
/* Cached: PFN */
-#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS)
+#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWAP_CACHE_PFN_MARK_BITS)
#define SWP_TB_PFN_MARK 0b10UL
-#define SWP_TB_PFN_MARK_BITS 2
-#define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1)
+#define SWP_TB_PFN_MARK_MASK (BIT(SWAP_CACHE_PFN_MARK_BITS) - 1)
-/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */
-#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS)
+/* Flags: for PFN or shadow, SWAP_COUNT plus zero flag; width may vary */
+#define SWP_TB_FLAGS_BITS min(5, BITS_PER_LONG - SWP_TB_PFN_BITS)
+#define SWP_TB_COUNT_BITS (SWP_TB_FLAGS_BITS - SWAP_TABLE_HAS_ZEROFLAG)
+#define SWP_TB_FLAGS_MASK (~((~0UL) >> SWP_TB_FLAGS_BITS))
#define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS))
+#define SWP_TB_FLAGS_SHIFT (BITS_PER_LONG - SWP_TB_FLAGS_BITS)
#define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS)
#define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1)
+/* The lowest bit of the flags field is the zero flag (SWAP_TABLE_HAS_ZEROFLAG) */
+#define SWP_TB_ZERO_FLAG BIT(BITS_PER_LONG - SWP_TB_FLAGS_BITS)
/* Bad slot: ends with 0b1000 and rests of bits are all 1 */
#define SWP_TB_BAD ((~0UL) << 3)
/* Macro for shadow offset calculation */
-#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS
+#define SWAP_COUNT_SHIFT SWP_TB_FLAGS_BITS
/*
* Helpers for casting one type of info into a swap table entry.
@@ -97,40 +100,47 @@ static inline unsigned long __count_to_swp_tb(unsigned char count)
* used (count > 0 && count < SWP_TB_COUNT_MAX), and
* overflow (count == SWP_TB_COUNT_MAX).
*/
- BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2);
+ BUILD_BUG_ON(SWP_TB_COUNT_BITS < SWAP_COUNT_MIN_BITS);
VM_WARN_ON(count > SWP_TB_COUNT_MAX);
return ((unsigned long)count) << SWP_TB_COUNT_SHIFT;
}
-static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count)
+static inline unsigned long __flags_to_swp_tb(unsigned char flags)
+{
+ BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE);
+ VM_WARN_ON(flags >> SWP_TB_FLAGS_BITS);
+ return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT;
+}
+
+static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char flags)
{
unsigned long swp_tb;
BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *));
BUILD_BUG_ON(SWAP_CACHE_PFN_BITS >
- (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS));
+ (BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - SWP_TB_FLAGS_BITS));
- swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK;
- VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK);
+ swp_tb = (pfn << SWAP_CACHE_PFN_MARK_BITS) | SWP_TB_PFN_MARK;
+ VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK);
- return swp_tb | __count_to_swp_tb(count);
+ return swp_tb | __flags_to_swp_tb(flags);
}
-static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count)
+static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned char flags)
{
- return pfn_to_swp_tb(folio_pfn(folio), count);
+ return pfn_to_swp_tb(folio_pfn(folio), flags);
}
-static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count)
+static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char flags)
{
BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) !=
BITS_PER_BYTE * sizeof(unsigned long));
BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK);
VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow));
- VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK));
+ VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK));
- return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK;
+ return (unsigned long)shadow | SWP_TB_SHADOW_MARK | __flags_to_swp_tb(flags);
}
/*
@@ -168,14 +178,14 @@ static inline bool swp_tb_is_countable(unsigned long swp_tb)
static inline struct folio *swp_tb_to_folio(unsigned long swp_tb)
{
VM_WARN_ON(!swp_tb_is_folio(swp_tb));
- return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS);
+ return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWAP_CACHE_PFN_MARK_BITS);
}
static inline void *swp_tb_to_shadow(unsigned long swp_tb)
{
VM_WARN_ON(!swp_tb_is_shadow(swp_tb));
/* No shift needed, xa_value is stored as it is in the lower bits. */
- return (void *)(swp_tb & ~SWP_TB_COUNT_MASK);
+ return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK);
}
static inline unsigned char __swp_tb_get_count(unsigned long swp_tb)
@@ -184,6 +194,12 @@ static inline unsigned char __swp_tb_get_count(unsigned long swp_tb)
return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT);
}
+static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb)
+{
+ VM_WARN_ON(!swp_tb_is_countable(swp_tb));
+ return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT);
+}
+
static inline int swp_tb_get_count(unsigned long swp_tb)
{
if (swp_tb_is_countable(swp_tb))
@@ -247,4 +263,107 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
return swp_tb;
}
+
+static inline void __swap_table_set_zero(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+#if SWAP_TABLE_HAS_ZEROFLAG
+ unsigned long swp_tb = __swap_table_get(ci, ci_off);
+
+ BUILD_BUG_ON(SWP_TB_ZERO_FLAG & ~SWP_TB_FLAGS_MASK);
+ VM_WARN_ON(!swp_tb_is_countable(swp_tb));
+ swp_tb |= SWP_TB_ZERO_FLAG;
+ __swap_table_set(ci, ci_off, swp_tb);
+#else
+ lockdep_assert_held(&ci->lock);
+ __set_bit(ci_off, ci->zero_bitmap);
+#endif
+}
+
+static inline bool __swap_table_test_zero(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+#if SWAP_TABLE_HAS_ZEROFLAG
+ unsigned long swp_tb = __swap_table_get(ci, ci_off);
+
+ VM_WARN_ON(!swp_tb_is_countable(swp_tb));
+ return !!(swp_tb & SWP_TB_ZERO_FLAG);
+#else
+ return test_bit(ci_off, ci->zero_bitmap);
+#endif
+}
+
+static inline void __swap_table_clear_zero(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+#if SWAP_TABLE_HAS_ZEROFLAG
+ unsigned long swp_tb = __swap_table_get(ci, ci_off);
+
+ VM_WARN_ON(!swp_tb_is_countable(swp_tb));
+ swp_tb &= ~SWP_TB_ZERO_FLAG;
+ __swap_table_set(ci, ci_off, swp_tb);
+#else
+ lockdep_assert_held(&ci->lock);
+ __clear_bit(ci_off, ci->zero_bitmap);
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+ unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+ if (WARN_ON_ONCE(!ci->memcg_table))
+ return;
+ do {
+ ci->memcg_table->id[ci_off++] = id;
+ } while (--nr);
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+ if (unlikely(!ci->memcg_table))
+ return 0;
+ return ci->memcg_table->id[ci_off];
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+ unsigned int ci_off,
+ unsigned long nr)
+{
+ unsigned short old = __swap_cgroup_get(ci, ci_off);
+
+ if (!old)
+ return 0;
+ do {
+ VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old);
+ ci->memcg_table->id[ci_off++] = 0;
+ } while (--nr);
+
+ return old;
+}
+#else
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+ unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ return 0;
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+ unsigned int ci_off,
+ unsigned long nr)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9174f1eeffb0..78b49b0658ad 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,6 @@
#include <asm/tlbflush.h>
#include <linux/leafops.h>
-#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
#include "swap.h"
@@ -133,7 +132,7 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
/* May return NULL on invalid type, caller must check for NULL return */
static struct swap_info_struct *swap_type_to_info(int type)
{
- if (type >= MAX_SWAPFILES)
+ if (type < 0 || type >= MAX_SWAPFILES)
return NULL;
return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
@@ -411,20 +410,7 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si,
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static struct swap_table *swap_table_alloc(gfp_t gfp)
-{
- struct folio *folio;
-
- if (!SWP_TABLE_USE_PAGE)
- return kmem_cache_zalloc(swap_table_cachep, gfp);
-
- folio = folio_alloc(gfp | __GFP_ZERO, 0);
- if (folio)
- return folio_address(folio);
- return NULL;
-}
-
-static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
+static void swap_cluster_free_table_folio_rcu_cb(struct rcu_head *head)
{
struct folio *folio;
@@ -432,15 +418,76 @@ static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
folio_put(folio);
}
-static void swap_table_free(struct swap_table *table)
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
+ struct swap_table *table;
+
+#ifdef CONFIG_MEMCG
+ kfree(ci->memcg_table);
+ ci->memcg_table = NULL;
+#endif
+
+#if !SWAP_TABLE_HAS_ZEROFLAG
+ kfree(ci->zero_bitmap);
+ ci->zero_bitmap = NULL;
+#endif
+
+ table = (struct swap_table *)rcu_access_pointer(ci->table);
+ if (!table)
+ return;
+
+ rcu_assign_pointer(ci->table, NULL);
if (!SWP_TABLE_USE_PAGE) {
kmem_cache_free(swap_table_cachep, table);
return;
}
call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
- swap_table_free_folio_rcu_cb);
+ swap_cluster_free_table_folio_rcu_cb);
+}
+
+static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
+{
+ struct swap_table *table = NULL;
+ struct folio *folio;
+
+ /* The cluster must be empty and not on any list during allocation. */
+ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
+ if (rcu_access_pointer(ci->table))
+ return 0;
+
+ if (SWP_TABLE_USE_PAGE) {
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (folio)
+ table = folio_address(folio);
+ } else {
+ table = kmem_cache_zalloc(swap_table_cachep, gfp);
+ }
+ if (!table)
+ return -ENOMEM;
+
+ rcu_assign_pointer(ci->table, table);
+
+#ifdef CONFIG_MEMCG
+ if (!mem_cgroup_disabled()) {
+ VM_WARN_ON_ONCE(ci->memcg_table);
+ ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp);
+ if (!ci->memcg_table) {
+ swap_cluster_free_table(ci);
+ return -ENOMEM;
+ }
+ }
+#endif
+
+#if !SWAP_TABLE_HAS_ZEROFLAG
+ VM_WARN_ON_ONCE(ci->zero_bitmap);
+ ci->zero_bitmap = bitmap_zalloc(SWAPFILE_CLUSTER, gfp);
+ if (!ci->zero_bitmap) {
+ swap_cluster_free_table(ci);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
}
/*
@@ -465,33 +512,22 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci,
bad_slots++;
else
WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off));
} while (++ci_off < ci_end);
WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table);
}
-static void swap_cluster_free_table(struct swap_cluster_info *ci)
-{
- struct swap_table *table;
-
- /* Only empty cluster's table is allow to be freed */
- lockdep_assert_held(&ci->lock);
- table = (void *)rcu_dereference_protected(ci->table, true);
- rcu_assign_pointer(ci->table, NULL);
-
- swap_table_free(table);
-}
-
/*
* Allocate swap table for one cluster. Attempt an atomic allocation first,
* then fallback to sleeping allocation.
*/
static struct swap_cluster_info *
-swap_cluster_alloc_table(struct swap_info_struct *si,
+swap_cluster_populate(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
- struct swap_table *table;
+ int ret;
/*
* Only cluster isolation from the allocator does table allocation.
@@ -502,14 +538,9 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
lockdep_assert_held(&si->global_cluster_lock);
lockdep_assert_held(&ci->lock);
- /* The cluster must be free and was just isolated from the free list. */
- VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
-
- table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (table) {
- rcu_assign_pointer(ci->table, table);
+ if (!swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC |
+ __GFP_NOWARN))
return ci;
- }
/*
* Try a sleep allocation. Each isolated free cluster may cause
@@ -521,7 +552,8 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
spin_unlock(&si->global_cluster_lock);
local_unlock(&percpu_swap_cluster.lock);
- table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
+ ret = swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC |
+ GFP_KERNEL);
/*
* Back to atomic context. We might have migrated to a new CPU with a
@@ -536,20 +568,11 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
spin_lock(&si->global_cluster_lock);
spin_lock(&ci->lock);
- /* Nothing except this helper should touch a dangling empty cluster. */
- if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
- if (table)
- swap_table_free(table);
- return ci;
- }
-
- if (!table) {
+ if (ret) {
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
spin_unlock(&ci->lock);
return NULL;
}
-
- rcu_assign_pointer(ci->table, table);
return ci;
}
@@ -621,12 +644,11 @@ static struct swap_cluster_info *isolate_lock_cluster(
}
spin_unlock(&si->lock);
- if (found && !cluster_table_is_alloced(found)) {
- /* Only an empty free cluster's swap table can be freed. */
- VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE);
+ /* Cluster's table is freed when and only when it's on the free list. */
+ if (found && flags == CLUSTER_FLAG_FREE) {
VM_WARN_ON_ONCE(list != &si->free_clusters);
- VM_WARN_ON_ONCE(!cluster_is_empty(found));
- return swap_cluster_alloc_table(si, found);
+ VM_WARN_ON_ONCE(cluster_table_is_alloced(found));
+ return swap_cluster_populate(si, found);
}
return found;
@@ -769,7 +791,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si,
unsigned int ci_off = offset % SWAPFILE_CLUSTER;
unsigned long idx = offset / SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
- struct swap_table *table;
int ret = 0;
/* si->max may got shrunk by swap swap_activate() */
@@ -790,12 +811,9 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si,
}
ci = cluster_info + idx;
- if (!ci->table) {
- table = swap_table_alloc(GFP_KERNEL);
- if (!table)
- return -ENOMEM;
- rcu_assign_pointer(ci->table, table);
- }
+ /* Need to allocate swap table first for initial bad slot marking. */
+ if (!ci->count && swap_cluster_alloc_table(ci, GFP_KERNEL))
+ return -ENOMEM;
spin_lock(&ci->lock);
/* Check for duplicated bad swap slots. */
if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) {
@@ -922,8 +940,8 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si,
order = 0;
nr_pages = 1;
swap_cluster_assert_empty(ci, ci_off, 1, false);
- /* Sets a fake shadow as placeholder */
- __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1));
+ /* Fake shadow placeholder with no flag, hibernation does not use the zeromap */
+ __swap_table_set(ci, ci_off, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0), 1));
} else {
/* Allocation without folio is only possible with hibernation */
WARN_ON_ONCE(1);
@@ -1054,6 +1072,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
+ cond_resched();
}
}
@@ -1295,14 +1314,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
void (*swap_slot_free_notify)(struct block_device *, unsigned long);
unsigned int i;
- /*
- * Use atomic clear_bit operations only on zeromap instead of non-atomic
- * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
- */
- for (i = 0; i < nr_entries; i++) {
- clear_bit(offset + i, si->zeromap);
+ for (i = 0; i < nr_entries; i++)
zswap_invalidate(swp_entry(si->type, offset + i));
- }
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
@@ -1442,8 +1455,10 @@ start_over:
}
static int swap_extend_table_alloc(struct swap_info_struct *si,
- struct swap_cluster_info *ci, gfp_t gfp)
+ struct swap_cluster_info *ci,
+ unsigned int ci_off, gfp_t gfp)
{
+ int count;
void *table;
table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp);
@@ -1451,11 +1466,27 @@ static int swap_extend_table_alloc(struct swap_info_struct *si,
return -ENOMEM;
spin_lock(&ci->lock);
- if (!ci->extend_table)
- ci->extend_table = table;
- else
- kfree(table);
+ /*
+ * The ci lock was released for this allocation, so recheck under
+ * the lock: the slot may have been freed, its count may no longer
+ * be overflowing, or a concurrent extend table allocation may have
+ * already succeeded. In those cases the new table is not needed.
+ */
+ if (!cluster_table_is_alloced(ci))
+ goto out_free;
+ count = swp_tb_get_count(__swap_table_get(ci, ci_off));
+ if (count < (SWP_TB_COUNT_MAX - 1))
+ goto out_free;
+ if (ci->extend_table)
+ goto out_free;
+
+ ci->extend_table = table;
+ spin_unlock(&ci->lock);
+ return 0;
+
+out_free:
spin_unlock(&ci->lock);
+ kfree(table);
return 0;
}
@@ -1471,7 +1502,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
return 0;
ci = __swap_offset_to_cluster(si, offset);
- ret = swap_extend_table_alloc(si, ci, gfp);
+ ret = swap_extend_table_alloc(si, ci, swp_cluster_offset(entry), gfp);
put_swap_device(si);
return ret;
@@ -1518,13 +1549,21 @@ static void __swap_cluster_put_entry(struct swap_cluster_info *ci,
if (count == (SWP_TB_COUNT_MAX - 1)) {
ci->extend_table[ci_off] = 0;
__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count));
- swap_extend_table_try_free(ci);
} else {
ci->extend_table[ci_off] = count;
}
} else {
__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count));
}
+
+ /*
+ * `SWP_TB_COUNT_MAX - 1` triggers extend table allocation. If the
+ * count was above that, then the extend table is no longer needed,
+ * so free it. And if we just put the count value from MAX - 1, it's
+ * also possible that a pending dup just attached an extend table.
+ */
+ if (unlikely(count == SWP_TB_COUNT_MAX - 2 || count == SWP_TB_COUNT_MAX - 1))
+ swap_extend_table_try_free(ci);
}
/**
@@ -1664,7 +1703,7 @@ restart:
if (unlikely(err)) {
if (err == -ENOMEM) {
spin_unlock(&ci->lock);
- err = swap_extend_table_alloc(si, ci, GFP_ATOMIC);
+ err = swap_extend_table_alloc(si, ci, ci_off, GFP_ATOMIC);
spin_lock(&ci->lock);
if (!err)
goto restart;
@@ -1730,7 +1769,7 @@ again:
}
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
- if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
+ if (unlikely(mem_cgroup_try_charge_swap(folio)))
swap_cache_del_folio(folio);
if (unlikely(!folio_test_swapcache(folio)))
@@ -1826,8 +1865,7 @@ void folio_put_swap(struct folio *folio, struct page *subpage)
* do_swap_page()
* ... swapoff+swapon
* swap_cache_alloc_folio()
- * swap_cache_add_folio()
- * // check swap_map
+ * // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before
@@ -1873,21 +1911,44 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
unsigned int ci_start, unsigned int nr_pages)
{
unsigned long old_tb;
+ unsigned short batch_id = 0, id_cur;
unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
- unsigned long offset = cluster_offset(si, ci) + ci_start;
+ unsigned long ci_head = cluster_offset(si, ci);
+ unsigned int batch_off = ci_off;
VM_WARN_ON(ci->count < nr_pages);
ci->count -= nr_pages;
do {
old_tb = __swap_table_get(ci, ci_off);
- /* Release the last ref, or after swap cache is dropped */
+ /*
+ * Freeing is done after release of the last swap count
+ * ref, or after swap cache is dropped
+ */
VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1);
+
+ /* Resetting the slot to NULL also clears the inline flags. */
__swap_table_set(ci, ci_off, null_to_swp_tb());
+ if (!SWAP_TABLE_HAS_ZEROFLAG)
+ __swap_table_clear_zero(ci, ci_off);
+
+ /*
+ * Uncharge swap slots by memcg in batches. Consecutive
+ * slots with the same cgroup id are uncharged together.
+ */
+ id_cur = __swap_cgroup_clear(ci, ci_off, 1);
+ if (batch_id != id_cur) {
+ if (batch_id)
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+ batch_id = id_cur;
+ batch_off = ci_off;
+ }
} while (++ci_off < ci_end);
- mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages);
- swap_range_free(si, offset, nr_pages);
+ if (batch_id)
+ mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off);
+
+ swap_range_free(si, ci_head + ci_start, nr_pages);
swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
if (!ci->count)
@@ -2077,7 +2138,16 @@ out:
}
#ifdef CONFIG_HIBERNATION
-/* Allocate a slot for hibernation */
+/**
+ * swap_alloc_hibernation_slot() - Allocate a swap slot for hibernation.
+ * @type: swap device type index to allocate from.
+ *
+ * The caller must ensure the swap device is stable, either by pinning
+ * it (SWP_HIBERNATION) or by freezing user-space.
+ *
+ * Return: a valid swp_entry_t on success, or an empty entry (val == 0)
+ * on failure.
+ */
swp_entry_t swap_alloc_hibernation_slot(int type)
{
struct swap_info_struct *pcp_si, *si = swap_type_to_info(type);
@@ -2088,46 +2158,42 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
if (!si)
goto fail;
- /* This is called for allocating swap entry, not cache */
- if (get_swap_device_info(si)) {
- if (si->flags & SWP_WRITEOK) {
- /*
- * Try the local cluster first if it matches the device. If
- * not, try grab a new cluster and override local cluster.
- */
- local_lock(&percpu_swap_cluster.lock);
- pcp_si = this_cpu_read(percpu_swap_cluster.si[0]);
- pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]);
- if (pcp_si == si && pcp_offset) {
- ci = swap_cluster_lock(si, pcp_offset);
- if (cluster_is_usable(ci, 0))
- offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset);
- else
- swap_cluster_unlock(ci);
- }
- if (!offset)
- offset = cluster_alloc_swap_entry(si, NULL);
- local_unlock(&percpu_swap_cluster.lock);
- if (offset)
- entry = swp_entry(si->type, offset);
- }
- put_swap_device(si);
+ /*
+ * Try the local cluster first if it matches the device. If
+ * not, try to grab a new cluster and override the local cluster.
+ */
+ local_lock(&percpu_swap_cluster.lock);
+ pcp_si = this_cpu_read(percpu_swap_cluster.si[0]);
+ pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]);
+ if (pcp_si == si && pcp_offset) {
+ ci = swap_cluster_lock(si, pcp_offset);
+ if (cluster_is_usable(ci, 0))
+ offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset);
+ else
+ swap_cluster_unlock(ci);
}
+ if (!offset)
+ offset = cluster_alloc_swap_entry(si, NULL);
+ local_unlock(&percpu_swap_cluster.lock);
+ if (offset)
+ entry = swp_entry(si->type, offset);
+
fail:
return entry;
}
-/* Free a slot allocated by swap_alloc_hibernation_slot */
+/**
+ * swap_free_hibernation_slot() - Free a swap slot allocated for hibernation.
+ * @entry: swap entry to free.
+ *
+ * The caller must ensure the swap device is stable.
+ */
void swap_free_hibernation_slot(swp_entry_t entry)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct swap_cluster_info *ci;
pgoff_t offset = swp_offset(entry);
- si = get_swap_device(entry);
- if (WARN_ON(!si))
- return;
-
ci = swap_cluster_lock(si, offset);
__swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER);
__swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1);
@@ -2135,25 +2201,17 @@ void swap_free_hibernation_slot(swp_entry_t entry)
/* In theory readahead might add it to the swap cache by accident */
__try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
- put_swap_device(si);
}
-/*
- * Find the swap type that corresponds to given device (if any).
- *
- * @offset - number of the PAGE_SIZE-sized block of the device, starting
- * from 0, in which the swap header is expected to be located.
- *
- * This is needed for the suspend to disk (aka swsusp).
- */
-int swap_type_of(dev_t device, sector_t offset)
+static int __find_hibernation_swap_type(dev_t device, sector_t offset)
{
int type;
+ lockdep_assert_held(&swap_lock);
+
if (!device)
- return -1;
+ return -EINVAL;
- spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
struct swap_info_struct *sis = swap_info[type];
@@ -2163,16 +2221,118 @@ int swap_type_of(dev_t device, sector_t offset)
if (device == sis->bdev->bd_dev) {
struct swap_extent *se = first_se(sis);
- if (se->start_block == offset) {
- spin_unlock(&swap_lock);
+ if (se->start_block == offset)
return type;
- }
}
}
- spin_unlock(&swap_lock);
return -ENODEV;
}
+/**
+ * pin_hibernation_swap_type - Pin the swap device for hibernation
+ * @device: Block device containing the resume image
+ * @offset: Offset identifying the swap area
+ *
+ * Locate the swap device for @device/@offset and mark it as pinned
+ * for hibernation. While pinned, swapoff() is prevented.
+ *
+ * Only one uswsusp context may pin a swap device at a time.
+ * If already pinned, this function returns -EBUSY.
+ *
+ * Return:
+ * >= 0 on success (swap type).
+ * -EINVAL if @device is invalid.
+ * -ENODEV if the swap device is not found.
+ * -EBUSY if the device is already pinned for hibernation.
+ */
+int pin_hibernation_swap_type(dev_t device, sector_t offset)
+{
+ int type;
+ struct swap_info_struct *si;
+
+ spin_lock(&swap_lock);
+
+ type = __find_hibernation_swap_type(device, offset);
+ if (type < 0) {
+ spin_unlock(&swap_lock);
+ return type;
+ }
+
+ si = swap_type_to_info(type);
+ if (WARN_ON_ONCE(!si)) {
+ spin_unlock(&swap_lock);
+ return -ENODEV;
+ }
+
+ /*
+ * hibernate_acquire() prevents concurrent hibernation sessions.
+ * This check additionally guards against double-pinning within
+ * the same session.
+ */
+ if (WARN_ON_ONCE(si->flags & SWP_HIBERNATION)) {
+ spin_unlock(&swap_lock);
+ return -EBUSY;
+ }
+
+ si->flags |= SWP_HIBERNATION;
+
+ spin_unlock(&swap_lock);
+ return type;
+}
+
+/**
+ * unpin_hibernation_swap_type - Unpin the swap device for hibernation
+ * @type: Swap type previously returned by pin_hibernation_swap_type()
+ *
+ * Clear the hibernation pin on the given swap device, allowing
+ * swapoff() to proceed normally.
+ *
+ * If @type does not refer to a valid swap device, this function
+ * does nothing.
+ */
+void unpin_hibernation_swap_type(int type)
+{
+ struct swap_info_struct *si;
+
+ spin_lock(&swap_lock);
+ si = swap_type_to_info(type);
+ if (!si) {
+ spin_unlock(&swap_lock);
+ return;
+ }
+ si->flags &= ~SWP_HIBERNATION;
+ spin_unlock(&swap_lock);
+}
+
+/**
+ * find_hibernation_swap_type - Find swap type for hibernation
+ * @device: Block device containing the resume image
+ * @offset: Offset within the device identifying the swap area
+ *
+ * Locate the swap device corresponding to @device and @offset.
+ *
+ * Unlike pin_hibernation_swap_type(), this function only performs a
+ * lookup and does not mark the swap device as pinned for hibernation.
+ *
+ * This is safe in the sysfs-based hibernation path where user space
+ * is already frozen and swapoff() cannot run concurrently.
+ *
+ * Return:
+ * A non-negative swap type on success.
+ * -EINVAL if @device is invalid.
+ * -ENODEV if no matching swap device is found.
+ */
+int find_hibernation_swap_type(dev_t device, sector_t offset)
+{
+ int type;
+
+ spin_lock(&swap_lock);
+ type = __find_hibernation_swap_type(device, offset);
+ spin_unlock(&swap_lock);
+
+ return type;
+}
+
int find_first_swap(dev_t *device)
{
int type;
@@ -2869,7 +3029,7 @@ static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
ci = cluster_info + i;
/* Cluster with bad marks count will have a remaining table */
spin_lock(&ci->lock);
- if (rcu_dereference_protected(ci->table, true)) {
+ if (cluster_table_is_alloced(ci)) {
swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true);
swap_cluster_free_table(ci);
}
@@ -2903,7 +3063,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
- unsigned long *zeromap;
struct swap_cluster_info *cluster_info;
struct file *swap_file, *victim;
struct address_space *mapping;
@@ -2936,6 +3095,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
+
+ /* Refuse swapoff while the device is pinned for hibernation */
+ if (p->flags & SWP_HIBERNATION) {
+ err = -EBUSY;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+
if (!security_vm_enough_memory_mm(current->mm, p->pages))
vm_unacct_memory(p->pages);
else {
@@ -2991,8 +3158,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_file = p->swap_file;
p->swap_file = NULL;
- zeromap = p->zeromap;
- p->zeromap = NULL;
maxpages = p->max;
cluster_info = p->cluster_info;
p->max = 0;
@@ -3004,10 +3169,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
kfree(p->global_cluster);
p->global_cluster = NULL;
- kvfree(zeromap);
free_swap_cluster_info(cluster_info, maxpages);
- /* Destroy swap account information */
- swap_cgroup_swapoff(p->type);
inode = mapping->host;
@@ -3538,21 +3700,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- error = swap_cgroup_swapon(si->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
- /*
- * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
- * be above MAX_PAGE_ORDER incase of a large swap file.
- */
- si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
- GFP_KERNEL | __GFP_ZERO);
- if (!si->zeromap) {
- error = -ENOMEM;
- goto bad_swap_unlock_inode;
- }
-
if (si->bdev && bdev_stable_writes(si->bdev))
si->flags |= SWP_STABLE_WRITES;
@@ -3652,11 +3799,8 @@ bad_swap:
si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si, swap_file);
- swap_cgroup_swapoff(si->type);
free_swap_cluster_info(si->cluster_info, si->max);
si->cluster_info = NULL;
- kvfree(si->zeromap);
- si->zeromap = NULL;
/*
* Clear the SWP_USED flag after all resources are freed so
* alloc_swap_info can reuse this si safely.
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 80cc8be5725f..246af12bf801 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -2,7 +2,12 @@
/*
* mm/userfaultfd.c
*
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
* Copyright (C) 2015 Red Hat, Inc.
+ *
+ * Some parts derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
*/
#include <linux/mm.h>
@@ -14,6 +19,17 @@
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
+#include <linux/list.h>
+#include <linux/sched/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/miscdevice.h>
+#include <linux/uio.h>
#include <linux/file.h>
#include <linux/cleanup.h>
#include <asm/tlbflush.h>
@@ -1017,7 +1033,7 @@ out:
return copied ? copied : err;
}
-ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+static ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
uffd_flags_t flags)
{
@@ -1025,7 +1041,7 @@ ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+static ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long start,
unsigned long len)
{
@@ -1033,7 +1049,7 @@ ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+static ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, uffd_flags_t flags)
{
@@ -1049,7 +1065,7 @@ ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+static ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, uffd_flags_t flags)
{
return mfill_atomic(ctx, start, 0, len,
@@ -1085,7 +1101,7 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}
-int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+static int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp)
{
struct mm_struct *dst_mm = ctx->mm;
@@ -1915,7 +1931,7 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
* in the regions or not, but preventing the risk of having to split
* the hugepmd during the remap.
*/
-ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+static ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, __u64 mode)
{
struct mm_struct *mm = ctx->mm;
@@ -2090,7 +2106,7 @@ out:
return moved ? moved : err;
}
-bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+static bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
bool wp_async)
{
const struct vm_uffd_ops *ops = vma_uffd_ops(vma);
@@ -2147,12 +2163,12 @@ static void userfaultfd_set_ctx(struct vm_area_struct *vma,
(vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
}
-void userfaultfd_reset_ctx(struct vm_area_struct *vma)
+static void userfaultfd_reset_ctx(struct vm_area_struct *vma)
{
userfaultfd_set_ctx(vma, NULL, 0);
}
-struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+static struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
@@ -2191,7 +2207,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
}
/* Assumes mmap write lock taken, and mm_struct pinned. */
-int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+static int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
vm_flags_t vm_flags,
unsigned long start, unsigned long end,
@@ -2255,7 +2271,7 @@ skip:
return 0;
}
-void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
+static void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma;
@@ -2270,7 +2286,7 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
mmap_write_unlock(mm);
}
-void userfaultfd_release_all(struct mm_struct *mm,
+static void userfaultfd_release_all(struct mm_struct *mm,
struct userfaultfd_ctx *ctx)
{
struct vm_area_struct *vma, *prev;
@@ -2305,3 +2321,2222 @@ void userfaultfd_release_all(struct mm_struct *mm,
mmap_write_unlock(mm);
mmput(mm);
}
+
+/*
+ * Backing integer for the "unprivileged_userfaultfd" sysctl entry declared
+ * in vm_userfaultfd_table below.
+ */
+static int sysctl_unprivileged_userfaultfd __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Single-entry sysctl table exposing sysctl_unprivileged_userfaultfd with
+ * mode 0644. The handler is proc_dointvec_minmax with extra1/extra2 set to
+ * SYSCTL_ZERO/SYSCTL_ONE (presumably restricting the value to 0 or 1 --
+ * confirm against proc_dointvec_minmax).
+ */
+static const struct ctl_table vm_userfaultfd_table[] = {
+ {
+ .procname = "unprivileged_userfaultfd",
+ .data = &sysctl_unprivileged_userfaultfd,
+ .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+#endif
+
+/*
+ * Slab cache used for struct userfaultfd_ctx objects: dup_userfaultfd()
+ * allocates from it and userfaultfd_ctx_put() frees back into it.
+ */
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
+
+/*
+ * Pairs an existing userfaultfd context with the context created for it by
+ * dup_userfaultfd(); entries are chained on the list passed to that function.
+ */
+struct userfaultfd_fork_ctx {
+ /* context found on the vma being duplicated */
+ struct userfaultfd_ctx *orig;
+ /* context newly allocated for @orig by dup_userfaultfd() */
+ struct userfaultfd_ctx *new;
+ /* link in the caller-provided list of fork contexts */
+ struct list_head list;
+};
+
+/*
+ * One pending UFFD_EVENT_UNMAP notification, recorded by
+ * userfaultfd_unmap_prep() and consumed by userfaultfd_unmap_complete().
+ */
+struct userfaultfd_unmap_ctx {
+ /* context to notify; a reference is taken when the entry is created */
+ struct userfaultfd_ctx *ctx;
+ /* range reported in the event message (remove.start / remove.end) */
+ unsigned long start;
+ unsigned long end;
+ /* link in the caller-provided list of pending unmap notifications */
+ struct list_head list;
+};
+
+/*
+ * A waiter queued on one of the context's wait queues, together with the
+ * message that a reader of the userfaultfd receives for it.
+ */
+struct userfaultfd_wait_queue {
+ /* message copied out to the reader by userfaultfd_ctx_read() */
+ struct uffd_msg msg;
+ /* wait queue entry linking this waiter into a wait_queue_head */
+ wait_queue_entry_t wq;
+ /* context this waiter belongs to */
+ struct userfaultfd_ctx *ctx;
+ /* set to true by userfaultfd_wake_function() before waking the task */
+ bool waken;
+};
+
+/*
+ * Address range handed as the wake-up key to userfaultfd_wake_function().
+ * A @len of 0 means "wake every waiter" regardless of address.
+ */
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+/*
+ * Internal indication that the UFFD_API ioctl was successfully executed.
+ * Stored as bit 31 of ctx->features and tested by
+ * userfaultfd_is_initialized().
+ */
+#define UFFD_FEATURE_INITIALIZED (1u << 31)
+
+/* Report whether UFFD_FEATURE_INITIALIZED is set in @ctx->features. */
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+	const bool initialized = ctx->features & UFFD_FEATURE_INITIALIZED;
+
+	return initialized;
+}
+
+/*
+ * True when @ctx is non-NULL and has UFFD_FEATURE_WP_ASYNC set in its
+ * features; a NULL @ctx yields false.
+ */
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+	if (!ctx)
+		return false;
+
+	return ctx->features & UFFD_FEATURE_WP_ASYNC;
+}
+
+/*
+ * Tell whether the uffd context attached to @vma has WP_UNPOPULATED
+ * enabled; a vma without a context reports false. The answer is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;
+
+	return uffd && (uffd->features & UFFD_FEATURE_WP_UNPOPULATED);
+}
+
+/*
+ * Wake callback installed on fault wait-queue entries (see the
+ * init_waitqueue_func_entry() call in handle_userfault()).
+ *
+ * @key points to a struct userfaultfd_wake_range. The waiter is skipped
+ * when range->len is non-zero and the faulting address stored in its
+ * message lies outside [start, start + len). Otherwise the waiter is marked
+ * waken and wake_up_state() is called on the task stored in wq->private;
+ * if that reports a wake-up, the entry is unlinked with list_del_init().
+ *
+ * Returns the value of wake_up_state(), or 0 if the waiter was skipped.
+ */
+static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ WRITE_ONCE(uwq->waken, true);
+ /*
+ * The Program-Order guarantees provided by the scheduler
+ * ensure uwq->waken is visible before the task is woken.
+ */
+ ret = wake_up_state(wq->private, mode);
+ if (ret) {
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the other
+ * CPUs, the waitqueue may disappear from under us, see the
+ * !list_empty_careful() in handle_userfault().
+ *
+ * try_to_wake_up() has an implicit smp_mb(), and the
+ * wq->private is read before calling the extern function
+ * "wake_up_state" (which in turns calls try_to_wake_up).
+ */
+ list_del_init(&wq->entry);
+ }
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Take an additional reference on a userfaultfd
+ * context.
+ * @ctx: [in] userfaultfd context whose refcount is incremented.
+ *
+ * Every reference taken here is dropped with userfaultfd_ctx_put().
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+	refcount_inc(&ctx->refcount);
+}
+
+/**
+ * userfaultfd_ctx_put - Drop a reference on a userfaultfd context.
+ * @ctx: [in] userfaultfd context to release.
+ *
+ * The reference being dropped must have been obtained earlier, either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). When the count
+ * reaches zero the mm reference held by the context is dropped with
+ * mmdrop() and the context is returned to userfaultfd_ctx_cachep.
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+	if (!refcount_dec_and_test(&ctx->refcount))
+		return;
+
+	/* Nobody may still hold a queue lock or sit on a queue at this point. */
+	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
+	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
+	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
+	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
+	VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
+	VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
+	VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
+	VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
+
+	mmdrop(ctx->mm);
+	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+}
+
+/* Zero a uffd_msg in full, padding included, before it is filled in. */
+static inline void msg_init(struct uffd_msg *msg)
+{
+	/* The message layout is expected to be exactly 32 bytes. */
+	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+
+	/*
+	 * memset() rather than member-wise initialisation: padding bytes
+	 * must be cleared as well or kernel data is leaked to userland.
+	 */
+	memset(msg, 0, sizeof(*msg));
+}
+
+/*
+ * Build the UFFD_EVENT_PAGEFAULT message for a fault.
+ *
+ * @address:      fault address reported by default.
+ * @real_address: address reported instead when @features contains
+ *                UFFD_FEATURE_EXACT_ADDRESS.
+ * @flags:        fault flags; only FAULT_FLAG_WRITE is looked at.
+ * @reason:       VM_UFFD_* reason bits; WP and MINOR are looked at.
+ * @features:     feature bits of the userfaultfd context.
+ *
+ * Returns the fully initialised message by value.
+ */
+static inline struct uffd_msg userfault_msg(unsigned long address,
+					    unsigned long real_address,
+					    unsigned int flags,
+					    unsigned long reason,
+					    unsigned int features)
+{
+	struct uffd_msg msg;
+
+	msg_init(&msg);
+	msg.event = UFFD_EVENT_PAGEFAULT;
+
+	if (features & UFFD_FEATURE_EXACT_ADDRESS)
+		msg.arg.pagefault.address = real_address;
+	else
+		msg.arg.pagefault.address = address;
+
+	/*
+	 * Fault kind, as seen by userspace:
+	 * - UFFD_PAGEFAULT_FLAG_WP: write-protect fault.
+	 * - UFFD_PAGEFAULT_FLAG_MINOR: minor fault.
+	 * - neither of the two: MISSING fault.
+	 *
+	 * Independently of the kind, UFFD_PAGEFAULT_FLAG_WRITE marks a
+	 * write access; without it the access was a read.
+	 */
+	if (flags & FAULT_FLAG_WRITE)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+	if (reason & VM_UFFD_WP)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (reason & VM_UFFD_MINOR)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
+
+	if (features & UFFD_FEATURE_THREAD_ID)
+		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
+
+	return msg;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * hugetlb counterpart of userfaultfd_must_wait(): same decision, taken on
+ * the huge PTE covering vmf->address. Returns true while the fault still
+ * needs userspace to resolve it.
+ */
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					      struct vm_fault *vmf,
+					      unsigned long reason)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	pte_t *ptep;
+	pte_t entry;
+
+	assert_fault_locked(vmf);
+
+	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
+	if (!ptep)
+		return true;
+
+	/*
+	 * Lockless access: we're in a wait_event so it's ok if it
+	 * changes under us.
+	 */
+	entry = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
+
+	/*
+	 * Either the entry is still missing or it is a UFFD PTE marker;
+	 * both need userspace to resolve the fault.
+	 */
+	if (huge_pte_none(entry) || pte_is_uffd_marker(entry))
+		return true;
+
+	/*
+	 * Concurrent migration may have replaced the present PTE with a
+	 * non-marker swap entry between fault delivery and this lockless
+	 * re-check. huge_pte_write() on a swap entry decodes random offset
+	 * bits, so gate it on pte_present(). The migration completion path
+	 * will re-deliver the fault if it still needs userspace.
+	 */
+	if (!pte_present(entry))
+		return false;
+
+	/*
+	 * A WP fault on a still write-protected entry waits for userspace;
+	 * anything else has already been resolved.
+	 */
+	return !huge_pte_write(entry) && (reason & VM_UFFD_WP);
+}
+#else
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					      struct vm_fault *vmf,
+					      unsigned long reason)
+{
+	/* Not expected to be reached without CONFIG_HUGETLB_PAGE. */
+	VM_WARN_ON_ONCE(1);
+	return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Verify the pagetables are still not ok after having registered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read_iter and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ *
+ * Walks pgd -> p4d -> pud -> pmd -> pte for vmf->address in ctx->mm.
+ * Returns true when the faulting thread still has to wait for userspace
+ * (a level is not present/none, the pte is none or a uffd marker, or a
+ * VM_UFFD_WP fault hits a non-writable pmd/pte), false otherwise.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ struct vm_fault *vmf,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ unsigned long address = vmf->address;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pte_t ptent;
+ bool ret;
+
+ assert_fault_locked(vmf);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return true;
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return true;
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return true;
+ pmd = pmd_offset(pud, address);
+again:
+ _pmd = pmdp_get_lockless(pmd);
+ if (pmd_none(_pmd))
+ return true;
+
+ /*
+ * A race could arise which would result in a softleaf entry such as
+ * migration entry unexpectedly being present in the PMD, so explicitly
+ * check for this and bail out if so.
+ */
+ if (!pmd_present(_pmd))
+ return false;
+
+ if (pmd_trans_huge(_pmd))
+ return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+
+ pte = pte_offset_map(pmd, address);
+ /* No pte could be mapped: re-read the pmd and redo the checks. */
+ if (!pte)
+ goto again;
+
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ ptent = ptep_get(pte);
+
+ ret = true;
+ /* Entry is still missing, wait for userspace to resolve the fault. */
+ if (pte_none(ptent))
+ goto out;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(ptent))
+ goto out;
+ /*
+ * Concurrent swap-out / migration may have replaced the present PTE
+ * with a non-marker swap entry between fault delivery and this
+ * lockless re-check. pte_write() on a swap entry decodes random
+ * offset bits, so gate it on pte_present(). The page-in path will
+ * re-deliver the fault if it still needs userspace.
+ */
+ if (!pte_present(ptent)) {
+ ret = false;
+ goto out;
+ }
+ /*
+ * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
+ * resolve the fault.
+ */
+ if (!pte_write(ptent) && (reason & VM_UFFD_WP))
+ goto out;
+
+ ret = false;
+out:
+ pte_unmap(pte);
+ return ret;
+}
+
+/*
+ * Map fault flags to the task state used while waiting for userspace:
+ * FAULT_FLAG_INTERRUPTIBLE wins over FAULT_FLAG_KILLABLE, and with neither
+ * set the wait is uninterruptible.
+ */
+static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
+{
+	unsigned int state = TASK_UNINTERRUPTIBLE;
+
+	if (flags & FAULT_FLAG_INTERRUPTIBLE)
+		state = TASK_INTERRUPTIBLE;
+	else if (flags & FAULT_FLAG_KILLABLE)
+		state = TASK_KILLABLE;
+
+	return state;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_lock must be released before
+ * returning it.
+ *
+ * @vmf: the fault being handled; vmf->vma supplies the userfaultfd context.
+ * @reason: expected to be exactly one of the __VM_UFFD_FLAGS bits.
+ *
+ * Returns VM_FAULT_SIGBUS when the fault is not handed to userspace (task
+ * exiting or dumping core, no context on the vma, UFFD_FEATURE_SIGBUS set,
+ * fault without FAULT_FLAG_USER on a UFFD_USER_MODE_ONLY context, or
+ * FAULT_FLAG_ALLOW_RETRY missing), and VM_FAULT_RETRY in every other case.
+ */
+vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+ bool must_wait;
+ unsigned int blocking_state;
+
+ /*
+ * We don't do userfault handling for the final child pid update
+ * and when coredumping (faults triggered by get_dump_page()).
+ */
+ if (current->flags & (PF_EXITING|PF_DUMPCORE))
+ goto out;
+
+ assert_fault_locked(vmf);
+
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ VM_WARN_ON_ONCE(ctx->mm != mm);
+
+ /* Any unrecognized flag is a bug. */
+ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
+ /* 0 or > 1 flags set is a bug; we expect exactly 1. */
+ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
+
+ if (ctx->features & UFFD_FEATURE_SIGBUS)
+ goto out;
+ if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
+ /* take the reference before dropping the mmap_lock */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
+ reason, ctx->features);
+ uwq.ctx = ctx;
+ uwq.waken = false;
+
+ blocking_state = userfaultfd_get_blocking_state(vmf->flags);
+
+ /*
+ * Take the vma lock now, in order to safely call
+ * userfaultfd_huge_must_wait() later. Since acquiring the
+ * (sleepable) vma lock can modify the current task state, that
+ * must be before explicitly calling set_current_state().
+ */
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_lock_read(vma);
+
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock to happen before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(blocking_state);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+
+ if (is_vm_hugetlb_page(vma)) {
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
+ }
+
+ release_fault_lock(vmf);
+
+ if (likely(must_wait && !READ_ONCE(ctx->released))) {
+ wake_up_poll(&ctx->fd_wqh, EPOLLIN);
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let any of the two pointers to point to
+ * self. So list_empty_careful won't risk to see both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.entry)) {
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.entry);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+/*
+ * Queue the event described by @ewq on ctx->event_wqh and sleep in
+ * TASK_KILLABLE until one of the following holds: the event was consumed
+ * (ewq->msg.event reset to 0), the context was released, or a fatal
+ * signal is pending for the current task.
+ *
+ * If the wait is abandoned while a UFFD_EVENT_FORK is still queued, the
+ * context carried in msg.arg.reserved.reserved1 is torn down with
+ * userfaultfd_release_new() and its reference dropped.
+ *
+ * On every path, including the early PF_EXITING bail-out, one
+ * ctx->mmap_changing count and one reference on @ctx are dropped before
+ * returning.
+ */
+static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
+{
+ struct userfaultfd_ctx *release_new_ctx;
+
+ if (WARN_ON_ONCE(current->flags & PF_EXITING))
+ goto out;
+
+ ewq->ctx = ctx;
+ init_waitqueue_entry(&ewq->wq, current);
+ release_new_ctx = NULL;
+
+ spin_lock_irq(&ctx->event_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->event_wqh, &ewq->wq);
+ for (;;) {
+ set_current_state(TASK_KILLABLE);
+ if (ewq->msg.event == 0)
+ break;
+ if (READ_ONCE(ctx->released) ||
+ fatal_signal_pending(current)) {
+ /*
+ * &ewq->wq may be queued in fork_event, but
+ * __remove_wait_queue ignores the head
+ * parameter. It would be a problem if it
+ * didn't.
+ */
+ __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+ if (ewq->msg.event == UFFD_EVENT_FORK) {
+ struct userfaultfd_ctx *new;
+
+ new = (struct userfaultfd_ctx *)
+ (unsigned long)
+ ewq->msg.arg.reserved.reserved1;
+ release_new_ctx = new;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&ctx->event_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, EPOLLIN);
+ schedule();
+
+ spin_lock_irq(&ctx->event_wqh.lock);
+ }
+ __set_current_state(TASK_RUNNING);
+ spin_unlock_irq(&ctx->event_wqh.lock);
+
+ if (release_new_ctx) {
+ userfaultfd_release_new(release_new_ctx);
+ userfaultfd_ctx_put(release_new_ctx);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+out:
+ atomic_dec(&ctx->mmap_changing);
+ VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
+ userfaultfd_ctx_put(ctx);
+}
+
+/*
+ * Mark the event in @ewq as consumed by zeroing msg.event, wake the
+ * waiters on ctx->event_wqh and unlink @ewq from that queue.
+ *
+ * Uses the _locked wake-up variant: the callers in userfaultfd_ctx_read()
+ * invoke this with ctx->event_wqh.lock held.
+ */
+static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *ewq)
+{
+ ewq->msg.event = 0;
+ wake_up_locked(&ctx->event_wqh);
+ __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+}
+
+/*
+ * Set up the userfaultfd context of @vma when it is being duplicated
+ * (presumably on fork, going by UFFD_FEATURE_EVENT_FORK -- confirm against
+ * the caller).
+ *
+ * - No context on @vma: nothing to do.
+ * - Context without UFFD_FEATURE_EVENT_FORK: the context is reset on @vma.
+ * - Otherwise: the context already created for the same original context
+ *   is looked up in @fcs; if there is none, a new one is allocated,
+ *   initialised from the original and recorded on @fcs. The original gets
+ *   an extra reference and its mmap_changing count is raised. @vma is then
+ *   pointed at the new context.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ */
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+ struct userfaultfd_ctx *ctx = NULL, *octx;
+ struct userfaultfd_fork_ctx *fctx;
+
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx)
+ return 0;
+
+ if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ userfaultfd_reset_ctx(vma);
+ return 0;
+ }
+
+ /* Reuse the context already created for this original, if any. */
+ list_for_each_entry(fctx, fcs, list)
+ if (fctx->orig == octx) {
+ ctx = fctx->new;
+ break;
+ }
+
+ if (!ctx) {
+ fctx = kmalloc_obj(*fctx);
+ if (!fctx)
+ return -ENOMEM;
+
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx) {
+ kfree(fctx);
+ return -ENOMEM;
+ }
+
+ refcount_set(&ctx->refcount, 1);
+ ctx->flags = octx->flags;
+ ctx->features = octx->features;
+ ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
+ atomic_set(&ctx->mmap_changing, 0);
+ ctx->mm = vma->vm_mm;
+ mmgrab(ctx->mm);
+
+ userfaultfd_ctx_get(octx);
+ down_write(&octx->map_changing_lock);
+ atomic_inc(&octx->mmap_changing);
+ up_write(&octx->map_changing_lock);
+ fctx->orig = octx;
+ fctx->new = ctx;
+ list_add_tail(&fctx->list, fcs);
+ }
+
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+ return 0;
+}
+
+/*
+ * Deliver a UFFD_EVENT_FORK for @fctx to the original context and wait
+ * for it to complete. The new context pointer travels in the message's
+ * reserved1 field.
+ */
+static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+	struct userfaultfd_wait_queue fork_ewq;
+
+	msg_init(&fork_ewq.msg);
+	fork_ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+	fork_ewq.msg.event = UFFD_EVENT_FORK;
+
+	userfaultfd_event_wait_completion(fctx->orig, &fork_ewq);
+}
+
+/*
+ * Send the fork event for every entry recorded on @fcs, then unlink and
+ * free the entry.
+ */
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+	struct userfaultfd_fork_ctx *cur, *next;
+
+	list_for_each_entry_safe(cur, next, fcs, list) {
+		dup_fctx(cur);
+		list_del(&cur->list);
+		kfree(cur);
+	}
+}
+
+/*
+ * Undo dup_userfaultfd() for every entry on @fcs after a failed fork.
+ *
+ * By this point memory has been allocated for each fctx and references
+ * have been raised on both the original and the child context (and, as a
+ * consequence, on the mm of each). Normally the user handling the fork
+ * event would take care of those; since no event is going to be delivered,
+ * they are cleaned up by hand here.
+ *
+ * VMA contexts are left alone: mm tear down cleans those up.
+ */
+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+	struct userfaultfd_fork_ctx *cur, *next;
+
+	list_for_each_entry_safe(cur, next, fcs, list) {
+		struct userfaultfd_ctx *parent = cur->orig;
+		struct userfaultfd_ctx *child = cur->new;
+
+		atomic_dec(&parent->mmap_changing);
+		VM_WARN_ON_ONCE(atomic_read(&parent->mmap_changing) < 0);
+		userfaultfd_ctx_put(parent);
+		userfaultfd_ctx_put(child);
+
+		list_del(&cur->list);
+		kfree(cur);
+	}
+}
+
+/*
+ * Prepare a remap notification for @vma.
+ *
+ * With UFFD_FEATURE_EVENT_REMAP enabled the context is stored in @vm_ctx,
+ * gains a reference and has its mmap_changing count raised. Without the
+ * feature the context is dropped from @vma. A vma without a context is
+ * left untouched.
+ */
+void mremap_userfaultfd_prep(struct vm_area_struct *vma,
+			     struct vm_userfaultfd_ctx *vm_ctx)
+{
+	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	if (!ctx)
+		return;
+
+	if (!(ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+		/* Drop uffd context if remap feature not enabled */
+		userfaultfd_reset_ctx(vma);
+		return;
+	}
+
+	vm_ctx->ctx = ctx;
+	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
+	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
+}
+
+/*
+ * Deliver UFFD_EVENT_REMAP (@from, @to, @len) to the context stored in
+ * @vm_ctx and wait for it to complete. Does nothing when @vm_ctx holds no
+ * context.
+ */
+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
+				 unsigned long from, unsigned long to,
+				 unsigned long len)
+{
+	struct userfaultfd_ctx *uffd = vm_ctx->ctx;
+	struct userfaultfd_wait_queue remap_ewq;
+
+	if (!uffd)
+		return;
+
+	msg_init(&remap_ewq.msg);
+	remap_ewq.msg.event = UFFD_EVENT_REMAP;
+	remap_ewq.msg.arg.remap.from = from;
+	remap_ewq.msg.arg.remap.to = to;
+	remap_ewq.msg.arg.remap.len = len;
+
+	userfaultfd_event_wait_completion(uffd, &remap_ewq);
+}
+
+/*
+ * Back out of a prepared remap notification: lower mmap_changing again
+ * and drop the reference on the context stored in @vm_ctx. Does nothing
+ * when @vm_ctx holds no context.
+ */
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+	struct userfaultfd_ctx *uffd = vm_ctx->ctx;
+
+	if (!uffd)
+		return;
+
+	atomic_dec(&uffd->mmap_changing);
+	VM_WARN_ON_ONCE(atomic_read(&uffd->mmap_changing) < 0);
+	userfaultfd_ctx_put(uffd);
+}
+
+/*
+ * Notify userspace that [@start, @end) is being removed from @vma.
+ *
+ * Returns true, without touching any lock, when @vma has no context or the
+ * context does not have UFFD_FEATURE_EVENT_REMOVE.
+ *
+ * Otherwise a reference is taken on the context, its mmap_changing count
+ * is raised, the mmap read lock of vma->vm_mm is released, and a
+ * UFFD_EVENT_REMOVE is queued and waited for. In that case false is
+ * returned, i.e. the mmap read lock is no longer held on return.
+ */
+bool userfaultfd_remove(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue ewq;
+
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
+ return true;
+
+ userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
+ atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
+ mmap_read_unlock(mm);
+
+ msg_init(&ewq.msg);
+
+ ewq.msg.event = UFFD_EVENT_REMOVE;
+ ewq.msg.arg.remove.start = start;
+ ewq.msg.arg.remove.end = end;
+
+ userfaultfd_event_wait_completion(ctx, &ewq);
+
+ return false;
+}
+
+/*
+ * Check whether @unmaps already holds an entry for @ctx covering exactly
+ * [@start, @end).
+ */
+static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
+			  unsigned long start, unsigned long end)
+{
+	struct userfaultfd_unmap_ctx *entry;
+
+	list_for_each_entry(entry, unmaps, list) {
+		if (entry->ctx != ctx)
+			continue;
+		if (entry->start == start && entry->end == end)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Record a pending UFFD_EVENT_UNMAP for [@start, @end) of @vma on @unmaps.
+ *
+ * Nothing is recorded when @vma has no context, the context lacks
+ * UFFD_FEATURE_EVENT_UNMAP, or an identical entry is already on the list.
+ * A recorded entry holds a reference on the context and raises its
+ * mmap_changing count.
+ *
+ * Returns 0 on success (including the nothing-to-do cases) or -ENOMEM.
+ */
+int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
+			   unsigned long end, struct list_head *unmaps)
+{
+	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+	struct userfaultfd_unmap_ctx *entry;
+
+	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP))
+		return 0;
+	if (has_unmap_ctx(ctx, unmaps, start, end))
+		return 0;
+
+	entry = kzalloc_obj(*entry);
+	if (!entry)
+		return -ENOMEM;
+
+	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
+	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
+
+	entry->ctx = ctx;
+	entry->start = start;
+	entry->end = end;
+	list_add_tail(&entry->list, unmaps);
+
+	return 0;
+}
+
+/*
+ * Deliver a UFFD_EVENT_UNMAP for every entry recorded on @uf, waiting for
+ * each one to complete, then unlink and free the entry.
+ */
+void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
+{
+	struct userfaultfd_unmap_ctx *uctx, *next;
+	struct userfaultfd_wait_queue ewq;
+
+	list_for_each_entry_safe(uctx, next, uf, list) {
+		msg_init(&ewq.msg);
+		ewq.msg.event = UFFD_EVENT_UNMAP;
+		ewq.msg.arg.remove.start = uctx->start;
+		ewq.msg.arg.remove.end = uctx->end;
+
+		userfaultfd_event_wait_completion(uctx->ctx, &ewq);
+
+		list_del(&uctx->list);
+		kfree(uctx);
+	}
+}
+
+/*
+ * Release handler for a userfaultfd file.
+ *
+ * Marks the context released, calls userfaultfd_release_all() for its mm,
+ * wakes every waiter on the fault queues and on event_wqh, signals
+ * EPOLLHUP on fd_wqh and drops the reference held through
+ * file->private_data. Always returns 0.
+ */
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+
+ WRITE_ONCE(ctx->released, true);
+
+ userfaultfd_release_all(mm, ctx);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+
+ /* Flush pending events that may still wait on event_wqh */
+ wake_up_all(&ctx->event_wqh);
+
+ wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/*
+ * Return the waiter at the tail of @wqh, or NULL when the queue has no
+ * waiters. @wqh->lock must be held by the caller.
+ */
+static inline struct userfaultfd_wait_queue *find_userfault_in(
+		wait_queue_head_t *wqh)
+{
+	wait_queue_entry_t *last;
+
+	lockdep_assert_held(&wqh->lock);
+
+	if (!waitqueue_active(wqh))
+		return NULL;
+
+	/* walk in reverse to provide FIFO behavior to read userfaults */
+	last = list_last_entry(&wqh->head, typeof(*last), entry);
+	return container_of(last, struct userfaultfd_wait_queue, wq);
+}
+
+/* Next waiter to report from the pending-fault queue of @ctx, or NULL. */
+static inline struct userfaultfd_wait_queue *find_userfault(
+		struct userfaultfd_ctx *ctx)
+{
+	wait_queue_head_t *pending = &ctx->fault_pending_wqh;
+
+	return find_userfault_in(pending);
+}
+
+/* Next waiter to report from the event queue of @ctx, or NULL. */
+static inline struct userfaultfd_wait_queue *find_userfault_evt(
+		struct userfaultfd_ctx *ctx)
+{
+	wait_queue_head_t *events = &ctx->event_wqh;
+
+	return find_userfault_in(events);
+}
+
+/*
+ * Poll handler for a userfaultfd file.
+ *
+ * Returns EPOLLERR when the context is not initialized or the file was not
+ * opened with O_NONBLOCK, EPOLLIN when either fault_pending_wqh or
+ * event_wqh has waiters, and 0 otherwise.
+ */
+static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ __poll_t ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ if (!userfaultfd_is_initialized(ctx))
+ return EPOLLERR;
+
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be woken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return EPOLLERR;
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = EPOLLIN;
+ else if (waitqueue_active(&ctx->event_wqh))
+ ret = EPOLLIN;
+
+ return ret;
+}
+
+/*
+ * Forward declaration: resolve_userfault_fork() needs the address of the
+ * file operations before their definition.
+ */
+static const struct file_operations userfaultfd_fops;
+
+/*
+ * Create the file descriptor for the context @new carried by a fork event
+ * and patch it into @msg.
+ *
+ * The descriptor is opened O_RDONLY plus whichever UFFD_SHARED_FCNTL_FLAGS
+ * bits are set in new->flags. On success the reserved1 field that carried
+ * the context pointer is cleared and fork.ufd receives the descriptor.
+ *
+ * Returns 0 on success or the negative error from
+ * anon_inode_create_getfd().
+ */
+static int resolve_userfault_fork(struct userfaultfd_ctx *new,
+				  struct inode *inode,
+				  struct uffd_msg *msg)
+{
+	const int open_flags = O_RDONLY |
+			       (new->flags & UFFD_SHARED_FCNTL_FLAGS);
+	int fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops,
+					 new, open_flags, inode);
+
+	if (fd < 0)
+		return fd;
+
+	msg->arg.reserved.reserved1 = 0;
+	msg->arg.fork.ufd = fd;
+	return 0;
+}
+
+/*
+ * Fetch one message for a reader of the userfaultfd.
+ *
+ * A pending page fault (fault_pending_wqh) is served first and refiled to
+ * fault_wqh; otherwise the next event found on event_wqh is returned. With
+ * nothing queued the caller sleeps on fd_wqh unless @no_wait is set. For a
+ * UFFD_EVENT_FORK message a file descriptor for the new context is created
+ * via resolve_userfault_fork() once the fd_wqh lock has been dropped.
+ *
+ * Returns 0 with *@msg filled in, -ERESTARTSYS if a signal is pending,
+ * -EAGAIN if @no_wait is set and nothing is queued, or the error returned
+ * by resolve_userfault_fork().
+ */
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg, struct inode *inode)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+ /*
+ * Handling fork event requires sleeping operations, so
+ * we drop the event_wqh lock, then do these ops, then
+ * lock it back and wake up the waiter. While the lock is
+ * dropped the ewq may go away so we keep track of it
+ * carefully.
+ */
+ LIST_HEAD(fork_event);
+ struct userfaultfd_ctx *fork_nctx = NULL;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock_irq(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups because during the refile both
+ * waitqueue could become empty if this is the
+ * only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * to disappear from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.head list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.entry);
+ add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ write_seqcount_end(&ctx->refile_seq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ spin_lock(&ctx->event_wqh.lock);
+ uwq = find_userfault_evt(ctx);
+ if (uwq) {
+ *msg = uwq->msg;
+
+ if (uwq->msg.event == UFFD_EVENT_FORK) {
+ fork_nctx = (struct userfaultfd_ctx *)
+ (unsigned long)
+ uwq->msg.arg.reserved.reserved1;
+ list_move(&uwq->wq.entry, &fork_event);
+ /*
+ * fork_nctx can be freed as soon as
+ * we drop the lock, unless we take a
+ * reference on it.
+ */
+ userfaultfd_ctx_get(fork_nctx);
+ spin_unlock(&ctx->event_wqh.lock);
+ ret = 0;
+ break;
+ }
+
+ userfaultfd_event_complete(ctx, uwq);
+ spin_unlock(&ctx->event_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->event_wqh.lock);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock_irq(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock_irq(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock_irq(&ctx->fd_wqh.lock);
+
+ if (!ret && msg->event == UFFD_EVENT_FORK) {
+ ret = resolve_userfault_fork(fork_nctx, inode, msg);
+ spin_lock_irq(&ctx->event_wqh.lock);
+ if (!list_empty(&fork_event)) {
+ /*
+ * The fork thread didn't abort, so we can
+ * drop the temporary refcount.
+ */
+ userfaultfd_ctx_put(fork_nctx);
+
+ uwq = list_first_entry(&fork_event,
+ typeof(*uwq),
+ wq.entry);
+ /*
+ * If fork_event list wasn't empty and in turn
+ * the event wasn't already released by fork
+ * (the event is allocated on fork kernel
+ * stack), put the event back to its place in
+ * the event_wq. fork_event head will be freed
+ * as soon as we return so the event cannot
+ * stay queued there no matter the current
+ * "ret" value.
+ */
+ list_del(&uwq->wq.entry);
+ __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+
+ /*
+ * Leave the event in the waitqueue and report
+ * error to userland if we failed to resolve
+ * the userfault fork.
+ */
+ if (likely(!ret))
+ userfaultfd_event_complete(ctx, uwq);
+ } else {
+ /*
+ * Here the fork thread aborted and the
+ * refcount from the fork thread on fork_nctx
+ * has already been released. We still hold
+ * the reference we took before releasing the
+ * lock above. If resolve_userfault_fork
+ * failed we've to drop it because the
+ * fork_nctx has to be freed in such case. If
+ * it succeeded we'll hold it because the new
+ * uffd references it.
+ */
+ if (ret)
+ userfaultfd_ctx_put(fork_nctx);
+ }
+ spin_unlock_irq(&ctx->event_wqh.lock);
+ }
+
+ return ret;
+}
+
+/*
+ * read_iter handler: copy as many whole uffd_msg records into @to as are
+ * available and fit.
+ *
+ * Only the first record may block, and only when neither O_NONBLOCK nor
+ * IOCB_NOWAIT is in effect. Returns the number of bytes copied when at
+ * least one record was delivered. Otherwise returns -EINVAL (context not
+ * initialized, or no room for a single record), the error from
+ * userfaultfd_ctx_read(), or -EFAULT if the copy to @to fails.
+ */
+static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct userfaultfd_ctx *ctx = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct uffd_msg msg;
+	ssize_t copied = 0;
+	ssize_t err;
+	bool no_wait;
+
+	if (!userfaultfd_is_initialized(ctx))
+		return -EINVAL;
+
+	no_wait = (file->f_flags & O_NONBLOCK) ||
+		  (iocb->ki_flags & IOCB_NOWAIT);
+
+	while (iov_iter_count(to) >= sizeof(msg)) {
+		err = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
+		if (err < 0)
+			return copied ? copied : err;
+
+		if (!copy_to_iter_full(&msg, sizeof(msg), to))
+			return copied ? copied : -EFAULT;
+
+		copied += sizeof(msg);
+		/*
+		 * More than one fault may be read in a single call, but
+		 * only the wait for the very first one is allowed to block.
+		 */
+		no_wait = true;
+	}
+
+	/* The remaining space cannot hold another record. */
+	return copied ? copied : -EINVAL;
+}
+/*
+ * Wake the waiters on fault_pending_wqh and on fault_wqh, passing @range
+ * as the wake key. Both queues are handled while holding
+ * fault_pending_wqh.lock; a queue with no waiters is left alone.
+ */
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ /* wake all in the range and autoremove */
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+}
+/*
+ * Wake the userfaults covered by @range, but only call
+ * __wake_userfault() (which takes the waitqueue lock) when at least one
+ * of the two fault waitqueues was observed to have waiters.
+ */
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned seq;
+ bool need_wakeup;
+
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or mmap_read_unlock(mm) still
+ * have release semantics that can allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very frequent to
+ * change the address space atomically even if there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we've userfaults to wake.
+ *
+ * Both queues are sampled inside a refile_seq read section and
+ * the sampling is repeated if the sequence count changed
+ * meanwhile.
+ */
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
+ __wake_userfault(ctx, range);
+}
+
+/*
+ * Check that [start, start + len) is a usable range inside @mm.
+ *
+ * @len must be a non-zero multiple of the page size and the range must
+ * lie below mm->task_size without wrapping around. @start itself is not
+ * required to be page aligned here.
+ *
+ * Returns 0 if the range is acceptable, -EINVAL otherwise.
+ */
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
+{
+ const __u64 limit = mm->task_size;
+ bool bad;
+
+ /* Length: non-zero and a whole number of pages. */
+ bad = !len || (len & ~PAGE_MASK);
+ /* Position: must start, and end, within the task size. */
+ bad = bad || start >= limit || len > limit - start;
+ /* Finally reject a range whose end wraps around. */
+ bad = bad || start + len <= start;
+
+ return bad ? -EINVAL : 0;
+}
+
+/*
+ * Same checks as validate_unaligned_range(), but @start must be page
+ * aligned as well. Returns 0 or -EINVAL.
+ */
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ const bool aligned = !(start & ~PAGE_MASK);
+
+ return aligned ? validate_unaligned_range(mm, start, len) : -EINVAL;
+}
+
+/*
+ * UFFDIO_REGISTER: register the range described by the user's
+ * struct uffdio_register with @ctx in the requested mode(s).
+ *
+ * All vmas intersecting the range are checked under mmap_write_lock
+ * before anything is changed: each must be acceptable to
+ * vma_can_userfault(), have VM_MAYWRITE, and not already belong to a
+ * different userfaultfd context. If the first or last vma is a hugetlb
+ * vma, the corresponding range boundary must be huge page aligned.
+ *
+ * On success the set of ioctls usable on the range is written to the
+ * user's ->ioctls field.
+ *
+ * Returns 0 on success, -EFAULT if the argument cannot be read or
+ * written, -EINVAL for a bad mode, a bad range, a range without any vma
+ * or an incompatible vma, -ENOMEM if mmget_not_zero() fails, -EPERM for
+ * a vma without VM_MAYWRITE, -EBUSY if a vma is owned by another
+ * userfaultfd, or the result of userfaultfd_register_range().
+ */
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ vm_flags_t vm_flags;
+ bool found;
+ bool basic_ioctls;
+ unsigned long start, end;
+ struct vma_iterator vmi;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ /* The last __u64 of the struct is not read from userland. */
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ if (!pgtable_supports_uffd_wp())
+ goto out;
+
+ vm_flags |= VM_UFFD_WP;
+ }
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ goto out;
+#endif
+ vm_flags |= VM_UFFD_MINOR;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
+ ret = -EINVAL;
+ mmap_write_lock(mm);
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_find(&vmi, end);
+ if (!vma)
+ goto out_unlock;
+
+ /*
+ * If the first vma contains huge pages, make sure start address
+ * is aligned to huge page size.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+ if (start & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
+ /*
+ * Search for not compatible vmas.
+ */
+ found = false;
+ basic_ioctls = false;
+ cur = vma;
+ do {
+ cond_resched();
+
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /* check not compatible vmas */
+ ret = -EINVAL;
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
+ goto out_unlock;
+
+ /*
+ * UFFDIO_COPY will fill file holes even without
+ * PROT_WRITE. This check enforces that if this is a
+ * MAP_SHARED, the process has write permission to the backing
+ * file. If VM_MAYWRITE is set it also enforces that on a
+ * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
+ * F_WRITE_SEAL can be taken until the vma is destroyed.
+ *
+ * Because this is enforced for every vma regardless of the
+ * requested mode, no additional VM_MAYWRITE test is needed
+ * for UFFDIO_REGISTER_MODE_WP.
+ */
+ ret = -EPERM;
+ if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
+ goto out_unlock;
+
+ /*
+ * If this vma contains ending address, and huge pages
+ * check alignment.
+ */
+ if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
+ end > cur->vm_start) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
+
+ ret = -EINVAL;
+
+ if (end & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ /*
+ * Note vmas containing huge pages
+ */
+ if (is_vm_hugetlb_page(cur))
+ basic_ioctls = true;
+
+ found = true;
+ } for_each_vma_range(vmi, cur, end);
+ VM_WARN_ON_ONCE(!found);
+
+ ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
+ wp_async);
+
+out_unlock:
+ mmap_write_unlock(mm);
+ mmput(mm);
+ if (!ret) {
+ __u64 ioctls_out;
+
+ ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
+ UFFD_API_RANGE_IOCTLS;
+
+ /*
+ * Declare the WP ioctl only if the WP mode is
+ * specified and all checks passed with the range
+ */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
+
+ /* CONTINUE ioctl is only supported for MINOR ranges. */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctls methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(ioctls_out, &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+/*
+ * UFFDIO_UNREGISTER: drop the userfaultfd registration of @ctx from the
+ * range described by the user's struct uffdio_range.
+ *
+ * A first pass over the vmas in the range validates them without
+ * changing anything. A second pass wakes pending userfaults on vmas for
+ * which userfaultfd_missing() is true and clears the registration
+ * through userfaultfd_clear_vma(). vmas in the range that are not
+ * registered with a userfaultfd are skipped.
+ *
+ * Returns 0 on success, -EFAULT if the argument cannot be read,
+ * -EINVAL if the range is invalid, contains no vma, or contains a vma
+ * that is registered with another context or that fails
+ * vma_can_userfault(), -ENOMEM if mmget_not_zero() fails, or the error
+ * returned by userfaultfd_clear_vma().
+ */
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+ struct vma_iterator vmi;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ ret = -ENOMEM;
+ if (!mmget_not_zero(mm))
+ goto out;
+
+ mmap_write_lock(mm);
+ ret = -EINVAL;
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_find(&vmi, end);
+ if (!vma)
+ goto out_unlock;
+
+ /*
+ * If the first vma contains huge pages, make sure start address
+ * is aligned to huge page size.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+ if (start & (vma_hpagesize - 1))
+ goto out_unlock;
+ }
+
+ /*
+ * Search for not compatible vmas.
+ */
+ found = false;
+ cur = vma;
+ do {
+ cond_resched();
+
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /*
+ * Prevent unregistering through a different userfaultfd than
+ * the one used for registration.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ /*
+ * Check not compatible vmas, not strictly required
+ * here as not compatible vmas cannot have an
+ * userfaultfd_ctx registered on them, but this
+ * provides for more strict behavior to notice
+ * unregistration errors.
+ */
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
+ goto out_unlock;
+
+ found = true;
+ } for_each_vma_range(vmi, cur, end);
+ VM_WARN_ON_ONCE(!found);
+
+ vma_iter_set(&vmi, start);
+ prev = vma_prev(&vmi);
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ for_each_vma_range(vmi, vma, end) {
+ cond_resched();
+
+ /* VMA not registered with userfaultfd. */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ if (userfaultfd_missing(vma)) {
+ /*
+ * Wake any concurrent pending userfault while
+ * we unregister, so they will not hang
+ * permanently and it avoids userland to call
+ * UFFDIO_WAKE explicitly.
+ */
+ struct userfaultfd_wake_range range;
+ range.start = start;
+ range.len = vma_end - start;
+ wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+ }
+
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ start, vma_end);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
+ }
+
+skip:
+ prev = vma;
+ start = vma->vm_end;
+ }
+
+out_unlock:
+ mmap_write_unlock(mm);
+ mmput(mm);
+out:
+ return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ *
+ * Reads a struct uffdio_range from userland, validates it with
+ * validate_range() and wakes the userfaults in that range.
+ *
+ * Returns 0 on success, -EFAULT if the argument cannot be read, or the
+ * validate_range() error.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_wake;
+ struct userfaultfd_wake_range range;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+ range.start = uffdio_wake.start;
+ range.len = uffdio_wake.len;
+
+ /*
+ * len == 0 means wake all and we don't want to wake all here,
+ * so check it again to be sure.
+ */
+ VM_WARN_ON_ONCE(!range.len);
+
+ wake_userfault(ctx, &range);
+ ret = 0;
+
+out:
+ return ret;
+}
+/*
+ * UFFDIO_COPY: fill the destination range through mfill_atomic_copy()
+ * and, unless UFFDIO_COPY_MODE_DONTWAKE is set, wake the userfaults
+ * waiting on the part that was filled. The source range only has to
+ * pass validate_unaligned_range(); the destination must pass
+ * validate_range().
+ *
+ * The result of mfill_atomic_copy() is stored in the user's ->copy
+ * field; -EAGAIN is stored there as well when ctx->mmap_changing is
+ * set.
+ *
+ * Returns 0 if the whole length was filled, -EAGAIN if only part of it
+ * was or if ctx->mmap_changing is set, -ESRCH if mmget_not_zero()
+ * fails, -EFAULT if userland memory cannot be accessed, -EINVAL for an
+ * invalid range or unknown mode bits, or the negative result of
+ * mfill_atomic_copy().
+ */
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_copy __user *user_uffdio_copy;
+ struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
+
+ user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+ /* don't copy "copy" last field */
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
+ if (ret)
+ goto out;
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
+ goto out;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, flags);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ VM_WARN_ON_ONCE(!ret);
+ /* len == 0 would wake all */
+ range.len = ret;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+ range.start = uffdio_copy.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+/*
+ * UFFDIO_ZEROPAGE: fill the range through mfill_atomic_zeropage() and,
+ * unless UFFDIO_ZEROPAGE_MODE_DONTWAKE is set, wake the userfaults
+ * waiting on the part that was filled.
+ *
+ * The result of mfill_atomic_zeropage() is stored in the user's
+ * ->zeropage field; -EAGAIN is stored there as well when
+ * ctx->mmap_changing is set.
+ *
+ * Returns 0 if the whole length was filled, -EAGAIN if only part of it
+ * was or if ctx->mmap_changing is set, -ESRCH if mmget_not_zero()
+ * fails, -EFAULT if userland memory cannot be accessed, -EINVAL for an
+ * invalid range or unknown mode bits, or the negative result of
+ * mfill_atomic_zeropage().
+ */
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_zeropage uffdio_zeropage;
+ struct uffdio_zeropage __user *user_uffdio_zeropage;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+ /* don't copy "zeropage" last field */
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ /* len == 0 would wake all */
+ VM_WARN_ON_ONCE(!ret);
+ range.len = ret;
+ if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+ range.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+/*
+ * UFFDIO_WRITEPROTECT: call mwriteprotect_range() on the range, telling
+ * it whether UFFDIO_WRITEPROTECT_MODE_WP was requested.
+ *
+ * When MODE_WP is not set and UFFDIO_WRITEPROTECT_MODE_DONTWAKE is not
+ * set either, the userfaults waiting on the range are woken afterwards.
+ * Requesting MODE_WP together with MODE_DONTWAKE is rejected.
+ *
+ * Returns 0 on success, -EAGAIN if ctx->mmap_changing is set, -EFAULT
+ * if the argument cannot be read, -EINVAL for an invalid range, unknown
+ * mode bits or the rejected mode combination, -ESRCH if
+ * mmget_not_zero() fails, or the error from mwriteprotect_range().
+ */
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_writeprotect uffdio_wp;
+ struct uffdio_writeprotect __user *user_uffdio_wp;
+ struct userfaultfd_wake_range range;
+ bool mode_wp, mode_dontwake;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+ if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+ sizeof(struct uffdio_writeprotect)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+ UFFDIO_WRITEPROTECT_MODE_WP))
+ return -EINVAL;
+
+ mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+ mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+ if (mode_wp && mode_dontwake)
+ return -EINVAL;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (ret)
+ return ret;
+
+ if (!mode_wp && !mode_dontwake) {
+ range.start = uffdio_wp.range.start;
+ range.len = uffdio_wp.range.len;
+ wake_userfault(ctx, &range);
+ }
+ return ret;
+}
+/*
+ * UFFDIO_CONTINUE: process the range through mfill_atomic_continue()
+ * and, unless UFFDIO_CONTINUE_MODE_DONTWAKE is set, wake the userfaults
+ * waiting on the part that was handled.
+ *
+ * The result of mfill_atomic_continue() is stored in the user's
+ * ->mapped field; -EAGAIN is stored there as well when
+ * ctx->mmap_changing is set.
+ *
+ * Returns 0 if the whole length was handled, -EAGAIN if only part of it
+ * was or if ctx->mmap_changing is set, -ESRCH if mmget_not_zero()
+ * fails, -EFAULT if userland memory cannot be accessed, -EINVAL for an
+ * invalid range or unknown mode bits, or the negative result of
+ * mfill_atomic_continue().
+ */
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_continue uffdio_continue;
+ struct uffdio_continue __user *user_uffdio_continue;
+ struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
+
+ user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+ return -EFAULT;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+ /* don't copy the output fields */
+ sizeof(uffdio_continue) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
+ goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+ uffdio_continue.range.len, flags);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ VM_WARN_ON_ONCE(!ret);
+ range.len = ret;
+ if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+ range.start = uffdio_continue.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+/*
+ * UFFDIO_POISON: process the range through mfill_atomic_poison() and,
+ * unless UFFDIO_POISON_MODE_DONTWAKE is set, wake the userfaults
+ * waiting on the part that was handled.
+ *
+ * The result of mfill_atomic_poison() is stored in the user's
+ * ->updated field; -EAGAIN is stored there as well when
+ * ctx->mmap_changing is set.
+ *
+ * Returns 0 if the whole length was handled, -EAGAIN if only part of it
+ * was or if ctx->mmap_changing is set, -ESRCH if mmget_not_zero()
+ * fails, -EFAULT if userland memory cannot be accessed, -EINVAL for an
+ * invalid range or unknown mode bits, or the negative result of
+ * mfill_atomic_poison().
+ */
+static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_poison uffdio_poison;
+ struct uffdio_poison __user *user_uffdio_poison;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_poison = (struct uffdio_poison __user *)arg;
+
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+ /* don't copy the output fields */
+ sizeof(uffdio_poison) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+ uffdio_poison.range.len, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ VM_WARN_ON_ONCE(!ret);
+ range.len = ret;
+ if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
+ range.start = uffdio_poison.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
+/*
+ * Report userfaultfd_wp_async_ctx() for the userfaultfd context that
+ * @vma refers to.
+ */
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ return userfaultfd_wp_async_ctx(ctx);
+}
+
+/*
+ * Translate the feature bits requested by userland into the value kept
+ * in the context.
+ */
+static inline unsigned int uffd_ctx_features(__u64 user_features)
+{
+ /*
+ * For the current set of features the bits just coincide, so
+ * the user value is taken over as is.
+ */
+ unsigned int ctx_features = (unsigned int)user_features;
+
+ /* Set UFFD_FEATURE_INITIALIZED to mark the features as enabled. */
+ ctx_features |= UFFD_FEATURE_INITIALIZED;
+
+ return ctx_features;
+}
+
+/*
+ * UFFDIO_MOVE: move pages from the source to the destination range
+ * through move_pages() and, unless UFFDIO_MOVE_MODE_DONTWAKE is set,
+ * wake the userfaults waiting on the part of the destination that was
+ * handled. Only allowed when ctx->mm is the caller's own mm.
+ *
+ * The result of move_pages() is stored in the user's ->move field;
+ * -EAGAIN is stored there as well when ctx->mmap_changing is set.
+ *
+ * Returns 0 if the whole length was moved, -EAGAIN if only part of it
+ * was or if ctx->mmap_changing is set, -ESRCH if mmget_not_zero()
+ * fails, -EFAULT if userland memory cannot be accessed, -EINVAL for a
+ * cross-mm request, an invalid range or unknown mode bits, or the
+ * negative result of move_pages().
+ */
+static int userfaultfd_move(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_move uffdio_move;
+ struct uffdio_move __user *user_uffdio_move;
+ struct userfaultfd_wake_range range;
+ struct mm_struct *mm = ctx->mm;
+
+ user_uffdio_move = (struct uffdio_move __user *) arg;
+
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ goto out;
+ }
+
+ if (copy_from_user(&uffdio_move, user_uffdio_move,
+ /* don't copy "move" last field */
+ sizeof(uffdio_move)-sizeof(__s64)))
+ return -EFAULT;
+
+ /* Do not allow cross-mm moves. */
+ if (mm != current->mm)
+ return -EINVAL;
+
+ ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
+ UFFDIO_MOVE_MODE_DONTWAKE))
+ return -EINVAL;
+
+ if (mmget_not_zero(mm)) {
+ ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
+ mmput(mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * len == 0 would wake all. Warn only once, like the other
+ * UFFDIO_* handlers in this file do.
+ */
+ VM_WARN_ON_ONCE(!ret);
+ range.len = ret;
+ if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
+ range.start = uffdio_move.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for such API
+ * version or -EINVAL if unknown.
+ *
+ * The requested feature bits are rejected with -EINVAL if any of them
+ * is not available, and UFFD_FEATURE_EVENT_FORK additionally needs
+ * CAP_SYS_PTRACE (-EPERM otherwise). The accepted features are stored
+ * in ctx->features with cmpxchg() against 0, so a call on a context
+ * whose features are already set fails with -EINVAL.
+ *
+ * On the err_out paths a zeroed struct uffdio_api is copied back to
+ * userland. -EFAULT is returned if userland memory cannot be accessed.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_api uffdio_api;
+ void __user *buf = (void __user *)arg;
+ unsigned int ctx_features;
+ int ret;
+ __u64 features;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+ goto out;
+ features = uffdio_api.features;
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API)
+ goto err_out;
+ ret = -EPERM;
+ if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+ goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
+ /* report all available features and ioctls to userland */
+ uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+#endif
+ if (!pgtable_supports_uffd_wp())
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+
+ if (!uffd_supports_wp_marker()) {
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
+ }
+
+ ret = -EINVAL;
+ if (features & ~uffdio_api.features)
+ goto err_out;
+
+ uffdio_api.ioctls = UFFD_API_IOCTLS;
+ ret = -EFAULT;
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+
+ /* only enable the requested features for this uffd context */
+ ctx_features = uffd_ctx_features(features);
+ ret = -EINVAL;
+ if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+ goto err_out;
+
+ ret = 0;
+out:
+ return ret;
+err_out:
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ ret = -EFAULT;
+ goto out;
+}
+
+/*
+ * ioctl entry point of a userfaultfd file: dispatch @cmd to its handler
+ * and return the handler's result.
+ *
+ * UFFDIO_API is the only command accepted while the context is not yet
+ * initialized. Unknown commands yield -EINVAL.
+ */
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+ unsigned long arg)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+
+ if (cmd == UFFDIO_API)
+ return userfaultfd_api(ctx, arg);
+
+ /* Every other command needs an initialized context. */
+ if (!userfaultfd_is_initialized(ctx))
+ return -EINVAL;
+
+ switch (cmd) {
+ case UFFDIO_REGISTER:
+ return userfaultfd_register(ctx, arg);
+ case UFFDIO_UNREGISTER:
+ return userfaultfd_unregister(ctx, arg);
+ case UFFDIO_WAKE:
+ return userfaultfd_wake(ctx, arg);
+ case UFFDIO_COPY:
+ return userfaultfd_copy(ctx, arg);
+ case UFFDIO_ZEROPAGE:
+ return userfaultfd_zeropage(ctx, arg);
+ case UFFDIO_MOVE:
+ return userfaultfd_move(ctx, arg);
+ case UFFDIO_WRITEPROTECT:
+ return userfaultfd_writeprotect(ctx, arg);
+ case UFFDIO_CONTINUE:
+ return userfaultfd_continue(ctx, arg);
+ case UFFDIO_POISON:
+ return userfaultfd_poison(ctx, arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * fdinfo handler: print how many entries are queued on
+ * fault_pending_wqh ("pending") and on both fault waitqueues together
+ * ("total"), followed by UFFD_API, the context's features and the
+ * ioctl bits.
+ */
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct userfaultfd_ctx *ctx = f->private_data;
+ unsigned long nr_pending = 0, nr_queued = 0;
+ wait_queue_entry_t *waiter;
+
+ /* fault_pending_wqh.lock is held while both lists are walked. */
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(waiter, &ctx->fault_pending_wqh.head, entry)
+ nr_pending++;
+ list_for_each_entry(waiter, &ctx->fault_wqh.head, entry)
+ nr_queued++;
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+
+ /*
+ * If more protocols are added, they will all be shown
+ * separated by a space. Like this:
+ * protocols: aa:... bb:...
+ */
+ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+ nr_pending, nr_pending + nr_queued, UFFD_API, ctx->features,
+ UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+/*
+ * File operations of the "[userfaultfd]" anon inode file that
+ * new_userfaultfd() creates for each context.
+ */
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read_iter = userfaultfd_read_iter,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .llseek = noop_llseek,
+};
+/*
+ * Constructor handed to kmem_cache_create() for the userfaultfd_ctx
+ * cache: initializes the four waitqueue heads of the context and its
+ * refile_seq seqcount, which is associated with fault_pending_wqh.lock.
+ */
+static void init_once_userfaultfd_ctx(void *mem)
+{
+ struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+ init_waitqueue_head(&ctx->fault_pending_wqh);
+ init_waitqueue_head(&ctx->fault_wqh);
+ init_waitqueue_head(&ctx->event_wqh);
+ init_waitqueue_head(&ctx->fd_wqh);
+ seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
+}
+/*
+ * Allocate and set up a userfaultfd context for current->mm and create
+ * a "[userfaultfd]" anon inode file that refers to it.
+ *
+ * @flags may only contain UFFD_SHARED_FCNTL_FLAGS and
+ * UFFD_USER_MODE_ONLY bits.
+ *
+ * Returns the value of fd_publish() (presumably the new file
+ * descriptor) on success, -EINVAL for unknown flag bits, -ENOMEM if the
+ * context cannot be allocated, or fdf.err if preparing the fd/file
+ * fails.
+ */
+static int new_userfaultfd(int flags)
+{
+ struct userfaultfd_ctx *ctx __free(kfree) = NULL;
+
+ VM_WARN_ON_ONCE(!current->mm);
+
+ /* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
+
+ if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
+ return -EINVAL;
+
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refcount, 1);
+ ctx->flags = flags;
+ ctx->features = 0;
+ ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
+ atomic_set(&ctx->mmap_changing, 0);
+ ctx->mm = current->mm;
+
+ FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
+ anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
+ NULL));
+ if (fdf.err)
+ return fdf.err;
+
+ /* prevent the mm struct from being freed */
+ mmgrab(ctx->mm);
+ fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
+}
+
+/*
+ * Decide whether the caller may create a userfaultfd with @flags
+ * through the syscall.
+ */
+static inline bool userfaultfd_syscall_allowed(int flags)
+{
+ /*
+ * Checked in this order:
+ * - userspace-only page fault handling is always allowed;
+ * - a userfaultfd which can handle kernel faults is always
+ *   allowed for privileged users;
+ * - otherwise, access to kernel fault handling is sysctl
+ *   controlled.
+ */
+ return (flags & UFFD_USER_MODE_ONLY) ||
+ capable(CAP_SYS_PTRACE) ||
+ sysctl_unprivileged_userfaultfd;
+}
+
+/*
+ * userfaultfd syscall: create a new userfaultfd via new_userfaultfd()
+ * if userfaultfd_syscall_allowed() permits @flags, else fail with
+ * -EPERM.
+ */
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ if (userfaultfd_syscall_allowed(flags))
+ return new_userfaultfd(flags);
+
+ return -EPERM;
+}
+
+/*
+ * ioctl handler of the userfaultfd misc device. For USERFAULTFD_IOC_NEW
+ * the ioctl argument is passed to new_userfaultfd() as its flags; every
+ * other command fails with -EINVAL.
+ */
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+ if (cmd == USERFAULTFD_IOC_NEW)
+ return new_userfaultfd(flags);
+
+ return -EINVAL;
+}
+/*
+ * File operations of the userfaultfd misc device; native and compat
+ * ioctls share the same handler.
+ */
+static const struct file_operations userfaultfd_dev_fops = {
+ .unlocked_ioctl = userfaultfd_dev_ioctl,
+ .compat_ioctl = userfaultfd_dev_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+/*
+ * Misc device named "userfaultfd" with a dynamically assigned minor;
+ * registered by userfaultfd_init().
+ */
+static struct miscdevice userfaultfd_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "userfaultfd",
+ .fops = &userfaultfd_dev_fops
+};
+/*
+ * Initcall: register the "userfaultfd" misc device, create the slab
+ * cache for struct userfaultfd_ctx (with init_once_userfaultfd_ctx() as
+ * constructor) and, with CONFIG_SYSCTL, register vm_userfaultfd_table
+ * under "vm".
+ *
+ * Returns 0 on success or the misc_register() error. The result of
+ * kmem_cache_create() is not checked here; SLAB_PANIC is passed to it.
+ */
+static int __init userfaultfd_init(void)
+{
+ int ret;
+
+ ret = misc_register(&userfaultfd_misc);
+ if (ret)
+ return ret;
+
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", vm_userfaultfd_table);
+#endif
+ return 0;
+}
+__initcall(userfaultfd_init);
diff --git a/mm/util.c b/mm/util.c
index 3cc949a0b7ed..af2c2103f0d9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1192,6 +1192,7 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc,
desc->vm_file = vma->vm_file;
desc->vma_flags = vma->flags;
desc->page_prot = vma->vm_page_prot;
+ desc->vm_ops = vma->vm_ops;
/* Default. */
desc->action.type = MMAP_NOTHING;
@@ -1396,8 +1397,6 @@ static int mmap_action_finish(struct vm_area_struct *vma,
if (!err)
err = call_vma_mapped(vma);
- if (!err && action->success_hook)
- err = action->success_hook(vma);
/* do_munmap() might take rmap lock, so release if held. */
maybe_rmap_unlock_action(vma, action);
@@ -1415,16 +1414,22 @@ static int mmap_action_finish(struct vm_area_struct *vma,
*/
len = vma_pages(vma) << PAGE_SHIFT;
do_munmap(current->mm, vma->vm_start, len, NULL);
- if (action->error_hook) {
- /* We may want to filter the error. */
- err = action->error_hook(err);
- /* The caller should not clear the error. */
- VM_WARN_ON_ONCE(!err);
- }
- return err;
+
+ return action->error_override ?: err;
}
#ifdef CONFIG_MMU
+/* Warn once and return -EINVAL if error_override is set but !IS_ERR_VALUE(). */
+static int check_mmap_action(struct mmap_action *action)
+{
+ const unsigned long override = action->error_override;
+
+ if (WARN_ON_ONCE(override && !IS_ERR_VALUE(override)))
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* mmap_action_prepare - Perform preparatory setup for an VMA descriptor
* action which need to be performed.
@@ -1434,7 +1439,14 @@ static int mmap_action_finish(struct vm_area_struct *vma,
*/
int mmap_action_prepare(struct vm_area_desc *desc)
{
- switch (desc->action.type) {
+ struct mmap_action *action = &desc->action;
+ int err;
+
+ err = check_mmap_action(action);
+ if (err)
+ return err;
+
+ switch (action->type) {
case MMAP_NOTHING:
return 0;
case MMAP_REMAP_PFN:
diff --git a/mm/vma.c b/mm/vma.c
index d90791b00a7b..9eea2850818a 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2697,6 +2697,8 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma,
{
if (map->vm_ops)
vma->vm_ops = map->vm_ops;
+ else /* Only /dev/zero should do this. */
+ vma_set_anonymous(vma);
vma->vm_private_data = map->vm_private_data;
}
@@ -2744,6 +2746,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
.action = {
.type = MMAP_NOTHING, /* Default to no further action. */
},
+ .vm_ops = &vma_dummy_vm_ops,
};
bool allocated_new = false;
int error;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bb6ae08d18f5..1afca3568b9b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr)
schedule_work(&p->wq);
}
+/*
+ * vm_area_free_pages - free a range of pages from a vmalloc allocation
+ * @vm: the vm_struct containing the pages
+ * @start_idx: first page index to free (inclusive)
+ * @end_idx: last page index to free (exclusive)
+ *
+ * Free pages [start_idx, end_idx). NR_VMALLOC is decremented for each page
+ * unless VM_MAP_PUT_PAGES is set; freed vm->pages[] entries are set to NULL.
+ * Caller is responsible for unmapping (vunmap_range) and KASAN
+ * poisoning before calling this.
+ */
+static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx,
+ unsigned int end_idx)
+{
+ unsigned int i;
+
+ if (!(vm->flags & VM_MAP_PUT_PAGES)) {
+ for (i = start_idx; i < end_idx; i++)
+ mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
+ }
+ free_pages_bulk(vm->pages + start_idx, end_idx - start_idx);
+
+ for (i = start_idx; i < end_idx; i++)
+ vm->pages[i] = NULL;
+}
+
/**
* vfree - Release memory allocated by vmalloc()
* @addr: Memory base address
@@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr)
void vfree(const void *addr)
{
struct vm_struct *vm;
- int i;
if (unlikely(in_interrupt())) {
vfree_atomic(addr);
@@ -3459,19 +3484,8 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
- for (i = 0; i < vm->nr_pages; i++) {
- struct page *page = vm->pages[i];
- BUG_ON(!page);
- /*
- * High-order allocs for huge vmallocs are split, so
- * can be freed as an array of order-0 allocations
- */
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_lruvec_page_state(page, NR_VMALLOC, -1);
- __free_page(page);
- cond_resched();
- }
+ vm_area_free_pages(vm, 0, vm->nr_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -3939,7 +3953,7 @@ fail:
__GFP_NOFAIL | __GFP_ZERO |\
__GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
- GFP_USER | __GFP_NOLOCKDEP)
+ GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN)
static gfp_t vmalloc_fix_flags(gfp_t flags)
{
@@ -3980,6 +3994,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
*
* %__GFP_NOWARN can be used to suppress failure messages.
*
+ * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages
+ * (when prot=%PAGE_KERNEL).
+ *
* Can not be called from interrupt nor NMI contexts.
* Return: the address of the area or %NULL on failure
*/
@@ -3993,6 +4010,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
+ bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN);
if (WARN_ON_ONCE(!size))
return NULL;
@@ -4023,7 +4041,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
again:
area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
- gfp_mask, caller);
+ gfp_mask & ~__GFP_SKIP_KASAN, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
@@ -4041,7 +4059,7 @@ again:
* kasan_unpoison_vmalloc().
*/
if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
- if (kasan_hw_tags_enabled()) {
+ if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) {
/*
* Modify protection bits to allow tagging.
* This must be done before mapping.
@@ -4078,7 +4096,8 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
+ if (!skip_vmalloc_kasan)
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -4324,16 +4343,70 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
nid != page_to_nid(vmalloc_to_page(p)))
goto need_realloc;
+ } else {
+ /*
+ * If p is NULL, vrealloc behaves exactly like vmalloc.
+ * Skip the shrink and in-place grow paths.
+ */
+ goto need_realloc;
}
- /*
- * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
- * would be a good heuristic for when to shrink the vm_area?
- */
if (size <= old_size) {
+ unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
/* Zero out "freed" memory, potentially for future realloc. */
if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+
+ /*
+ * Free tail pages when shrink crosses a page boundary.
+ *
+ * Skip huge page allocations (page_order > 0) as partial
+ * freeing would require splitting.
+ *
+ * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must
+ * be reset before pages are returned to the allocator.
+ *
+ * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates
+ * mapping requests against the unchanged vm->size; freeing
+ * tail pages would cause vmalloc_to_page() to return NULL for
+ * the unmapped range.
+ *
+ * Skip if either GFP_NOFS or GFP_NOIO are used.
+ * kmemleak_free_part() internally allocates with
+ * GFP_KERNEL, which could trigger a recursive deadlock
+ * if we are under filesystem or I/O reclaim.
+ */
+ if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) &&
+ !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) &&
+ gfp_has_io_fs(flags)) {
+ unsigned long addr = (unsigned long)kasan_reset_tag(p);
+ unsigned int old_nr_pages = vm->nr_pages;
+
+ /*
+ * Use the node lock to synchronize with concurrent
+ * readers (vmalloc_info_show).
+ */
+ struct vmap_node *vn = addr_to_node(addr);
+
+ spin_lock(&vn->busy.lock);
+ vm->nr_pages = new_nr_pages;
+ spin_unlock(&vn->busy.lock);
+
+ /* Notify kmemleak of the reduced allocation size before unmapping. */
+ kmemleak_free_part(
+ (void *)addr + ((unsigned long)new_nr_pages
+ << PAGE_SHIFT),
+ (unsigned long)(old_nr_pages - new_nr_pages)
+ << PAGE_SHIFT);
+
+ vunmap_range(addr + ((unsigned long)new_nr_pages
+ << PAGE_SHIFT),
+ addr + ((unsigned long)old_nr_pages
+ << PAGE_SHIFT));
+
+ vm_area_free_pages(vm, new_nr_pages, old_nr_pages);
+ }
vm->requested_size = size;
kasan_vrealloc(p, old_size, size);
return (void *)p;
@@ -4342,7 +4415,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
/*
* We already have the bytes available in the allocation; use them.
*/
- if (size <= alloced_size) {
+ if (size <= vm->nr_pages << PAGE_SHIFT) {
/*
* No need to zero memory here, as unused memory will have
* already been zeroed at initial allocation time or during
@@ -4641,7 +4714,18 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
smp_rmb();
vaddr = (char *) va->va_start;
- size = vm ? get_vm_area_size(vm) : va_size(va);
+ if (vm)
+ /*
+ * For VM_ALLOC areas, use nr_pages rather than
+ * get_vm_area_size() because vrealloc() may shrink
+ * the mapping without updating area->size. Other
+ * mapping types (vmap, ioremap) don't set nr_pages.
+ */
+ size = (vm->flags & VM_ALLOC && vm->nr_pages) ?
+ (vm->nr_pages << PAGE_SHIFT) :
+ get_vm_area_size(vm);
+ else
+ size = va_size(va);
if (addr >= vaddr + size)
goto next_va;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 3fbb86996c4d..f053554e5826 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work)
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
+ * @order: allocation order being reclaimed for
* @memcg: cgroup memory controller handle
* @tree: legacy subtree mode
* @scanned: number of pages scanned
@@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work)
*
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr;
@@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
level = vmpressure_calc_level(scanned, reclaimed);
- if (level > VMPRESSURE_LOW) {
+ /*
+ * Once we go above COSTLY_ORDER, reclaim relies heavily on
+ * compaction to make progress. Reclaim efficiency was never a
+ * great proxy for pressure to begin with, but it's outright
+ * misleading with these high orders. Don't throttle sockets
+ * because somebody is attempting something crazy like an order-7
+ * and predictably struggling.
+ */
+ if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) {
/*
* Let the socket buffer allocator know that
* we are having trouble reclaiming LRU pages.
@@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, true, vmpressure_win, 0);
+ vmpressure(gfp, 0, memcg, true, vmpressure_win, 0);
}
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 67231d3189ef..b21a15f36cce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -109,7 +109,7 @@ struct scan_control {
/* zone_reclaim_mode */
unsigned int may_unmap:1;
- /* zome_reclaim_mode, boost reclaim, cgroup restrictions */
+ /* zone_reclaim_mode, boost reclaim, cgroup restrictions */
unsigned int may_swap:1;
/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -169,11 +169,9 @@ struct scan_control {
struct {
unsigned int dirty;
- unsigned int unqueued_dirty;
unsigned int congested;
unsigned int writeback;
unsigned int immediate;
- unsigned int file_taken;
unsigned int taken;
} nr;
@@ -739,7 +737,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- memcg1_swapout(folio, swap);
+ __memcg1_swapout(folio, ci);
__swap_cache_del_folio(ci, folio, swap, shadow);
swap_cluster_unlock_irq(ci);
} else {
@@ -850,7 +848,11 @@ static bool lru_gen_set_refs(struct folio *folio)
return false;
}
- set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
+ /* Promote on second access */
+ if (folio_lru_refs(folio) > 1)
+ set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
+ else
+ folio_mark_accessed(folio);
return true;
}
#else
@@ -1944,6 +1946,44 @@ static int current_may_throttle(void)
return !(current->flags & PF_LOCAL_THROTTLE);
}
+static void handle_reclaim_writeback(unsigned long nr_taken,
+ struct pglist_data *pgdat,
+ struct scan_control *sc,
+ struct reclaim_stat *stat)
+{
+ /*
+ * If dirty folios are scanned that are not queued for IO, it
+ * implies that flushers are not doing their job. This can
+ * happen when memory pressure pushes dirty folios to the end of
+ * the LRU before the dirty limits are breached and the dirty
+ * data has expired. It can also happen when the proportion of
+ * dirty folios grows not through writes but through memory
+ * pressure reclaiming all the clean cache. And in some cases,
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep.
+ */
+ if (stat->nr_unqueued_dirty == nr_taken) {
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+ * For cgroupv1 dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting on folios
+ * which are in writeback to finish (see shrink_folio_list()).
+ *
+ * Flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
+
+ sc->nr.dirty += stat->nr_dirty;
+ sc->nr.congested += stat->nr_congested;
+ sc->nr.writeback += stat->nr_writeback;
+ sc->nr.immediate += stat->nr_immediate;
+ sc->nr.taken += nr_taken;
+}
+
/*
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
@@ -2007,42 +2047,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
lruvec_lock_irq(lruvec);
lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
nr_scanned - nr_reclaimed);
-
- /*
- * If dirty folios are scanned that are not queued for IO, it
- * implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty folios to the end of
- * the LRU before the dirty limits are breached and the dirty
- * data has expired. It can also happen when the proportion of
- * dirty folios grows not through writes but through memory
- * pressure reclaiming all the clean cache. And in some cases,
- * the flushers simply cannot keep up with the allocation
- * rate. Nudge the flusher threads in case they are asleep.
- */
- if (stat.nr_unqueued_dirty == nr_taken) {
- wakeup_flusher_threads(WB_REASON_VMSCAN);
- /*
- * For cgroupv1 dirty throttling is achieved by waking up
- * the kernel flusher here and later waiting on folios
- * which are in writeback to finish (see shrink_folio_list()).
- *
- * Flusher may not be able to issue writeback quickly
- * enough for cgroupv1 writeback throttling to work
- * on a large system.
- */
- if (!writeback_throttling_sane(sc))
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
- }
-
- sc->nr.dirty += stat.nr_dirty;
- sc->nr.congested += stat.nr_congested;
- sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
- sc->nr.writeback += stat.nr_writeback;
- sc->nr.immediate += stat.nr_immediate;
- sc->nr.taken += nr_taken;
- if (file)
- sc->nr.file_taken += nr_taken;
-
+ handle_reclaim_writeback(nr_taken, pgdat, sc, &stat);
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
nr_scanned, nr_reclaimed, &stat, sc->priority, file);
return nr_reclaimed;
@@ -3220,7 +3225,7 @@ static int folio_update_gen(struct folio *folio, int gen)
}
/* protect pages accessed multiple times through file descriptors */
-static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio)
{
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
@@ -3239,9 +3244,6 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
- /* for folio_end_writeback() */
- if (reclaiming)
- new_flags |= BIT(PG_reclaim);
} while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
lru_gen_update_size(lruvec, folio, old_gen, new_gen);
@@ -3855,7 +3857,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
- new_gen = folio_inc_gen(lruvec, folio, false);
+ new_gen = folio_inc_gen(lruvec, folio);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
/* don't count the workingset being lazily promoted */
@@ -3878,10 +3880,9 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
+static void try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
- bool success = false;
bool seq_inc_flag = false;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
@@ -3907,11 +3908,10 @@ next:
/*
* If min_seq[type] of both anonymous and file is not increased,
- * we can directly return false to avoid unnecessary checking
- * overhead later.
+ * return here to avoid unnecessary checking overhead later.
*/
if (!seq_inc_flag)
- return success;
+ return;
/* see the comment on lru_gen_folio */
if (swappiness && swappiness <= MAX_SWAPPINESS) {
@@ -3929,10 +3929,7 @@ next:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
- success = true;
}
-
- return success;
}
static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
@@ -4084,27 +4081,33 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
}
-static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+static unsigned long lruvec_evictable_size(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
- unsigned long total = 0;
- int swappiness = get_swappiness(lruvec, sc);
+ unsigned long seq, total = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
for_each_evictable_type(type, swappiness) {
- unsigned long seq;
-
for (seq = min_seq[type]; seq <= max_seq; seq++) {
gen = lru_gen_from_seq(seq);
-
for (zone = 0; zone < MAX_NR_ZONES; zone++)
total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
}
}
+ return total;
+}
+
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+{
+ unsigned long total;
+ int swappiness = get_swappiness(lruvec, sc);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ total = lruvec_evictable_size(lruvec, swappiness);
+
/* whether the size is big enough to be helpful */
return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
}
@@ -4577,7 +4580,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int tier_idx)
{
bool success;
- bool dirty, writeback;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -4607,7 +4609,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
/* protected */
if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
- gen = folio_inc_gen(lruvec, folio, false);
+ gen = folio_inc_gen(lruvec, folio);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
/* don't count the workingset being lazily promoted */
@@ -4622,26 +4624,11 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
/* ineligible */
if (zone > sc->reclaim_idx) {
- gen = folio_inc_gen(lruvec, folio, false);
+ gen = folio_inc_gen(lruvec, folio);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
- dirty = folio_test_dirty(folio);
- writeback = folio_test_writeback(folio);
- if (type == LRU_GEN_FILE && dirty) {
- sc->nr.file_taken += delta;
- if (!writeback)
- sc->nr.unqueued_dirty += delta;
- }
-
- /* waiting for writeback */
- if (writeback || (type == LRU_GEN_FILE && dirty)) {
- gen = folio_inc_gen(lruvec, folio, true);
- list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
- return true;
- }
-
return false;
}
@@ -4649,12 +4636,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* swap constrained */
- if (!(sc->gfp_mask & __GFP_IO) &&
- (folio_test_dirty(folio) ||
- (folio_test_anon(folio) && !folio_test_swapcache(folio))))
- return false;
-
/* raced with release_pages() */
if (!folio_try_get(folio))
return false;
@@ -4669,9 +4650,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
if (!folio_test_referenced(folio))
set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0);
- /* for shrink_folio_list() */
- folio_clear_reclaim(folio);
-
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4680,7 +4658,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, int type, int tier,
- struct list_head *list)
+ struct list_head *list, int *isolatedp)
{
int i;
int gen;
@@ -4689,10 +4667,10 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
int scanned = 0;
int isolated = 0;
int skipped = 0;
- int scan_batch = min(nr_to_scan, MAX_LRU_BATCH);
- int remaining = scan_batch;
+ unsigned long remaining = nr_to_scan;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ VM_WARN_ON_ONCE(nr_to_scan > MAX_LRU_BATCH);
VM_WARN_ON_ONCE(!list_empty(list));
if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
@@ -4745,16 +4723,12 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
mod_lruvec_state(lruvec, item, isolated);
mod_lruvec_state(lruvec, PGREFILL, sorted);
mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated);
- trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch,
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
scanned, skipped, isolated,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
- if (type == LRU_GEN_FILE)
- sc->nr.file_taken += isolated;
- /*
- * There might not be eligible folios due to reclaim_idx. Check the
- * remaining to prevent livelock if it's not making progress.
- */
- return isolated || !remaining ? scanned : 0;
+
+ *isolatedp = isolated;
+ return scanned;
}
static int get_tier_idx(struct lruvec *lruvec, int type)
@@ -4798,33 +4772,41 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, int swappiness,
- int *type_scanned, struct list_head *list)
+ struct list_head *list, int *isolated,
+ int *isolate_type, int *isolate_scanned)
{
int i;
+ int total_scanned = 0;
int type = get_type_to_scan(lruvec, swappiness);
for_each_evictable_type(i, swappiness) {
int scanned;
int tier = get_tier_idx(lruvec, type);
- *type_scanned = type;
-
- scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
- if (scanned)
- return scanned;
+ scanned = scan_folios(nr_to_scan, lruvec, sc,
+ type, tier, list, isolated);
- type = !type;
+ total_scanned += scanned;
+ if (*isolated) {
+ *isolate_type = type;
+ *isolate_scanned = scanned;
+ break;
+ }
+ /*
+ * If scanned > 0 and isolated == 0, avoid falling back to the
+ * other type, as this type remains sufficient. Falling back
+ * too readily can disrupt the positive_ctrl_err() bias.
+ */
+ if (!scanned)
+ type = !type;
}
- return 0;
+ return total_scanned;
}
static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, int swappiness)
{
- int type;
- int scanned;
- int reclaimed;
LIST_HEAD(list);
LIST_HEAD(clean);
struct folio *folio;
@@ -4832,19 +4814,23 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
enum node_stat_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
+ int scanned, reclaimed;
+ int isolated = 0, type, type_scanned;
bool skip_retry = false;
- struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lruvec_lock_irq(lruvec);
- scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
+ /* In case folio deletion left empty old gens, flush them */
+ try_to_inc_min_seq(lruvec, swappiness);
- scanned += try_to_inc_min_seq(lruvec, swappiness);
+ scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness,
+ &list, &isolated, &type, &type_scanned);
- if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
- scanned = 0;
+ /* Scanning may have emptied the oldest gen, flush it */
+ if (scanned)
+ try_to_inc_min_seq(lruvec, swappiness);
lruvec_unlock_irq(lruvec);
@@ -4852,10 +4838,12 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
return scanned;
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
- sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
+ /* Retry pass is only meant for clean folios without new isolation */
+ if (isolated)
+ handle_reclaim_writeback(isolated, pgdat, sc, &stat);
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
- scanned, reclaimed, &stat, sc->priority,
+ type_scanned, reclaimed, &stat, sc->priority,
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
@@ -4900,6 +4888,7 @@ retry:
if (!list_empty(&list)) {
skip_retry = true;
+ isolated = 0;
goto retry;
}
@@ -4907,63 +4896,37 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- int swappiness, unsigned long *nr_to_scan)
+ struct scan_control *sc, int swappiness)
{
- int gen, type, zone;
- unsigned long size = 0;
- struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- *nr_to_scan = 0;
/* have to run aging, since eviction is not possible anymore */
if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- for_each_evictable_type(type, swappiness) {
- unsigned long seq;
-
- for (seq = min_seq[type]; seq <= max_seq; seq++) {
- gen = lru_gen_from_seq(seq);
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
- }
- }
+ /* try to avoid aging, do gentle reclaim at the default priority */
+ if (sc->priority == DEF_PRIORITY)
+ return false;
- *nr_to_scan = size;
/* better to run aging even though eviction is still possible */
return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
-/*
- * For future optimizations:
- * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
- * reclaim.
- */
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+ struct mem_cgroup *memcg, int swappiness)
{
- bool success;
- unsigned long nr_to_scan;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MAX_SEQ(lruvec);
-
- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
- return -1;
+ unsigned long nr_to_scan, evictable;
- success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
+ evictable = lruvec_evictable_size(lruvec, swappiness);
/* try to scrape all its memory if this memcg was deleted */
- if (nr_to_scan && !mem_cgroup_online(memcg))
- return nr_to_scan;
+ if (!mem_cgroup_online(memcg))
+ return evictable;
- nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
+ nr_to_scan = apply_proportional_protection(memcg, sc, evictable);
+ nr_to_scan >>= sc->priority;
- /* try to get away with not aging at the default priority */
- if (!success || sc->priority == DEF_PRIORITY)
- return nr_to_scan >> sc->priority;
-
- /* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
+ return nr_to_scan;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4993,62 +4956,59 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
return true;
}
+/*
+ * For future optimizations:
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ * reclaim.
+ */
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- long nr_to_scan;
- unsigned long scanned = 0;
+ bool need_rotate = false, should_age = false;
+ long nr_batch, nr_to_scan;
int swappiness = get_swappiness(lruvec, sc);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- while (true) {
+ nr_to_scan = get_nr_to_scan(lruvec, sc, memcg, swappiness);
+ while (nr_to_scan > 0) {
int delta;
+ DEFINE_MAX_SEQ(lruvec);
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (nr_to_scan <= 0)
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) {
+ need_rotate = true;
break;
+ }
- delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
- if (!delta)
- break;
+ if (should_run_aging(lruvec, max_seq, sc, swappiness)) {
+ if (try_to_inc_max_seq(lruvec, max_seq, swappiness, false))
+ need_rotate = true;
+ should_age = true;
+ }
- scanned += delta;
- if (scanned >= nr_to_scan)
+ nr_batch = min(nr_to_scan, MIN_LRU_BATCH);
+ delta = evict_folios(nr_batch, lruvec, sc, swappiness);
+ if (!delta)
break;
if (should_abort_scan(lruvec, sc))
break;
- cond_resched();
- }
-
- /*
- * If too many file cache in the coldest generation can't be evicted
- * due to being dirty, wake up the flusher.
- */
- if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) {
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
- wakeup_flusher_threads(WB_REASON_VMSCAN);
-
/*
- * For cgroupv1 dirty throttling is achieved by waking up
- * the kernel flusher here and later waiting on folios
- * which are in writeback to finish (see shrink_folio_list()).
- *
- * Flusher may not be able to issue writeback quickly
- * enough for cgroupv1 writeback throttling to work
- * on a large system.
+ * Root reclaim needs rotation when low on cold folio for better
+ * fairness. Cgroup reclaim gets fairness from the iterator.
*/
- if (!writeback_throttling_sane(sc))
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ if (root_reclaim(sc) && should_age)
+ break;
+
+ nr_to_scan -= delta;
+ cond_resched();
}
- /* whether this lruvec should be rotated */
- return nr_to_scan < 0;
+ return need_rotate;
}
static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
{
- bool success;
+ bool need_rotate;
unsigned long scanned = sc->nr_scanned;
unsigned long reclaimed = sc->nr_reclaimed;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -5066,20 +5026,20 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
}
- success = try_to_shrink_lruvec(lruvec, sc);
+ need_rotate = try_to_shrink_lruvec(lruvec, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
+ sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
flush_reclaim_state(sc);
- if (success && mem_cgroup_online(memcg))
+ if (need_rotate && mem_cgroup_online(memcg))
return MEMCG_LRU_YOUNG;
- if (!success && lruvec_is_sizable(lruvec, sc))
+ if (!need_rotate && lruvec_is_sizable(lruvec, sc))
return 0;
/* one retry if offlined or too small */
@@ -5631,6 +5591,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq,
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
int swappiness, unsigned long nr_to_reclaim)
{
+ int nr_batch;
DEFINE_MAX_SEQ(lruvec);
if (seq + MIN_NR_GENS > max_seq)
@@ -5647,8 +5608,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
- swappiness))
+ nr_batch = min(nr_to_reclaim - sc->nr_reclaimed, MAX_LRU_BATCH);
+ if (!evict_folios(nr_batch, lruvec, sc, swappiness))
return 0;
cond_resched();
@@ -6175,7 +6136,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
/* Record the group's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false,
+ vmpressure(sc->gfp_mask, sc->order, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
@@ -6220,7 +6181,7 @@ again:
/* Record the subtree's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned, nr_node_reclaimed);
if (nr_node_reclaimed)
@@ -6359,7 +6320,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
if (current_is_kswapd() || cgroup_reclaim(sc))
return;
- /* Throttle if making no progress at high prioities. */
+ /* Throttle if making no progress at high priorities. */
if (sc->priority == 1 && !sc->nr_reclaimed)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
@@ -7053,7 +7014,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/*
* Fragmentation may mean that the system cannot be rebalanced for
- * high-order allocations. If twice the allocation size has been
+ * high-order allocations. If at least the compaction gap has been
* reclaimed then recheck watermarks only at order-0 to prevent
* excessive reclaim. Assume that a process requested a high-order
* can direct reclaim/compact.
@@ -7121,6 +7082,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.may_unmap = 1,
};
+ trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order,
+ highest_zoneidx);
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
@@ -7222,7 +7185,7 @@ restart:
/*
* There should be no need to raise the scanning priority if
- * enough pages are already being scanned that that high
+ * enough pages are already being scanned that the high
* watermark would be met at 100% efficiency.
*/
if (kswapd_shrink_node(pgdat, &sc))
@@ -7314,6 +7277,9 @@ out:
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
+ trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order,
+ highest_zoneidx, sc.nr_reclaimed);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/mm/workingset.c b/mm/workingset.c
index 07e6836d0502..f351798e723a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -319,11 +319,13 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- /* see folio_add_lru() where folio_set_active() will be called */
- if (lru_gen_in_fault())
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
-
if (workingset) {
+ /*
+ * see folio_add_lru(), where folio_set_active() is
+ * called for workingset folios
+ */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
} else
diff --git a/mm/zswap.c b/mm/zswap.c
index 4b5149173b0e..761cd699e0a3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -991,7 +991,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
pgoff_t offset = swp_offset(swpentry);
struct folio *folio;
struct mempolicy *mpol;
- bool folio_was_allocated;
struct swap_info_struct *si;
int ret = 0;
@@ -1001,23 +1000,19 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return -EEXIST;
mpol = get_task_policy(current);
- folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated);
+ folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
+ NO_INTERLEAVE_INDEX);
put_swap_device(si);
- if (!folio)
- return -ENOMEM;
/*
- * Found an existing folio, we raced with swapin or concurrent
- * shrinker. We generally writeback cold folios from zswap, and
- * swapin means the folio just became hot, so skip this folio.
- * For unlikely concurrent shrinker case, it will be unlinked
- * and freed when invalidated by the concurrent shrinker anyway.
+ * Swap cache allocation might fail due to OOM, or the entry
+ * may already be cached due to concurrent swapin or have been
+ * freed. If already cached, a concurrent swapin made the folio
+ * hot, so skip it. For the unlikely concurrent shrinker case,
+ * it will be unlinked and freed when invalidated anyway.
*/
- if (!folio_was_allocated) {
- ret = -EEXIST;
- goto out;
- }
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* folio is locked, and the swapcache is now secured against
@@ -1057,7 +1052,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
__swap_writepage(folio, NULL);
out:
- if (ret && ret != -EEXIST) {
+ if (ret) {
swap_cache_del_folio(folio);
folio_unlock(folio);
}