diff options
Diffstat (limited to 'mm')
67 files changed, 7068 insertions, 2551 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 4f187b07eb48..fe734d9bbe99 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -590,7 +590,7 @@ endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" - select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) + select HAVE_BOOTMEM_INFO_NODE if X86_64 depends on MEMORY_HOTPLUG select MIGRATION @@ -863,7 +863,6 @@ if TRANSPARENT_HUGEPAGE choice prompt "Transparent Hugepage Support sysfs defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_ALWAYS help Selects the sysfs defaults for Transparent Hugepage Support. @@ -893,7 +892,6 @@ endchoice choice prompt "Shmem hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -939,7 +937,6 @@ endchoice choice prompt "Tmpfs hugepage allocation defaults" - depends on TRANSPARENT_HUGEPAGE default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER help Selects the hugepage allocation policy defaults for @@ -984,7 +981,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT + depends on ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. 
XXX: For now, swap cluster backing transparent huge page diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..eff9f9e7e061 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,9 +103,6 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_LIVEUPDATE_MEMFD) += memfd_luo.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -ifdef CONFIG_SWAP -obj-$(CONFIG_MEMCG) += swap_cgroup.o -endif ifdef CONFIG_BPF_SYSCALL obj-$(CONFIG_MEMCG) += bpf_memcontrol.o endif diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 3d7675a3ae04..0fa78db7fbc0 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -19,7 +19,6 @@ void get_page_bootmem(unsigned long info, struct page *page, { BUG_ON(type > 0xf); BUG_ON(info > (ULONG_MAX >> 4)); - SetPagePrivate(page); set_page_private(page, info << 4 | type); page_ref_inc(page); } @@ -32,20 +31,15 @@ void put_page_bootmem(struct page *page) type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { - ClearPagePrivate(page); set_page_private(page, 0); - INIT_LIST_HEAD(&page->lru); - kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } } static void __init register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long mapsize, section_nr, i; + unsigned long section_nr; struct mem_section *ms; - struct mem_section_usage *usage; - struct page *page; start_pfn = SECTION_ALIGN_DOWN(start_pfn); section_nr = pfn_to_section_nr(start_pfn); @@ -54,27 +48,12 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) if (!preinited_vmemmap_section(ms)) register_page_bootmem_memmap(section_nr, pfn_to_page(start_pfn), PAGES_PER_SECTION); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } void __init register_page_bootmem_info_node(struct 
pglist_data *pgdat) { - unsigned long i, pfn, end_pfn, nr_pages; + unsigned long pfn, end_pfn; int node = pgdat->node_id; - struct page *page; - - nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; - page = virt_to_page(pgdat); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); diff --git a/mm/compaction.c b/mm/compaction.c index 3648ce22c807..b776f35ad020 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1123,7 +1123,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * To minimise LRU disruption, the caller can indicate with * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages * it will be able to migrate without blocking - clean pages - * for the most part. PageWriteback would require blocking. + * for the most part. Writeback would require blocking. */ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio)) goto isolate_fail_put; @@ -2340,7 +2340,8 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, true) >= 0) + if (find_suitable_fallback(area, order, migratetype, true, NULL) + == FALLBACK_FOUND) /* * Movable pages are OK in any pageblock. 
If we are * stealing for a non-movable allocation, make sure @@ -2447,7 +2448,7 @@ bool compaction_suitable(struct zone *zone, int order, unsigned long watermark, /* Used by direct reclaimers */ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, - int alloc_flags) + int alloc_flags, gfp_t gfp_mask) { struct zone *zone; struct zoneref *z; @@ -2460,6 +2461,10 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, ac->highest_zoneidx, ac->nodemask) { unsigned long available; + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* * Do not consider all the reclaimable memory because we do not * want to trash just for a single high order allocation which diff --git a/mm/damon/core.c b/mm/damon/core.c index 3dbbbfdeff71..265d51ade25b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -13,10 +13,14 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/psi.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_choices.h> +/* for damon_get_folio() used by node eligible memory metrics */ +#include "ops-common.h" + #define CREATE_TRACE_POINTS #include <trace/events/damon.h> @@ -109,6 +113,103 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +struct damon_filter *damon_new_filter(enum damon_filter_type type, + bool matching, bool allow) +{ + struct damon_filter *filter; + + filter = kmalloc_obj(*filter); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + filter->allow = allow; + INIT_LIST_HEAD(&filter->list); + return filter; +} + +void damon_add_filter(struct damon_probe *p, struct damon_filter *f) +{ + list_add_tail(&f->list, &p->filters); +} + +static void damon_del_filter(struct damon_filter *f) +{ + list_del(&f->list); +} + +static void damon_free_filter(struct damon_filter *f) +{ + kfree(f); +} + +void damon_destroy_filter(struct 
damon_filter *f) +{ + damon_del_filter(f); + damon_free_filter(f); +} + +static struct damon_filter *damon_nth_filter(int n, struct damon_probe *p) +{ + struct damon_filter *f; + int i = 0; + + damon_for_each_filter(f, p) { + if (i++ == n) + return f; + } + return NULL; +} + +struct damon_probe *damon_new_probe(void) +{ + struct damon_probe *p; + + p = kmalloc_obj(*p); + if (!p) + return NULL; + INIT_LIST_HEAD(&p->filters); + INIT_LIST_HEAD(&p->list); + return p; +} + +void damon_add_probe(struct damon_ctx *ctx, struct damon_probe *probe) +{ + list_add_tail(&probe->list, &ctx->probes); +} + +static void damon_del_probe(struct damon_probe *p) +{ + list_del(&p->list); +} + +static void damon_free_probe(struct damon_probe *p) +{ + struct damon_filter *f, *next; + + damon_for_each_filter_safe(f, next, p) + damon_free_filter(f); + kfree(p); +} + +static void damon_destroy_probe(struct damon_probe *p) +{ + damon_del_probe(p); + damon_free_probe(p); +} + +static struct damon_probe *damon_nth_probe(int n, struct damon_ctx *ctx) +{ + struct damon_probe *p; + int i = 0; + + damon_for_each_probe(p, ctx) { + if (i++ == n) + return p; + } + return NULL; +} + #ifdef CONFIG_DAMON_DEBUG_SANITY static void damon_verify_new_region(unsigned long start, unsigned long end) { @@ -128,6 +229,7 @@ static void damon_verify_new_region(unsigned long start, unsigned long end) struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; + int i; damon_verify_new_region(start, end); region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); @@ -138,6 +240,8 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->ar.end = end; region->nr_accesses = 0; region->nr_accesses_bp = 0; + for (i = 0; i < DAMON_MAX_PROBES; i++) + region->probe_hits[i] = 0; INIT_LIST_HEAD(&region->list); region->age = 0; @@ -146,12 +250,23 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -void
damon_add_region(struct damon_region *r, struct damon_target *t) +static void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); t->nr_regions++; } +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next, + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + #ifdef CONFIG_DAMON_DEBUG_SANITY static void damon_verify_del_region(struct damon_target *t) { @@ -176,7 +291,8 @@ static void damon_free_region(struct damon_region *r) kmem_cache_free(damon_region_cache, r); } -void damon_destroy_region(struct damon_region *r, struct damon_target *t) +static void damon_destroy_region(struct damon_region *r, + struct damon_target *t) { damon_del_region(r, t); damon_free_region(r); @@ -252,11 +368,25 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + if (!damon_nr_regions(t)) { + for (i = 0; i < nr_ranges; i++) { + r = damon_new_region( + ALIGN_DOWN(ranges[i].start, + min_region_sz), + ALIGN(ranges[i].end, min_region_sz)); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + } + return 0; + } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; struct damon_addr_range *range; + bool insert_before_r = false; range = &ranges[i]; /* Get the first/last regions intersecting with the range */ @@ -266,8 +396,10 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first = r; last = r; } - if (r->ar.start >= range->end) + if (r->ar.start >= range->end) { + insert_before_r = true; break; + } } if (!first) { /* no region intersects with this range */ @@ -277,7 +409,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, 
ALIGN(range->end, min_region_sz)); if (!newr) return -ENOMEM; - damon_insert_region(newr, damon_prev_region(r), r, t); + if (insert_before_r) + damon_insert_region(newr, damon_prev_region(r), + r, t); + else + damon_add_region(newr, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, @@ -550,27 +686,8 @@ void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) damon_free_target(t); } -#ifdef CONFIG_DAMON_DEBUG_SANITY -static void damon_verify_nr_regions(struct damon_target *t) -{ - struct damon_region *r; - unsigned int count = 0; - - damon_for_each_region(r, t) - count++; - WARN_ONCE(count != t->nr_regions, "t->nr_regions (%u) != count (%u)\n", - t->nr_regions, count); -} -#else -static void damon_verify_nr_regions(struct damon_target *t) -{ -} -#endif - unsigned int damon_nr_regions(struct damon_target *t) { - damon_verify_nr_regions(t); - return t->nr_regions; } @@ -601,12 +718,16 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + INIT_LIST_HEAD(&ctx->probes); + ctx->addr_unit = 1; ctx->min_region_sz = DAMON_MIN_REGION_SZ; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); + prandom_seed_state(&ctx->rnd_state, get_random_u64()); + return ctx; } @@ -621,12 +742,16 @@ static void damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { struct damos *s, *next_s; + struct damon_probe *p, *next_p; damon_destroy_targets(ctx); damon_for_each_scheme_safe(s, next_s, ctx) damon_destroy_scheme(s); + damon_for_each_probe_safe(p, next_p, ctx) + damon_destroy_probe(p); + kfree(ctx); } @@ -797,6 +922,9 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) attrs->aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->passed_sample_intervals + attrs->ops_update_interval / sample_interval; + /* + * next_intervals_tune_sis will be updated inside kdamond_fn(). 
+ */ damon_update_monitoring_results(ctx, attrs, aggregating); ctx->attrs = *attrs; @@ -918,6 +1046,8 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) if (err) return err; dst->goal_tuner = src->goal_tuner; + dst->fail_charge_num = src->fail_charge_num; + dst->fail_charge_denom = src->fail_charge_denom; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; @@ -1310,6 +1440,86 @@ static int damon_commit_targets( return 0; } +static void damon_commit_filter(struct damon_filter *dst, + struct damon_filter *src) +{ + dst->type = src->type; + dst->matching = src->matching; + dst->allow = src->allow; + switch (dst->type) { + case DAMON_FILTER_TYPE_MEMCG: + dst->memcg_id = src->memcg_id; + break; + default: + break; + } +} + +static int damon_commit_filters(struct damon_probe *dst, + struct damon_probe *src) +{ + struct damon_filter *dst_filter, *next, *src_filter, *new_filter; + int i = 0, j = 0; + + damon_for_each_filter_safe(dst_filter, next, dst) { + src_filter = damon_nth_filter(i++, src); + if (src_filter) + damon_commit_filter(dst_filter, src_filter); + else + damon_destroy_filter(dst_filter); + } + + damon_for_each_filter_safe(src_filter, next, src) { + if (j++ < i) + continue; + + new_filter = damon_new_filter(src_filter->type, + src_filter->matching, src_filter->allow); + if (!new_filter) + return -ENOMEM; + switch (src_filter->type) { + case DAMON_FILTER_TYPE_MEMCG: + new_filter->memcg_id = src_filter->memcg_id; + break; + default: + break; + } + damon_add_filter(dst, new_filter); + } + return 0; +} + +static int damon_commit_probes(struct damon_ctx *dst, struct damon_ctx *src) +{ + struct damon_probe *dst_probe, *next, *src_probe, *new_probe; + int i = 0, j = 0, err; + + damon_for_each_probe_safe(dst_probe, next, dst) { + src_probe = damon_nth_probe(i++, src); + if (src_probe) { + err = damon_commit_filters(dst_probe, src_probe); + if (err) + return err; + } else 
{ + damon_destroy_probe(dst_probe); + } + } + + damon_for_each_probe_safe(src_probe, next, src) { + if (j++ < i) + continue; + + new_probe = damon_new_probe(); + if (!new_probe) + return -ENOMEM; + damon_add_probe(dst, new_probe); + err = damon_commit_filters(new_probe, src_probe); + if (err) + return err; + } + return 0; +} + /** * damon_commit_ctx() - Commit parameters of a DAMON context to another. * @dst: The commit destination DAMON context. @@ -1326,11 +1536,26 @@ static int damon_commit_targets( int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + struct damos *scheme; + struct damos_quota_goal *goal; dst->maybe_corrupted = true; if (!is_power_of_2(src->min_region_sz)) return -EINVAL; + /* node_eligible_mem_bp metric requires PADDR ops */ + if (src->ops.id != DAMON_OPS_PADDR) { + damon_for_each_scheme(scheme, src) { + struct damos_quota *quota = &scheme->quota; + + damos_for_each_quota_goal(goal, quota) { + if (goal->metric == + DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP) + return -EINVAL; + } + } + } + err = damon_commit_schemes(dst, src); if (err) return err; @@ -1349,7 +1574,11 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) if (err) return err; } + dst->pause = src->pause; dst->ops = src->ops; + err = damon_commit_probes(dst, src); + if (err) + return err; dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; @@ -1706,15 +1935,28 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; unsigned int ti = 0; /* target's index */ + unsigned int nr_probes = 0; + struct damon_probe *probe; + + if (trace_damon_region_aggregated_enabled()) { + damon_for_each_probe(probe, c) + nr_probes++; + } damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { + int i; + trace_damon_aggregated(ti, r, damon_nr_regions(t)); + trace_damon_region_aggregated(ti, r, + damon_nr_regions(t), nr_probes); damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = 
r->nr_accesses; r->nr_accesses = 0; + for (i = 0; i < DAMON_MAX_PROBES; i++) + r->probe_hits[i] = 0; damon_verify_reset_aggregated(r, c); } ti++; @@ -2046,6 +2288,37 @@ static void damos_walk_cancel(struct damon_ctx *ctx) mutex_unlock(&ctx->walk_control_lock); } +static void damos_charge_quota(struct damos_quota *quota, + unsigned long sz_region, unsigned long sz_applied) +{ + /* + * sz_applied could be bigger than sz_region, depending on ops + * implementation of the action, e.g., damos_pa_pageout(). Charge only + * the region size in the case. + */ + if (!quota->fail_charge_denom || sz_applied > sz_region) + quota->charged_sz += sz_region; + else + quota->charged_sz += sz_applied + mult_frac( + (sz_region - sz_applied), + quota->fail_charge_num, + quota->fail_charge_denom); +} + +static bool damos_quota_is_full(struct damos_quota *quota, + unsigned long min_region_sz) +{ + if (!damos_quota_is_set(quota)) + return false; + if (quota->charged_sz >= quota->esz) + return true; + /* + * DAMOS action is applied per region, so <min_region_sz remaining + * quota means the quota is effectively full. 
+ */ + return quota->esz - quota->charged_sz < min_region_sz; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -2102,11 +2375,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) { + damos_charge_quota(quota, sz, sz_applied); + if (damos_quota_is_full(quota, c->min_region_sz)) { quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; + quota->charge_addr_from = r->ar.end; } } if (s->action != DAMOS_STAT) @@ -2132,8 +2404,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Check the quota */ - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) continue; if (damos_skip_charged_region(t, r, s, c->min_region_sz)) @@ -2152,6 +2423,58 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } /* + * damos_apply_target() - Apply DAMOS schemes to a given target. + * @c: monitoring context to apply its DAMOS schemes to.. + * @t: monitoring target to apply the schemes to. + * @max_region_sz: maximum region size for @c. + * + * This function could split regions for keeping the quota. To minimize + * overhead from the split operations increased number of regions, this + * function will also merge regions after the schemes applying attempt is done, + * for each region. The merge operation is made only when it doesn't lose the + * monitoring information and not violating @max_region_sz. + * + * Hence, after this function is called, the total number of regions could + * be increased or reduced. The increase could make max_nr_regions temporarily + * be violated, until the next per-aggregation interval regions merge operation + * is executed. 
The decrease will not violate min_nr_regions though, since it + * keeps @max_region_sz. + */ +static void damos_apply_target(struct damon_ctx *c, struct damon_target *t, + unsigned long max_region_sz) +{ + struct damon_region *r; + + damon_for_each_region(r, t) { + struct damon_region *prev_r; + + damon_do_apply_schemes(c, t, r); + /* + * damon_do_apply_schemes() could split the region for the + * quota. Keeping the new slices is an overhead. Merge back + * the slices into the previous region if it doesn't lose any + * information and not violating the max_region_sz. + */ + if (damon_first_region(t) == r) + continue; + prev_r = damon_prev_region(r); + if (prev_r->ar.end != r->ar.start) + continue; + if (prev_r->age != r->age) + continue; + if (prev_r->last_nr_accesses != r->last_nr_accesses) + continue; + if (prev_r->nr_accesses != r->nr_accesses) + continue; + if (r->ar.end - prev_r->ar.start > max_region_sz) + continue; + prev_r->ar.end = r->ar.end; + damon_destroy_region(r, t); + r = prev_r; + } +} + +/* * damon_feed_loop_next_input() - get next input to achieve a target score. * @last_input The last input. * @score Current score that made with @last_input. @@ -2287,7 +2610,115 @@ static unsigned long damos_get_node_memcg_used_bp( numerator = i.totalram - used_pages; return mult_frac(numerator, 10000, i.totalram); } -#else + +#ifdef CONFIG_DAMON_PADDR +/* + * damos_calc_eligible_bytes() - Calculate raw eligible bytes per node. + * @c: The DAMON context. + * @s: The scheme. + * @nid: The target NUMA node id. + * @total: Output for total eligible bytes across all nodes. + * + * Iterates through each folio in eligible regions to accurately determine + * which node the memory resides on. Returns eligible bytes on the specified + * node and sets *total to the sum across all nodes. + * + * Note: This function requires damon_get_folio() from ops-common.c, which is + * only available when CONFIG_DAMON_PADDR is enabled.
It also requires the + * context to be using PADDR operations for meaningful results. + */ +static phys_addr_t damos_calc_eligible_bytes(struct damon_ctx *c, + struct damos *s, int nid, phys_addr_t *total) +{ + struct damon_target *t; + struct damon_region *r; + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible = 0; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + phys_addr_t addr, end_addr; + + if (!__damos_valid_target(r, s)) + continue; + + /* Convert from core address units to physical bytes */ + addr = (phys_addr_t)r->ar.start * c->addr_unit; + end_addr = (phys_addr_t)r->ar.end * c->addr_unit; + while (addr < end_addr) { + struct folio *folio; + phys_addr_t folio_start, folio_end; + phys_addr_t overlap_start, overlap_end; + phys_addr_t counted; + + folio = damon_get_folio(PHYS_PFN(addr)); + if (!folio) { + addr = PAGE_ALIGN_DOWN(addr + + PAGE_SIZE); + if (!addr) + break; + continue; + } + + /* + * Calculate exact overlap between the region + * [addr, end_addr) and the folio range. + * The folio may start before addr if addr is + * in the middle of a large folio. 
+ */ + folio_start = PFN_PHYS(folio_pfn(folio)); + folio_end = folio_start + folio_size(folio); + + overlap_start = max(addr, folio_start); + overlap_end = min(end_addr, folio_end); + + if (overlap_end > overlap_start) { + counted = overlap_end - overlap_start; + total_eligible += counted; + if (folio_nid(folio) == nid) + node_eligible += counted; + } + + /* Advance past the entire folio */ + addr = folio_end; + folio_put(folio); + } + cond_resched(); + } + } + + *total = total_eligible; + return node_eligible; +} + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + phys_addr_t total_eligible = 0; + phys_addr_t node_eligible; + + if (c->ops.id != DAMON_OPS_PADDR) + return 0; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)) + return 0; + + node_eligible = damos_calc_eligible_bytes(c, s, nid, &total_eligible); + + if (!(unsigned long)total_eligible) + return 0; + + return mult_frac((unsigned long)node_eligible, 10000, + (unsigned long)total_eligible); +} +#else /* CONFIG_DAMON_PADDR */ +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_DAMON_PADDR */ +#else /* CONFIG_NUMA */ static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { @@ -2299,7 +2730,13 @@ static unsigned long damos_get_node_memcg_used_bp( { return 0; } -#endif + +static unsigned long damos_get_node_eligible_mem_bp(struct damon_ctx *c, + struct damos *s, int nid) +{ + return 0; +} +#endif /* CONFIG_NUMA */ /* * Returns LRU-active or inactive memory to total LRU memory size ratio. 
@@ -2319,7 +2756,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio) return mult_frac(inactive, 10000, total); } -static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +static void damos_set_quota_goal_current_value(struct damon_ctx *c, + struct damos *s, struct damos_quota_goal *goal) { u64 now_psi_total; @@ -2345,19 +2783,24 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_in_active_mem_bp( goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->current_value = damos_get_node_eligible_mem_bp(c, s, + goal->nid); + break; default: break; } } /* Return the highest score since it makes schemes least aggressive */ -static unsigned long damos_quota_score(struct damos_quota *quota) +static unsigned long damos_quota_score(struct damon_ctx *c, struct damos *s) { struct damos_quota_goal *goal; + struct damos_quota *quota = &s->quota; unsigned long highest_score = 0; damos_for_each_quota_goal(goal, quota) { - damos_set_quota_goal_current_value(goal); + damos_set_quota_goal_current_value(c, s, goal); highest_score = max(highest_score, mult_frac(goal->current_value, 10000, goal->target_value)); @@ -2366,17 +2809,20 @@ static unsigned long damos_quota_score(struct damos_quota *quota) return highest_score; } -static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_consist(struct damon_ctx *c, struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + unsigned long score = damos_quota_score(c, s); quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), score); } -static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +static void damos_goal_tune_esz_bp_temporal(struct damon_ctx *c, + struct damos *s) { - unsigned long score = damos_quota_score(quota); + struct damos_quota *quota = &s->quota; + 
unsigned long score = damos_quota_score(c, s); if (score >= 10000) quota->esz_bp = 0; @@ -2389,9 +2835,9 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ -static void damos_set_effective_quota(struct damos_quota *quota, - struct damon_ctx *ctx) +static void damos_set_effective_quota(struct damon_ctx *ctx, struct damos *s) { + struct damos_quota *quota = &s->quota; unsigned long throughput; unsigned long esz = ULONG_MAX; @@ -2402,9 +2848,9 @@ static void damos_set_effective_quota(struct damos_quota *quota, if (!list_empty(&quota->goals)) { if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) - damos_goal_tune_esz_bp_consist(quota); + damos_goal_tune_esz_bp_consist(ctx, s); else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) - damos_goal_tune_esz_bp_temporal(quota); + damos_goal_tune_esz_bp_temporal(ctx, s); esz = quota->esz_bp / 10000; } @@ -2452,22 +2898,23 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* First charge window */ if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); + if (trace_damos_esz_enabled()) + damos_trace_esz(c, s, quota); } /* New charge window starts */ if (!time_in_range_open(jiffies, quota->charged_from, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { - if (damos_quota_is_set(quota) && - quota->charged_sz >= quota->esz) + if (damos_quota_is_full(quota, c->min_region_sz)) s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; if (trace_damos_esz_enabled()) cached_esz = quota->esz; - damos_set_effective_quota(quota, c); + damos_set_effective_quota(c, s); if (trace_damos_esz_enabled() && quota->esz != cached_esz) damos_trace_esz(c, s, quota); } @@ -2521,9 +2968,9 @@ static void damos_trace_stat(struct
damon_ctx *c, struct damos *s) static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r; struct damos *s; bool has_schemes_to_apply = false; + unsigned long max_region_sz; damon_for_each_scheme(s, c) { if (time_before(c->passed_sample_intervals, s->next_apply_sis)) @@ -2540,13 +2987,12 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + max_region_sz = damon_region_sz_limit(c); mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { if (c->ops.target_valid && c->ops.target_valid(t) == false) continue; - - damon_for_each_region(r, t) - damon_do_apply_schemes(c, t, r); + damos_apply_target(c, t, max_region_sz); } damon_for_each_scheme(s, c) { @@ -2582,12 +3028,17 @@ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); + int i; l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; + /* todo: do this for only installed probes */ + for (i = 0; i < DAMON_MAX_PROBES; i++) + l->probe_hits[i] = (l->probe_hits[i] * sz_l + r->probe_hits[i] + * sz_r) / (sz_l + sz_r); damon_verify_merge_two_regions(l, r); damon_destroy_region(r, t); } @@ -2710,13 +3161,16 @@ static void damon_split_region_at(struct damon_target *t, new->last_nr_accesses = r->last_nr_accesses; new->nr_accesses_bp = r->nr_accesses_bp; new->nr_accesses = r->nr_accesses; + /* todo: do this for only installed probes */ + memcpy(new->probe_hits, r->probe_hits, sizeof(r->probe_hits)); damon_insert_region(new, r, damon_next_region(r), t); } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs, - unsigned long min_region_sz) +static void 
damon_split_regions_of(struct damon_ctx *ctx, + struct damon_target *t, int nr_subs, + unsigned long min_region_sz) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2731,7 +3185,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs, * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ - sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_sub = ALIGN_DOWN(damon_rand(ctx, 1, 10) * sz_region / 10, min_region_sz); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) @@ -2772,7 +3226,8 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions, ctx->min_region_sz); + damon_split_regions_of(ctx, t, nr_subregions, + ctx->min_region_sz); last_nr_regions = nr_regions; } @@ -2857,6 +3312,37 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_ctx(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, c) { + struct damon_region *prev_r = NULL; + unsigned int nr_regions = 0; + + damon_for_each_region(r, t) { + WARN_ONCE(r->ar.start >= r->ar.end, + "region start (%lu) >= end (%lu)\n", + r->ar.start, r->ar.end); + WARN_ONCE(prev_r && prev_r->ar.end > r->ar.start, + "region overlap (%lu > %lu)\n", + prev_r->ar.end, r->ar.start); + prev_r = r; + nr_regions++; + } + WARN_ONCE(damon_nr_regions(t) != nr_regions, + "nr_regions mismatch: %u != %u\n", + damon_nr_regions(t), nr_regions); + } +} +#else +static void damon_verify_ctx(struct damon_ctx *c) +{ +} +#endif + /* * kdamond_call() - handle damon_call_control objects. * @ctx: The &struct damon_ctx of the kdamond. 
@@ -2872,6 +3358,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) struct damon_call_control *control, *next; LIST_HEAD(controls); + damon_verify_ctx(ctx); + mutex_lock(&ctx->call_controls_lock); list_splice_tail_init(&ctx->call_controls, &controls); mutex_unlock(&ctx->call_controls_lock); @@ -2997,6 +3485,8 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); + if (ctx->ops.apply_probes) + ctx->ops.apply_probes(ctx); if (time_after_eq(ctx->passed_sample_intervals, next_aggregation_sis)) { @@ -3014,6 +3504,14 @@ static int kdamond_fn(void *data) kdamond_call(ctx, false); if (ctx->maybe_corrupted) break; + while (ctx->pause) { + damos_walk_cancel(ctx); + kdamond_usleep(ctx->attrs.sample_interval); + /* allow caller unset pause via damon_call() */ + kdamond_call(ctx, false); + if (kdamond_need_stop(ctx) || ctx->maybe_corrupted) + goto done; + } if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else @@ -3096,14 +3594,20 @@ done: return 0; } -static int walk_system_ram(struct resource *res, void *arg) +struct damon_system_ram_range_walk_arg { + bool walked; + struct resource res; +}; + +static int damon_system_ram_walk_fn(struct resource *res, void *arg) { - struct resource *a = arg; + struct damon_system_ram_range_walk_arg *a = arg; - if (resource_size(a) < resource_size(res)) { - a->start = res->start; - a->end = res->end; + if (!a->walked) { + a->walked = true; + a->res.start = res->start; } + a->res.end = res->end; return 0; } @@ -3120,27 +3624,24 @@ static unsigned long damon_res_to_core_addr(resource_size_t ra, return ra / addr_unit; } -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. 
- */ -static bool damon_find_biggest_system_ram(unsigned long *start, +static bool damon_find_system_rams_range(unsigned long *start, unsigned long *end, unsigned long addr_unit) - { - struct resource res = {}; + struct damon_system_ram_range_walk_arg arg = {}; - walk_system_ram_res(0, -1, &res, walk_system_ram); - *start = damon_res_to_core_addr(res.start, addr_unit); - *end = damon_res_to_core_addr(res.end + 1, addr_unit); + walk_system_ram_res(0, -1, &arg, damon_system_ram_walk_fn); + if (!arg.walked) + return false; + *start = damon_res_to_core_addr(arg.res.start, addr_unit); + *end = damon_res_to_core_addr(arg.res.end + 1, addr_unit); if (*end <= *start) return false; return true; } /** - * damon_set_region_biggest_system_ram_default() - Set the region of the given - * monitoring target as requested, or biggest 'System RAM'. + * damon_set_region_system_rams_default() - Set the region of the given + * monitoring target as requested, or to cover all 'System RAM' resources. * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. @@ -3148,14 +3649,14 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the - * values of @start and @end are zero, however, this function finds the biggest - * 'System RAM' resource and sets the region to cover the resource. In the - * latter case, this function saves the start and end addresses of the resource - * in @start and @end, respectively. + * values of @start and @end are zero, however, this function finds 'System + * RAM' resources and sets the region to cover all the resources. In the latter + * case, this function saves the start and the end addresses of the first and + * the last resources in @start and @end, respectively. * * Return: 0 on success, negative error code otherwise. 
*/ -int damon_set_region_biggest_system_ram_default(struct damon_target *t, +int damon_set_region_system_rams_default(struct damon_target *t, unsigned long *start, unsigned long *end, unsigned long addr_unit, unsigned long min_region_sz) { @@ -3165,7 +3666,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end, addr_unit)) + !damon_find_system_rams_range(start, end, addr_unit)) return -EINVAL; addr_range.start = *start; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8cfe7bd3dc1d..8298c6001fd0 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * the re-reading, DAMON_LRU_SORT will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Desired active to [in]active memory ratio in bp (1/10,000). @@ -140,7 +139,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -149,7 +149,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_LRU_SORT will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. 
*/ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -285,6 +286,11 @@ static int damon_lru_sort_apply_parameters(void) param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + if (!is_power_of_2(param_ctx->min_region_sz)) { + err = -EINVAL; + goto out; + } + if (!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; goto out; @@ -327,7 +333,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) goto out; - err = damon_set_region_biggest_system_ram_default(param_target, + err = damon_set_region_system_rams_default(param_target, &monitor_region_start, &monitor_region_end, param_ctx->addr_unit, @@ -340,18 +346,51 @@ out: return err; } -static int damon_lru_sort_handle_commit_inputs(void) +static int damon_lru_sort_commit_inputs_fn(void *arg) +{ + return damon_lru_sort_apply_parameters(); +} + +static int damon_lru_sort_commit_inputs_store(const char *val, + const struct kernel_param *kp) { + bool commit_inputs_request; int err; + struct damon_call_control control = { + .fn = damon_lru_sort_commit_inputs_fn, + }; + + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } - if (!commit_inputs) + if (!commit_inputs_request) return 0; - err = damon_lru_sort_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? 
err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_lru_sort_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_lru_sort_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -365,7 +404,7 @@ static int damon_lru_sort_damon_call_fn(void *arg) damon_lru_sort_cold_stat = s->stat; } - return damon_lru_sort_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index c3e4c871b0bb..5c93ef2bb8a9 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -117,9 +117,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, damon_max_nr_accesses(&c->attrs); age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; + if (age_in_sec) + age_in_log = min_t(int, ilog2(age_in_sec) + 1, + DAMON_MAX_AGE_IN_LOG); + else + age_in_log = 0; + /* If frequency is 0, higher age means it's colder */ if (freq_subscore == 0) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5cdcc5037cbc..d0598f5f2688 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -49,11 +49,11 @@ static void damon_pa_mkold(phys_addr_t paddr) } static void __damon_pa_prepare_access_check(struct damon_region *r, - unsigned long addr_unit) + struct damon_ctx *ctx) { - r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end); - damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit)); + damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, ctx->addr_unit)); } static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) @@ -63,7 +63,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) 
damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(r, ctx->addr_unit); + __damon_pa_prepare_access_check(r, ctx); } } @@ -120,6 +120,81 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } +static bool damon_pa_filter_match(struct damon_filter *filter, + struct folio *folio) +{ + bool matched = false; + struct mem_cgroup *memcg; + + switch (filter->type) { + case DAMON_FILTER_TYPE_ANON: + if (!folio) { + matched = false; + break; + } + matched = folio_test_anon(folio); + break; + case DAMON_FILTER_TYPE_MEMCG: + if (!folio) { + matched = false; + break; + } + rcu_read_lock(); + memcg = folio_memcg_check(folio); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + default: + break; + } + return matched == filter->matching; +} + +static bool damon_pa_filter_pass(phys_addr_t pa, struct folio *folio, + struct damon_probe *p) +{ + struct damon_filter *f; + bool pass = true; + + damon_for_each_filter(f, p) { + if (damon_pa_filter_match(f, folio)) { + pass = f->allow; + break; + } + pass = !f->allow; + } + return pass; +} + +static void damon_pa_apply_probes(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + struct damon_probe *p; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + int i = 0; + phys_addr_t pa; + struct folio *folio; + + pa = damon_pa_phys_addr(r->sampling_addr, + ctx->addr_unit); + folio = damon_get_folio(PHYS_PFN(pa)); + damon_for_each_probe(p, ctx) { + if (damon_pa_filter_pass(pa, folio, p)) + r->probe_hits[i]++; + i++; + } + if (folio) + folio_put(folio); + } + } +} + /* * damos_pa_filter_out - Return true if the page should be filtered out. 
*/ @@ -371,6 +446,7 @@ static int __init damon_pa_initcall(void) .update = NULL, .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, + .apply_probes = damon_pa_apply_probes, .target_valid = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 96f6dfc28eae..ce4499cf4b8b 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -39,7 +39,6 @@ static bool enabled __read_mostly; * re-reading, DAMON_RECLAIM will be disabled. */ static bool commit_inputs __read_mostly; -module_param(commit_inputs, bool, 0600); /* * Time threshold for cold memory regions identification in microseconds. @@ -92,6 +91,20 @@ module_param(quota_mem_pressure_us, ulong, 0600); static unsigned long quota_autotune_feedback __read_mostly; module_param(quota_autotune_feedback, ulong, 0600); +/* + * Auto-tune monitoring intervals. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM automatically tunes DAMON's + * sampling and aggregation intervals. The auto-tuning aims to capture + * meaningful amount of access events in each DAMON-snapshot, while keeping the + * sampling intervals 5 milliseconds in minimum, and 10 seconds in maximum. + * Setting this as ``N`` disables the auto-tuning. + * + * Disabled by default. + */ +static bool autotune_monitoring_intervals __read_mostly; +module_param(autotune_monitoring_intervals, bool, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -114,7 +127,8 @@ DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); * Start of the target memory region in physical address. * * The start physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. 
*/ static unsigned long monitor_region_start __read_mostly; module_param(monitor_region_start, ulong, 0600); @@ -123,7 +137,8 @@ module_param(monitor_region_start, ulong, 0600); * End of the target memory region in physical address. * * The end physical address of memory region that DAMON_RECLAIM will do work - * against. By default, biggest System RAM is used as the region. + * against. By default, the system's entire physical memory is used as the + * region. */ static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); @@ -151,7 +166,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, static struct damon_ctx *ctx; static struct damon_target *target; -static struct damos *damon_reclaim_new_scheme(void) +static struct damos *damon_reclaim_new_scheme(unsigned long aggr_interval) { struct damos_access_pattern pattern = { /* Find regions having PAGE_SIZE or larger size */ @@ -161,8 +176,7 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / - damon_reclaim_mon_attrs.aggr_interval, + .min_age_region = min_age / aggr_interval, .max_age_region = UINT_MAX, }; @@ -183,6 +197,7 @@ static int damon_reclaim_apply_parameters(void) { struct damon_ctx *param_ctx; struct damon_target *param_target; + struct damon_attrs attrs; struct damos *scheme; struct damos_quota_goal *goal; struct damos_filter *filter; @@ -195,17 +210,31 @@ static int damon_reclaim_apply_parameters(void) param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); + if (!is_power_of_2(param_ctx->min_region_sz)) { + err = -EINVAL; + goto out; + } + if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; goto out; } - err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); + attrs = damon_reclaim_mon_attrs; + if (autotune_monitoring_intervals) { + attrs.sample_interval = 5000; + 
attrs.aggr_interval = 100000; + attrs.intervals_goal.access_bp = 40; + attrs.intervals_goal.aggrs = 3; + attrs.intervals_goal.min_sample_us = 5000; + attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000; + } + err = damon_set_attrs(param_ctx, &attrs); if (err) goto out; err = -ENOMEM; - scheme = damon_reclaim_new_scheme(); + scheme = damon_reclaim_new_scheme(attrs.aggr_interval); if (!scheme) goto out; damon_set_schemes(param_ctx, &scheme, 1); @@ -233,11 +262,9 @@ static int damon_reclaim_apply_parameters(void) damos_add_filter(scheme, filter); } - err = damon_set_region_biggest_system_ram_default(param_target, - &monitor_region_start, - &monitor_region_end, - param_ctx->addr_unit, - param_ctx->min_region_sz); + err = damon_set_region_system_rams_default(param_target, + &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); @@ -246,18 +273,51 @@ out: return err; } -static int damon_reclaim_handle_commit_inputs(void) +static int damon_reclaim_commit_inputs_fn(void *arg) { + return damon_reclaim_apply_parameters(); +} + +static int damon_reclaim_commit_inputs_store(const char *val, + const struct kernel_param *kp) +{ + bool commit_inputs_request; int err; + struct damon_call_control control = { + .fn = damon_reclaim_commit_inputs_fn, + }; - if (!commit_inputs) + if (!val) { + commit_inputs_request = true; + } else { + err = kstrtobool(val, &commit_inputs_request); + if (err) + return err; + } + + if (!commit_inputs_request) return 0; - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - return err; + /* + * Skip damon_call() if ctx is not initialized to avoid + * NULL pointer dereference. + */ + if (!ctx) + return -EINVAL; + + err = damon_call(ctx, &control); + + return err ? 
err : control.return_code; } +static const struct kernel_param_ops commit_inputs_param_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = damon_reclaim_commit_inputs_store, + .get = param_get_bool, +}; + +module_param_cb(commit_inputs, &commit_inputs_param_ops, &commit_inputs, 0600); + static int damon_reclaim_damon_call_fn(void *arg) { struct damon_ctx *c = arg; @@ -267,7 +327,7 @@ static int damon_reclaim_damon_call_fn(void *arg) damon_for_each_scheme(s, c) damon_reclaim_stat = s->stat; - return damon_reclaim_handle_commit_inputs(); + return 0; } static struct damon_call_control call_control = { diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 3951b762cbdd..0e14f5bb8f75 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -148,59 +148,12 @@ static int damon_stat_damon_call_fn(void *data) return 0; } -struct damon_stat_system_ram_range_walk_arg { - bool walked; - struct resource res; -}; - -static int damon_stat_system_ram_walk_fn(struct resource *res, void *arg) -{ - struct damon_stat_system_ram_range_walk_arg *a = arg; - - if (!a->walked) { - a->walked = true; - a->res.start = res->start; - } - a->res.end = res->end; - return 0; -} - -static unsigned long damon_stat_res_to_core_addr(resource_size_t ra, - unsigned long addr_unit) -{ - /* - * Use div_u64() for avoiding linking errors related with __udivdi3, - * __aeabi_uldivmod, or similar problems. This should also improve the - * performance optimization (read div_u64() comment for the detail). 
- */ - if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) - return div_u64(ra, addr_unit); - return ra / addr_unit; -} - -static int damon_stat_set_monitoring_region(struct damon_target *t, - unsigned long addr_unit, unsigned long min_region_sz) -{ - struct damon_addr_range addr_range; - struct damon_stat_system_ram_range_walk_arg arg = {}; - - walk_system_ram_res(0, -1, &arg, damon_stat_system_ram_walk_fn); - if (!arg.walked) - return -EINVAL; - addr_range.start = damon_stat_res_to_core_addr( - arg.res.start, addr_unit); - addr_range.end = damon_stat_res_to_core_addr( - arg.res.end + 1, addr_unit); - if (addr_range.end <= addr_range.start) - return -EINVAL; - return damon_set_regions(t, &addr_range, 1, min_region_sz); -} - static struct damon_ctx *damon_stat_build_ctx(void) { struct damon_ctx *ctx; struct damon_attrs attrs; struct damon_target *target; + unsigned long start = 0, end = 0; ctx = damon_new_ctx(); if (!ctx) @@ -230,8 +183,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_stat_set_monitoring_region(target, ctx->addr_unit, - ctx->min_region_sz)) + if (damon_set_region_system_rams_default(target, &start, &end, + ctx->addr_unit, ctx->min_region_sz)) goto free_out; return ctx; free_out: @@ -313,6 +266,45 @@ static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp) return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N'); } +static int damon_stat_kdamond_pid_store( + const char *val, const struct kernel_param *kp) +{ + /* + * kdamond_pid is read-only, but kernel command line could write it. + * Do nothing here. 
+ */ + return 0; +} + +static int damon_stat_kdamond_pid_load( + char *buffer, const struct kernel_param *kp) +{ + int pid; + + if (!damon_stat_context) { + pid = -1; + } else { + pid = damon_kdamond_pid(damon_stat_context); + if (pid < 1) + pid = -1; + } + return sprintf(buffer, "%d\n", pid); +} + +static const struct kernel_param_ops kdamond_pid_param_ops = { + .set = damon_stat_kdamond_pid_store, + .get = damon_stat_kdamond_pid_load, +}; + +/* + * PID of the DAMON thread + * + * If DAMON_STAT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400); +MODULE_PARM_DESC(kdamond_pid, "pid of the kdamond"); + static int __init damon_stat_init(void) { int err = 0; diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 83e24a9b5a0d..bdc6ae2639e4 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -104,3 +104,44 @@ const struct kobj_type damon_sysfs_ul_range_ktype = { .default_groups = damon_sysfs_ul_range_groups, }; + +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip offlined memcg */ + if (!mem_cgroup_online(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + mem_cgroup_iter_break(NULL, memcg); + break; + } + } + + kfree(path); + return found ? 
0 : -EINVAL; +} diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 2099adee11d0..3079306966a9 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -59,3 +59,5 @@ int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, void damos_sysfs_update_effective_quotas( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a8014780edae..329cfd0bbe9f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -11,6 +11,140 @@ #include "sysfs-common.h" /* + * probe directory + */ + +struct damos_sysfs_probe { + struct kobject kobj; + unsigned char hits; +}; + +static struct damos_sysfs_probe *damos_sysfs_probe_alloc(unsigned char hits) +{ + struct damos_sysfs_probe *probe; + + probe = kzalloc_obj(*probe); + if (!probe) + return NULL; + probe->hits = hits; + return probe; +} + +static ssize_t hits_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damos_sysfs_probe *probe = container_of(kobj, + struct damos_sysfs_probe, kobj); + + return sysfs_emit(buf, "%hhu\n", probe->hits); +} + +static void damos_sysfs_probe_release(struct kobject *kobj) +{ + struct damos_sysfs_probe *probe = container_of(kobj, + struct damos_sysfs_probe, kobj); + + kfree(probe); +} + +static struct kobj_attribute damos_sysfs_probe_hits_attr = + __ATTR_RO_MODE(hits, 0400); + +static struct attribute *damos_sysfs_probe_attrs[] = { + &damos_sysfs_probe_hits_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damos_sysfs_probe); + +static const struct kobj_type damos_sysfs_probe_ktype = { + .release = damos_sysfs_probe_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damos_sysfs_probe_groups, +}; + +/* + * probes directory + */ + +struct damos_sysfs_probes { + struct kobject kobj; + struct damos_sysfs_probe **probes_arr; + int nr; +}; + +static struct damos_sysfs_probes 
*damos_sysfs_probes_alloc(void) +{ + return kzalloc_obj(struct damos_sysfs_probes); +} + +static void damos_sysfs_probes_rm_dirs(struct damos_sysfs_probes *probes) +{ + struct damos_sysfs_probe **probes_arr = probes->probes_arr; + int i; + + for (i = 0; i < probes->nr; i++) + kobject_put(&probes_arr[i]->kobj); + probes->nr = 0; + kfree(probes_arr); + probes->probes_arr = NULL; +} + +static int damos_sysfs_probes_add_dirs(struct damos_sysfs_probes *probes, + struct damon_ctx *ctx, struct damon_region *region) +{ + struct damon_probe *probe; + struct damos_sysfs_probe **probes_arr; + int i = 0; + + damon_for_each_probe(probe, ctx) + i++; + + if (!i) + return 0; + + probes_arr = kmalloc_objs(*probes_arr, i); + if (!probes_arr) + return -ENOMEM; + probes->probes_arr = probes_arr; + + i = 0; + damon_for_each_probe(probe, ctx) { + struct damos_sysfs_probe *sys_probe; + int err; + + sys_probe = damos_sysfs_probe_alloc(region->probe_hits[i]); + if (!sys_probe) { + damos_sysfs_probes_rm_dirs(probes); + return -ENOMEM; + } + err = kobject_init_and_add(&sys_probe->kobj, + &damos_sysfs_probe_ktype, &probes->kobj, "%d", + i); + if (err) { + kobject_put(&sys_probe->kobj); + damos_sysfs_probes_rm_dirs(probes); + return err; + } + probes_arr[i++] = sys_probe; + probes->nr++; + } + return 0; +} + +static void damos_sysfs_probes_release(struct kobject *kobj) +{ + struct damos_sysfs_probes *probes = container_of(kobj, + struct damos_sysfs_probes, kobj); + + kfree(probes); +} + +static const struct kobj_type damos_sysfs_probes_ktype = { + .release = damos_sysfs_probes_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* * scheme region directory */ @@ -20,6 +154,7 @@ struct damon_sysfs_scheme_region { unsigned int nr_accesses; unsigned int age; unsigned long sz_filter_passed; + struct damos_sysfs_probes *probes; struct list_head list; }; @@ -34,10 +169,44 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( sysfs_region->ar = region->ar; sysfs_region->nr_accesses 
= region->nr_accesses_bp / 10000; sysfs_region->age = region->age; + sysfs_region->probes = NULL; INIT_LIST_HEAD(&sysfs_region->list); return sysfs_region; } +static int damos_sysfs_region_add_dirs( + struct damon_sysfs_scheme_region *region, + struct damon_ctx *ctx, + struct damon_region *dregion) +{ + struct damos_sysfs_probes *probes = damos_sysfs_probes_alloc(); + int err; + + if (!probes) + return -ENOMEM; + err = kobject_init_and_add(&probes->kobj, &damos_sysfs_probes_ktype, + &region->kobj, "probes"); + if (err) + goto fail; + err = damos_sysfs_probes_add_dirs(probes, ctx, dregion); + if (err) + goto fail; + + region->probes = probes; + return 0; + +fail: + kobject_put(&probes->kobj); + return err; +} + +static void damos_sysfs_region_rm_dirs( + struct damon_sysfs_scheme_region *region) +{ + damos_sysfs_probes_rm_dirs(region->probes); + kobject_put(&region->probes->kobj); +} + static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -163,6 +332,7 @@ static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_region *r, *next; list_for_each_entry_safe(r, next, &regions->regions_list, list) { + damos_sysfs_region_rm_dirs(r); list_del(&r->list); kobject_put(&r->kobj); regions->nr_regions--; @@ -1093,6 +1263,10 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_INACTIVE_MEM_BP, .name = "inactive_mem_bp", }, + { + .metric = DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP, + .name = "node_eligible_mem_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -1508,6 +1682,8 @@ struct damon_sysfs_quotas { unsigned long reset_interval_ms; unsigned long effective_sz; /* Effective size quota in bytes */ enum damos_quota_goal_tuner goal_tuner; + unsigned int fail_charge_num; + unsigned int fail_charge_denom; }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1682,6 +1858,48 @@ static ssize_t goal_tuner_store(struct kobject *kobj, return -EINVAL; } 
+static 
ssize_t fail_charge_num_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_num); +} + +static ssize_t fail_charge_num_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_num); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t fail_charge_denom_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%u\n", quotas->fail_charge_denom); +} + +static ssize_t fail_charge_denom_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtouint(buf, 0, &quotas->fail_charge_denom); + + if (err) + return -EINVAL; + return count; +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1702,12 +1920,20 @@ static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr = __ATTR_RW_MODE(goal_tuner, 0600); +static struct kobj_attribute damon_sysfs_quotas_fail_charge_num_attr = + __ATTR_RW_MODE(fail_charge_num, 0600); + +static struct kobj_attribute damon_sysfs_quotas_fail_charge_denom_attr = + __ATTR_RW_MODE(fail_charge_denom, 0600); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, &damon_sysfs_quotas_effective_bytes_attr.attr, &damon_sysfs_quotas_goal_tuner_attr.attr, + 
&damon_sysfs_quotas_fail_charge_num_attr.attr, + &damon_sysfs_quotas_fail_charge_denom_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -2061,6 +2287,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = { .name = "nohugepage", }, { + .action = DAMOS_COLLAPSE, + .name = "collapse", + }, + { .action = DAMOS_LRU_PRIO, .name = "lru_prio", }, @@ -2561,47 +2791,6 @@ const struct kobj_type damon_sysfs_schemes_ktype = { .default_groups = damon_sysfs_schemes_groups, }; -static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, - char *memcg_path_buf, char *path) -{ -#ifdef CONFIG_MEMCG - cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); - if (sysfs_streq(memcg_path_buf, path)) - return true; -#endif /* CONFIG_MEMCG */ - return false; -} - -static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) -{ - struct mem_cgroup *memcg; - char *path; - bool found = false; - - if (!memcg_path) - return -EINVAL; - - path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); - if (!path) - return -ENOMEM; - - for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; - memcg = mem_cgroup_iter(NULL, memcg, NULL)) { - /* skip offlined memcg */ - if (!mem_cgroup_online(memcg)) - continue; - if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { - *id = mem_cgroup_id(memcg); - found = true; - mem_cgroup_iter_break(NULL, memcg); - break; - } - } - - kfree(path); - return found ? 
0 : -EINVAL; -} - static int damon_sysfs_add_scheme_filters(struct damos *scheme, struct damon_sysfs_scheme_filters *sysfs_filters) { @@ -2685,6 +2874,9 @@ static int damos_sysfs_add_quota_score( } goal->nid = sysfs_goal->nid; break; + case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP: + goal->nid = sysfs_goal->nid; + break; default: break; } @@ -2796,6 +2988,8 @@ static struct damos *damon_sysfs_mk_scheme( .weight_nr_accesses = sysfs_weights->nr_accesses, .weight_age = sysfs_weights->age, .goal_tuner = sysfs_quotas->goal_tuner, + .fail_charge_num = sysfs_quotas->fail_charge_num, + .fail_charge_denom = sysfs_quotas->fail_charge_denom, }; struct damos_watermarks wmarks = { .metric = sysfs_wmarks->metric, @@ -2930,12 +3124,17 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, if (kobject_init_and_add(&region->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - sysfs_regions->nr_regions++)) { - kobject_put(&region->kobj); - return; - } + sysfs_regions->nr_regions)) + goto out; + if (damos_sysfs_region_add_dirs(region, ctx, r)) + goto out; + list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; + return; + +out: + kobject_put(&region->kobj); } int damon_sysfs_schemes_clear_regions( diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index eefa959aa30a..2e95e3bac774 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -748,6 +748,497 @@ static const struct kobj_type damon_sysfs_intervals_ktype = { }; /* + * filter directory + */ + +struct damon_sysfs_filter { + struct kobject kobj; + enum damon_filter_type type; + bool matching; + bool allow; + char *path; +}; + +static struct damon_sysfs_filter *damon_sysfs_filter_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_filter); +} + +struct damon_sysfs_filter_type_name { + enum damon_filter_type type; + char *name; +}; + +static const struct damon_sysfs_filter_type_name +damon_sysfs_filter_type_names[] = { + { + .type = DAMON_FILTER_TYPE_ANON, + .name = 
"anon", + }, + { + .type = DAMON_FILTER_TYPE_MEMCG, + .name = "memcg", + }, +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) { + const struct damon_sysfs_filter_type_name *type_name; + + type_name = &damon_sysfs_filter_type_names[i]; + if (type_name->type == filter->type) + return sysfs_emit(buf, "%s\n", type_name->name); + } + return -EINVAL; +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + ssize_t ret = -EINVAL; + int i; + + for (i = 0; i < ARRAY_SIZE(damon_sysfs_filter_type_names); i++) { + const struct damon_sysfs_filter_type_name *type_name; + + type_name = &damon_sysfs_filter_type_names[i]; + if (sysfs_streq(buf, type_name->name)) { + filter->type = type_name->type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 
'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + +static ssize_t path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + int len; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + len = sysfs_emit(buf, "%s\n", filter->path ? filter->path : ""); + mutex_unlock(&damon_sysfs_lock); + return len; +} + +static ssize_t path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + char *path = kmalloc_objs(*path, size_add(count, 1)); + + if (!path) + return -ENOMEM; + strscpy(path, buf, size_add(count, 1)); + if (!mutex_trylock(&damon_sysfs_lock)) { + kfree(path); + return -EBUSY; + } + kfree(filter->path); + filter->path = path; + mutex_unlock(&damon_sysfs_lock); + return count; +} + +static void damon_sysfs_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_filter *filter = container_of(kobj, + struct damon_sysfs_filter, kobj); + + kfree(filter->path); + kfree(filter); +} + +static struct kobj_attribute damon_sysfs_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + +static struct kobj_attribute damon_sysfs_filter_path_attr = + __ATTR_RW_MODE(path, 0600); + +static struct attribute *damon_sysfs_filter_attrs[] = { + &damon_sysfs_filter_type_attr.attr, + &damon_sysfs_filter_matching_attr.attr, + 
&damon_sysfs_filter_allow_attr.attr, + &damon_sysfs_filter_path_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_filter); + +static const struct kobj_type damon_sysfs_filter_ktype = { + .release = damon_sysfs_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_filter_groups, +}; + +/* + * filters directory + */ + +struct damon_sysfs_filters { + struct kobject kobj; + struct damon_sysfs_filter **filters_arr; + int nr; +}; + +static struct damon_sysfs_filters *damon_sysfs_filters_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_filters); +} + +static void damon_sysfs_filters_rm_dirs(struct damon_sysfs_filters *filters) +{ + struct damon_sysfs_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_filters_add_dirs( + struct damon_sysfs_filters *filters, int nr_filters) +{ + struct damon_sysfs_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_objs(*filters_arr, nr_filters, + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_filter_alloc(); + if (!filter) { + damon_sysfs_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_filter_ktype, &filters->kobj, + "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_filters *filters = container_of(kobj, + struct damon_sysfs_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t 
nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_filters *filters; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + filters = container_of(kobj, struct damon_sysfs_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_filters, kobj)); +} + +static struct kobj_attribute damon_sysfs_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + +static struct attribute *damon_sysfs_filters_attrs[] = { + &damon_sysfs_filters_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_filters); + +static const struct kobj_type damon_sysfs_filters_ktype = { + .release = damon_sysfs_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_filters_groups, +}; + +/* + * probe directory + */ + +struct damon_sysfs_probe { + struct kobject kobj; + struct damon_sysfs_filters *filters; +}; + +static struct damon_sysfs_probe *damon_sysfs_probe_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_probe); +} + +static int damon_sysfs_probe_add_dirs(struct damon_sysfs_probe *attr) +{ + struct damon_sysfs_filters *filters; + int err; + + filters = damon_sysfs_filters_alloc(); + if (!filters) + return -ENOMEM; + attr->filters = filters; + + err = kobject_init_and_add(&filters->kobj, &damon_sysfs_filters_ktype, + &attr->kobj, "filters"); + if (err) { + kobject_put(&filters->kobj); + attr->filters = NULL; + } + return err; +} + +static void damon_sysfs_probe_rm_dirs(struct damon_sysfs_probe *attr) +{ + if (attr->filters) { + damon_sysfs_filters_rm_dirs(attr->filters); + kobject_put(&attr->filters->kobj); + } +} + +static void damon_sysfs_probe_release(struct kobject *kobj) +{ + 
kfree(container_of(kobj, struct damon_sysfs_probe, kobj)); +} + +static struct attribute *damon_sysfs_probe_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_probe); + +static const struct kobj_type damon_sysfs_probe_ktype = { + .release = damon_sysfs_probe_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_probe_groups, +}; + +/* + * probes directory + */ + +struct damon_sysfs_probes { + struct kobject kobj; + struct damon_sysfs_probe **probes_arr; + int nr; +}; + +static struct damon_sysfs_probes *damon_sysfs_probes_alloc(void) +{ + return kzalloc_obj(struct damon_sysfs_probes); +} + +static void damon_sysfs_probes_rm_dirs( + struct damon_sysfs_probes *probes) +{ + struct damon_sysfs_probe **probes_arr = probes->probes_arr; + int i; + + for (i = 0; i < probes->nr; i++) { + damon_sysfs_probe_rm_dirs(probes_arr[i]); + kobject_put(&probes_arr[i]->kobj); + } + probes->nr = 0; + kfree(probes_arr); + probes->probes_arr = NULL; +} + +static int damon_sysfs_probes_add_dirs( + struct damon_sysfs_probes *probes, int nr_probes) +{ + struct damon_sysfs_probe **probes_arr, *probe; + int err, i; + + damon_sysfs_probes_rm_dirs(probes); + if (!nr_probes) + return 0; + + probes_arr = kmalloc_objs(*probes_arr, nr_probes, + GFP_KERNEL | __GFP_NOWARN); + if (!probes_arr) + return -ENOMEM; + probes->probes_arr = probes_arr; + + for (i = 0; i < nr_probes; i++) { + probe = damon_sysfs_probe_alloc(); + if (!probe) { + damon_sysfs_probes_rm_dirs(probes); + return -ENOMEM; + } + + err = kobject_init_and_add(&probe->kobj, + &damon_sysfs_probe_ktype, &probes->kobj, + "%d", i); + if (err) { + kobject_put(&probe->kobj); + damon_sysfs_probes_rm_dirs(probes); + return err; + } + + err = damon_sysfs_probe_add_dirs(probe); + if (err) { + kobject_put(&probe->kobj); + damon_sysfs_probes_rm_dirs(probes); + return err; + } + + probes_arr[i] = probe; + probes->nr++; + } + return 0; +} + +static ssize_t nr_probes_show(struct kobject *kobj, + struct kobj_attribute *attr, char 
*buf) +{ + struct damon_sysfs_probes *probes = container_of(kobj, + struct damon_sysfs_probes, kobj); + + return sysfs_emit(buf, "%d\n", probes->nr); +} + +static ssize_t nr_probes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_probes *probes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0 || nr > DAMON_MAX_PROBES) + return -EINVAL; + + probes = container_of(kobj, struct damon_sysfs_probes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_probes_add_dirs(probes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_probes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_probes, kobj)); +} + +static struct kobj_attribute damon_sysfs_probes_nr_probes = + __ATTR_RW_MODE(nr_probes, 0600); + +static struct attribute *damon_sysfs_probes_attrs[] = { + &damon_sysfs_probes_nr_probes.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_probes); + +static const struct kobj_type damon_sysfs_probes_ktype = { + .release = damon_sysfs_probes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_probes_groups, +}; + +/* * monitoring_attrs directory */ @@ -755,6 +1246,7 @@ struct damon_sysfs_attrs { struct kobject kobj; struct damon_sysfs_intervals *intervals; struct damon_sysfs_ul_range *nr_regions_range; + struct damon_sysfs_probes *probes; }; static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) @@ -771,6 +1263,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) { struct damon_sysfs_intervals *intervals; struct damon_sysfs_ul_range *nr_regions_range; + struct damon_sysfs_probes *probes; int err; intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); @@ -799,8 +1292,22 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) if (err) goto put_nr_regions_intervals_out; 
attrs->nr_regions_range = nr_regions_range; + + probes = damon_sysfs_probes_alloc(); + if (!probes) { + err = -ENOMEM; + goto put_nr_regions_intervals_out; + } + err = kobject_init_and_add(&probes->kobj, + &damon_sysfs_probes_ktype, &attrs->kobj, "probes"); + if (err) + goto put_probes_out; + attrs->probes = probes; return 0; +put_probes_out: + kobject_put(&probes->kobj); + attrs->probes = NULL; put_nr_regions_intervals_out: kobject_put(&nr_regions_range->kobj); attrs->nr_regions_range = NULL; @@ -817,6 +1324,8 @@ static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) kobject_put(&attrs->nr_regions_range->kobj); damon_sysfs_intervals_rm_dirs(attrs->intervals); kobject_put(&attrs->intervals->kobj); + damon_sysfs_probes_rm_dirs(attrs->probes); + kobject_put(&attrs->probes->kobj); } static void damon_sysfs_attrs_release(struct kobject *kobj) @@ -866,6 +1375,7 @@ struct damon_sysfs_context { struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; + bool pause; }; static struct damon_sysfs_context *damon_sysfs_context_alloc( @@ -878,6 +1388,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( context->kobj = (struct kobject){}; context->ops_id = ops_id; context->addr_unit = 1; + context->pause = false; return context; } @@ -1053,6 +1564,30 @@ static ssize_t addr_unit_store(struct kobject *kobj, return count; } +static ssize_t pause_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%c\n", context->pause ? 
'Y' : 'N'); +} + +static ssize_t pause_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + bool pause; + int err = kstrtobool(buf, &pause); + + if (err) + return err; + context->pause = pause; + return count; +} + + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1067,10 +1602,14 @@ static struct kobj_attribute damon_sysfs_context_operations_attr = static struct kobj_attribute damon_sysfs_context_addr_unit_attr = __ATTR_RW_MODE(addr_unit, 0600); +static struct kobj_attribute damon_sysfs_context_pause_attr = + __ATTR_RW_MODE(pause, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, &damon_sysfs_context_addr_unit_attr.attr, + &damon_sysfs_context_pause_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1360,6 +1899,51 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, return damon_set_attrs(ctx, &attrs); } +static int damon_sysfs_set_probes(struct damon_ctx *ctx, + struct damon_sysfs_probes *sys_probes) +{ + int i; + + for (i = 0; i < sys_probes->nr; i++) { + struct damon_sysfs_filters *sys_filters = + sys_probes->probes_arr[i]->filters; + struct damon_probe *c; + int j; + + if (!sys_filters) + continue; + c = damon_new_probe(); + if (!c) + return -ENOMEM; + damon_add_probe(ctx, c); + + for (j = 0; j < sys_filters->nr; j++) { + struct damon_sysfs_filter *sys_filter = + sys_filters->filters_arr[j]; + struct damon_filter *filter; + + filter = damon_new_filter(sys_filter->type, + sys_filter->matching, + sys_filter->allow); + if (!filter) + return -ENOMEM; + if (filter->type == DAMON_FILTER_TYPE_MEMCG) { + int err; + + err = damon_sysfs_memcg_path_to_id( + sys_filter->path, + &filter->memcg_id); + if (err) { + 
damon_destroy_filter(filter); + return err; + } + } + damon_add_filter(c, filter); + } + } + return 0; +} + static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions, unsigned long min_region_sz) @@ -1470,9 +2054,13 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (sys_ctx->ops_id == DAMON_OPS_PADDR) ctx->min_region_sz = max( DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); + ctx->pause = sys_ctx->pause; err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; + err = damon_sysfs_set_probes(ctx, sys_ctx->attrs->probes); + if (err) + return err; err = damon_sysfs_add_targets(ctx, sys_ctx->targets); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 9e5904c2beeb..1cfb8c176b87 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -273,54 +273,70 @@ static void damon_test_merge_regions_of(struct kunit *test) static void damon_test_split_regions_of(struct kunit *test) { + struct damon_ctx *c; struct damon_target *t; struct damon_region *r; unsigned long sa[] = {0, 300, 500}; unsigned long ea[] = {220, 400, 700}; int i; + c = damon_new_ctx(); + if (!c) + kunit_skip(test, "ctx alloc fail"); + t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "target alloc fail"); + } r = damon_new_region(0, 22); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 2, 1); + damon_split_regions_of(c, t, 2, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "second target alloc fail"); + } r = damon_new_region(0, 220); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "second region alloc fail"); } damon_add_region(r, t); - damon_split_regions_of(t, 4, 1); + damon_split_regions_of(c, t, 4, 
1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); t = damon_new_target(); - if (!t) + if (!t) { + damon_destroy_ctx(c); kunit_skip(test, "third target alloc fail"); + } for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); if (!r) { damon_free_target(t); + damon_destroy_ctx(c); kunit_skip(test, "region alloc fail"); } damon_add_region(r, t); } - damon_split_regions_of(t, 4, 5); + damon_split_regions_of(c, t, 4, 5); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u); damon_for_each_region(r, t) KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul); damon_free_target(t); + + damon_destroy_ctx(c); } static void damon_test_ops_registration(struct kunit *test) @@ -374,41 +390,139 @@ static void damon_test_ops_registration(struct kunit *test) } } -static void damon_test_set_regions(struct kunit *test) +static void damon_test_set_regions_for(struct kunit *test, + struct damon_addr_range *old_ranges, int sz_old_ranges, + struct damon_addr_range *new_ranges, int sz_new_ranges, + unsigned long min_region_sz, + struct damon_addr_range *expect_ranges, int sz_expect_ranges) { - struct damon_target *t = damon_new_target(); - struct damon_region *r1, *r2; - struct damon_addr_range range = {.start = 8, .end = 28}; - unsigned long expects[] = {8, 16, 16, 24, 24, 28}; - int expect_idx = 0; + struct damon_target *t; struct damon_region *r; + int i; + t = damon_new_target(); if (!t) kunit_skip(test, "target alloc fail"); - r1 = damon_new_region(4, 16); - if (!r1) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - r2 = damon_new_region(24, 32); - if (!r2) { - damon_free_target(t); - damon_free_region(r1); - kunit_skip(test, "second region alloc fail"); + for (i = 0; i < sz_old_ranges; i++) { + r = damon_new_region(old_ranges[i].start, old_ranges[i].end); + if (!r) { + damon_destroy_target(t, NULL); + kunit_skip(test, "%d-th r alloc fail\n", i); + } + damon_add_region(r, t); } - damon_add_region(r1, t); - 
damon_add_region(r2, t); - damon_set_regions(t, &range, 1, 1); + damon_set_regions(t, new_ranges, sz_new_ranges, min_region_sz); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), sz_expect_ranges); + if (damon_nr_regions(t) != sz_expect_ranges) { + damon_destroy_target(t, NULL); + return; + } + i = 0; damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); - KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.start, expect_ranges[i].start); + KUNIT_EXPECT_EQ(test, r->ar.end, expect_ranges[i++].end); } + damon_destroy_target(t, NULL); } +static void damon_test_set_regions(struct kunit *test) +{ + /* Initial build up on empty target. */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){}, 0, + (struct damon_addr_range[]){ + {.start = 5, .end = 15}, + {.start = 15, .end = 25}, + }, 2, + 1, + (struct damon_addr_range[]){ + {.start = 5, .end = 15}, + {.start = 15, .end = 25}, + }, 2); + /* Un-intersecting regions should be removed. */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 4, .end = 16}, + {.start = 24, .end = 32}, + }, 2, + (struct damon_addr_range[]){ + {.start = 18, .end = 23}, + }, 1, + 1, + (struct damon_addr_range[]){ + {.start = 18, .end = 23}, + }, 1); + /* + * Holes should be filled up with new regions. + * + * old: [4, 16) [24, 32) + * new: [8, 28) + * expect: [8, 16)[16,24),[24, 28) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 4, .end = 16}, + {.start = 24, .end = 32}, + }, 2, + (struct damon_addr_range[]){ + {.start = 8, .end = 28}, + }, 1, + 1, + (struct damon_addr_range[]){ + {.start = 8, .end = 16}, + {.start = 16, .end = 24}, + {.start = 24, .end = 28}, + }, 3); + /* + * New regions should be able to be appended. 
+ * + * old: [0, 4)[4, 17) + * new: [0, 15) [25, 40) + * expect: [0, 4)[4, 15) [25, 40) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 4, .end = 17}, + }, 2, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + }, 2, + 1, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 4, .end = 15}, + {.start = 25, .end = 40}, + }, 3); + /* + * New regions should be able to be inserted. + * + * old: [0, 4) [42, 52) + * new: [0, 15) [25, 40) [44, 50) + * expect: [0, 15) [25, 40) [44, 50) + */ + damon_test_set_regions_for(test, + (struct damon_addr_range[]){ + {.start = 0, .end = 4}, + {.start = 42, .end = 52}, + }, 2, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + {.start = 44, .end = 50}, + }, 3, + 1, + (struct damon_addr_range[]){ + {.start = 0, .end = 15}, + {.start = 25, .end = 40}, + {.start = 44, .end = 50}, + }, 3); +} + static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) { struct damon_attrs attrs = { @@ -694,6 +808,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 2, .sz = 3, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, + .fail_charge_num = 2, + .fail_charge_denom = 3, .weight_sz = 4, .weight_nr_accesses = 5, .weight_age = 6, @@ -703,6 +819,8 @@ static void damos_test_commit_quota(struct kunit *test) .ms = 8, .sz = 9, .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, + .fail_charge_num = 1, + .fail_charge_denom = 1024, .weight_sz = 10, .weight_nr_accesses = 11, .weight_age = 12, @@ -717,6 +835,8 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.ms, src.ms); KUNIT_EXPECT_EQ(test, dst.sz, src.sz); KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner); + KUNIT_EXPECT_EQ(test, dst.fail_charge_num, src.fail_charge_num); + KUNIT_EXPECT_EQ(test, dst.fail_charge_denom, src.fail_charge_denom); KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); 
KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); @@ -1077,6 +1197,10 @@ static void damon_test_commit_ctx(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); src->min_region_sz = 4095; KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL); + src->min_region_sz = 4096; + src->pause = true; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); + KUNIT_EXPECT_TRUE(test, dst->pause); damon_destroy_ctx(src); damon_destroy_ctx(dst); } diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 98e734d77d51..563fbc7e3f44 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -132,22 +132,35 @@ static void damon_do_test_apply_three_regions(struct kunit *test, unsigned long *expected, int nr_expected) { struct damon_target *t; + struct damon_addr_range *ranges; struct damon_region *r; int i; t = damon_new_target(); if (!t) kunit_skip(test, "target alloc fail"); + + ranges = kmalloc_array(nr_regions / 2, sizeof(*ranges), GFP_KERNEL); + if (!ranges) { + damon_destroy_target(t, NULL); + kunit_skip(test, "ranges alloc fail"); + } for (i = 0; i < nr_regions / 2; i++) { - r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); - if (!r) { - damon_destroy_target(t, NULL); - kunit_skip(test, "region alloc fail"); - } - damon_add_region(r, t); + ranges[i].start = regions[i * 2]; + ranges[i].end = regions[i * 2 + 1]; } + if (damon_set_regions(t, ranges, nr_regions / 2, + DAMON_MIN_REGION_SZ)) { + kfree(ranges); + damon_destroy_target(t, NULL); + kunit_skip(test, "damon_set_regions() fail"); + } + kfree(ranges); - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); + if (damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ)) { + damon_destroy_target(t, NULL); + kunit_skip(test, "second damon_set_regions() fail"); + } for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c 
b/mm/damon/vaddr.c index b069dbc7e3d2..d27147603564 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -237,6 +237,35 @@ static void damon_va_update(struct damon_ctx *ctx) } } +static void damon_va_walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mm_walk_ops *ops, void *private) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, start); + if (!vma) + goto lock_mmap; + + if (end > vma->vm_end) { + vma_end_read(vma); + goto lock_mmap; + } + + if (!(vma->vm_flags & VM_PFNMAP)) { + ops->walk_lock = PGWALK_VMA_RDLOCK_VERIFY; + walk_page_range_vma(vma, start, end, ops, private); + } + + vma_end_read(vma); + return; + +lock_mmap: + mmap_read_lock(mm); + ops->walk_lock = PGWALK_RDLOCK; + walk_page_range(mm, start, end, ops, private); + mmap_read_unlock(mm); +} + static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -315,17 +344,14 @@ out: #define damon_mkold_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static const struct mm_walk_ops damon_mkold_ops = { - .pmd_entry = damon_mkold_pmd_entry, - .hugetlb_entry = damon_mkold_hugetlb_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) { - mmap_read_lock(mm); - walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); - mmap_read_unlock(mm); + struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, + }; + + damon_va_walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); } /* @@ -333,9 +359,10 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) */ static void __damon_va_prepare_access_check(struct mm_struct *mm, - struct damon_region *r) + struct damon_region *r, + struct damon_ctx *ctx) { - r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + r->sampling_addr = damon_rand(ctx, r->ar.start, r->ar.end); damon_va_mkold(mm, r->sampling_addr); } @@ -351,7 
+378,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(mm, r); + __damon_va_prepare_access_check(mm, r, ctx); mmput(mm); } } @@ -444,12 +471,6 @@ out: #define damon_young_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static const struct mm_walk_ops damon_young_ops = { - .pmd_entry = damon_young_pmd_entry, - .hugetlb_entry = damon_young_hugetlb_entry, - .walk_lock = PGWALK_RDLOCK, -}; - static bool damon_va_young(struct mm_struct *mm, unsigned long addr, unsigned long *folio_sz) { @@ -458,9 +479,12 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, .young = false, }; - mmap_read_lock(mm); - walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); - mmap_read_unlock(mm); + struct mm_walk_ops damon_young_ops = { + .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, + }; + + damon_va_walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); return arg.young; } @@ -749,7 +773,6 @@ static unsigned long damos_va_migrate(struct damon_target *target, struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_migrate_pmd_entry, .pte_entry = NULL, - .walk_lock = PGWALK_RDLOCK, }; use_target_nid = dests->nr_dests == 0; @@ -767,9 +790,7 @@ static unsigned long damos_va_migrate(struct damon_target *target, if (!mm) goto free_lists; - mmap_read_lock(mm); - walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); - mmap_read_unlock(mm); + damon_va_walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); mmput(mm); for (int i = 0; i < nr_dests; i++) { @@ -861,7 +882,6 @@ static unsigned long damos_va_stat(struct damon_target *target, struct mm_struct *mm; struct mm_walk_ops walk_ops = { .pmd_entry = damos_va_stat_pmd_entry, - .walk_lock = PGWALK_RDLOCK, }; priv.scheme = s; @@ -874,9 +894,7 @@ static unsigned long damos_va_stat(struct damon_target *target, if (!mm) return 0; - mmap_read_lock(mm); - 
walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); - mmap_read_unlock(mm); + damon_va_walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); mmput(mm); return 0; } @@ -903,6 +921,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_COLLAPSE: + madv_action = MADV_COLLAPSE; + break; case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); diff --git a/mm/filemap.c b/mm/filemap.c index 179f2886f8c0..7e467c81d213 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1808,9 +1808,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); - unsigned long nr = max_scan; - while (nr--) { + while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; @@ -1818,7 +1817,8 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, return 0; } - return index + max_scan; + /* Return end of the range + 1 when no hole is found */ + return xas.xa_index + 1; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1849,12 +1849,13 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - break; + return xas.xa_index; if (xas.xa_index == ULONG_MAX) - break; + return ULONG_MAX; } - return xas.xa_index; + /* Return start of the range - 1 when no hole is found */ + return xas.xa_index - 1; } EXPORT_SYMBOL(page_cache_prev_miss); @@ -2294,8 +2295,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, goto put_folio; if (!folio_batch_add(fbatch, folio)) { - nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } xas_advance(&xas, folio_next_index(folio) - 1); @@ -2355,8 +2355,7 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, if 
(xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } } @@ -2414,8 +2413,7 @@ unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, } } if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; + *start = folio_next_index(folio); goto out; } } @@ -3323,12 +3321,26 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *fpin = NULL; vm_flags_t vm_flags = vmf->vma->vm_flags; bool force_thp_readahead = false; + unsigned int thp_order = 0; unsigned short mmap_miss; + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; + /* Use the readahead code, even if readahead is disabled */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) - force_thp_readahead = true; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (vm_flags & VM_HUGEPAGE)) { + /* + * Cap max THP order at 2MB: this is the common PMD-sized + * hugepage size, and it avoids memory pressure from very + * large forced readahead when mapping_max_folio_order() is + * high (for example, 128MB with 64K base pages on arm64). 
+ */ + if (mapping_large_folio_support(mapping)) { + force_thp_readahead = true; + thp_order = min_t(unsigned int, + mapping_max_folio_order(mapping), + get_order(SZ_2M)); + } + } if (!force_thp_readahead) { /* @@ -3348,7 +3360,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } } - if (!(vm_flags & VM_SEQ_READ)) { + if (!(vm_flags & (VM_SEQ_READ | VM_EXEC))) { /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) @@ -3363,17 +3375,19 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) } if (force_thp_readahead) { + unsigned long folio_nr_pages = 1UL << thp_order; + fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); - ra->size = HPAGE_PMD_NR; + ractl._index &= ~(folio_nr_pages - 1); + ra->size = folio_nr_pages; /* - * Fetch two PMD folios, so we get the chance to actually + * Fetch two folios so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; - ra->async_size = HPAGE_PMD_NR; - ra->order = HPAGE_PMD_ORDER; + ra->async_size = folio_nr_pages; + ra->order = thp_order; page_cache_ra_order(&ractl, ra); return fpin; } @@ -3407,6 +3421,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * mmap read-around */ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->start = max(ra->start, vmf->vma->vm_pgoff); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra->order = 0; @@ -3441,14 +3456,20 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * Don't touch the mmap_miss counter to avoid decreasing it multiple * times for a single folio and break the balance with mmap_miss * increase in do_sync_mmap_readahead(). + * + * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss increment in + * do_sync_mmap_readahead(), so skip the decrement here as well to + * keep the counter symmetric. 
*/ - if (likely(!folio_test_locked(folio))) { + if (likely(!folio_test_locked(folio)) && + !(vmf->vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) { mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); } if (folio_test_readahead(folio)) { + ractl._max_index = vmf->vma->vm_pgoff + vma_pages(vmf->vma) - 1; fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } @@ -3758,8 +3779,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss, - pgoff_t file_end) + unsigned long *rss, pgoff_t file_end) { struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; @@ -3792,16 +3812,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, goto skip; /* - * If there are too many folios that are recently evicted - * in a file, they will probably continue to be evicted. - * In such situation, read-ahead is only a waste of IO. - * Don't decrease mmap_miss in this scenario to make sure - * we can stop read-ahead. - */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. 
@@ -3847,7 +3857,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3855,10 +3865,6 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) goto out; - /* See comment of filemap_map_folio_range() */ - if (!folio_test_workingset(folio)) - (*mmap_miss)++; - /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit @@ -3893,7 +3899,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; - unsigned short mmap_miss = 0, mmap_miss_saved; /* * Recalculate end_pgoff based on file_end before calling @@ -3932,6 +3937,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_type = mm_counter_file(folio); do { unsigned long end; + vm_fault_t map_ret; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; @@ -3939,13 +3945,40 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; - if (!folio_test_large(folio)) - ret |= filemap_map_order0_folio(vmf, - folio, addr, &rss, &mmap_miss); - else - ret |= filemap_map_folio_range(vmf, folio, - xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss, file_end); + if (!folio_test_large(folio)) { + map_ret = filemap_map_order0_folio(vmf, folio, addr, + &rss); + } else { + unsigned long start = xas.xa_index - folio->index; + + map_ret = filemap_map_folio_range(vmf, folio, start, + addr, nr_pages, &rss, + file_end); + } + ret |= map_ret; + + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. 
+ * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. + * + * VM_SEQ_READ and VM_EXEC mappings skip the mmap_miss + * increment in do_sync_mmap_readahead(), so skip the + * decrement here as well to keep the counter symmetric. + */ + if ((map_ret & VM_FAULT_NOPAGE) && + !(vmf->flags & FAULT_FLAG_TRIED) && + !folio_test_workingset(folio) && + !(vma->vm_flags & (VM_SEQ_READ | VM_EXEC))) { + unsigned short mmap_miss; + + mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + if (mmap_miss) + WRITE_ONCE(file->f_ra.mmap_miss, + mmap_miss - 1); + } folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); @@ -3955,12 +3988,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, out: rcu_read_unlock(); - mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); - if (mmap_miss >= mmap_miss_saved) - WRITE_ONCE(file->f_ra.mmap_miss, 0); - else - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); - return ret; } EXPORT_SYMBOL(filemap_map_pages); @@ -2865,8 +2865,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + if (unlikely(pmd_val(pmd) != pmd_val(pmdp_get_lockless(pmdp))) || + unlikely(pte_val(pte) != pte_val(ptep_get_lockless(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2942,7 +2942,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!folio) return 0; - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + if (unlikely(pmd_val(orig) != pmd_val(pmdp_get_lockless(pmdp)))) { gup_put_folio(folio, refs, flags); return 0; } @@ -2985,7 +2985,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!folio) return 0; - if (unlikely(pud_val(orig) != pud_val(*pudp))) { + if (unlikely(pud_val(orig) != pud_val(pudp_get(pudp)))) { gup_put_folio(folio, refs, flags); return 0; } diff 
--git a/mm/huge_memory.c b/mm/huge_memory.c index d29e85495091..64492dcb9d1e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -14,6 +14,7 @@ #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/list_lru.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/swapops.h> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +static struct lock_class_key deferred_split_key; +static struct list_lru deferred_split_lru; static struct shrinker *deferred_split_shrinker; static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc); @@ -429,61 +432,75 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } +enum defrag_mode { + DEFRAG_ALWAYS = 0, + DEFRAG_DEFER, + DEFRAG_DEFER_MADVISE, + DEFRAG_MADVISE, + DEFRAG_NEVER, +}; + +static const char * const defrag_mode_strings[] = { + [DEFRAG_ALWAYS] = "always", + [DEFRAG_DEFER] = "defer", + [DEFRAG_DEFER_MADVISE] = "defer+madvise", + [DEFRAG_MADVISE] = "madvise", + [DEFRAG_NEVER] = "never", +}; + +static const enum transparent_hugepage_flag defrag_flags[] = { + [DEFRAG_ALWAYS] = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + [DEFRAG_DEFER] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + [DEFRAG_DEFER_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + [DEFRAG_MADVISE] = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +}; + static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - const char *output; + int active = DEFRAG_NEVER; + int len = 0; + int i; - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) - output = "[always] defer defer+madvise madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - output = "always [defer] defer+madvise madvise never"; - else if 
(test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, - &transparent_hugepage_flags)) - output = "always defer [defer+madvise] madvise never"; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags)) - output = "always defer defer+madvise [madvise] never"; - else - output = "always defer defer+madvise madvise [never]"; + for (i = 0; i < ARRAY_SIZE(defrag_flags); i++) { + if (test_bit(defrag_flags[i], &transparent_hugepage_flags)) { + active = i; + break; + } + } - return sysfs_emit(buf, "%s\n", output); + for (i = 0; i < ARRAY_SIZE(defrag_mode_strings); i++) { + if (i == active) + len += sysfs_emit_at(buf, len, "[%s] ", + defrag_mode_strings[i]); + else + len += sysfs_emit_at(buf, len, "%s ", + defrag_mode_strings[i]); + } + + /* Replace trailing space with newline */ + buf[len - 1] = '\n'; + + return len; } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "always")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer+madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "defer")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - 
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "never")) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else + int mode, m; + + mode = sysfs_match_string(defrag_mode_strings, buf); + if (mode < 0) return -EINVAL; + for (m = 0; m < ARRAY_SIZE(defrag_flags); m++) { + if (m == mode) + set_bit(defrag_flags[m], &transparent_hugepage_flags); + else + clear_bit(defrag_flags[m], &transparent_hugepage_flags); + } + return count; } static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); @@ -918,15 +935,28 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) } #endif /* CONFIG_SYSFS */ +int folio_memcg_alloc_deferred(struct folio *folio) +{ + if (mem_cgroup_disabled()) + return 0; + return folio_memcg_list_lru_alloc(folio, &deferred_split_lru, GFP_KERNEL); +} + static int __init thp_shrinker_init(void) { deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | - SHRINKER_MEMCG_AWARE | - SHRINKER_NONSLAB, + SHRINKER_MEMCG_AWARE, "thp-deferred_split"); if (!deferred_split_shrinker) return -ENOMEM; + if (list_lru_init_memcg_key(&deferred_split_lru, + deferred_split_shrinker, + &deferred_split_key)) { + 
shrinker_free(deferred_split_shrinker); + return -ENOMEM; + } + deferred_split_shrinker->count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); @@ -948,6 +978,7 @@ static int __init thp_shrinker_init(void) huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); if (!huge_zero_folio_shrinker) { shrinker_free(deferred_split_shrinker); + list_lru_destroy(&deferred_split_lru); return -ENOMEM; } @@ -962,6 +993,7 @@ static void __init thp_shrinker_exit(void) { shrinker_free(huge_zero_folio_shrinker); shrinker_free(deferred_split_shrinker); + list_lru_destroy(&deferred_split_lru); } static int __init hugepage_init(void) @@ -1141,119 +1173,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static struct deferred_split *split_queue_node(int nid) -{ - struct pglist_data *pgdata = NODE_DATA(nid); - - return &pgdata->deferred_split_queue; -} - -#ifdef CONFIG_MEMCG -static inline -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, - struct deferred_split *queue) -{ - if (mem_cgroup_disabled()) - return NULL; - if (split_queue_node(folio_nid(folio)) == queue) - return NULL; - return container_of(queue, struct mem_cgroup, deferred_split_queue); -} - -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) -{ - return memcg ? 
&memcg->deferred_split_queue : split_queue_node(nid); -} -#else -static inline -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio, - struct deferred_split *queue) -{ - return NULL; -} - -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg) -{ - return split_queue_node(nid); -} -#endif - -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg) -{ - struct deferred_split *queue; - -retry: - queue = memcg_split_queue(nid, memcg); - spin_lock(&queue->split_queue_lock); - /* - * There is a period between setting memcg to dying and reparenting - * deferred split queue, and during this period the THPs in the deferred - * split queue will be hidden from the shrinker side. - */ - if (unlikely(memcg_is_dying(memcg))) { - spin_unlock(&queue->split_queue_lock); - memcg = parent_mem_cgroup(memcg); - goto retry; - } - - return queue; -} - -static struct deferred_split * -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags) -{ - struct deferred_split *queue; - -retry: - queue = memcg_split_queue(nid, memcg); - spin_lock_irqsave(&queue->split_queue_lock, *flags); - if (unlikely(memcg_is_dying(memcg))) { - spin_unlock_irqrestore(&queue->split_queue_lock, *flags); - memcg = parent_mem_cgroup(memcg); - goto retry; - } - - return queue; -} - -static struct deferred_split *folio_split_queue_lock(struct folio *folio) -{ - struct deferred_split *queue; - - rcu_read_lock(); - queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); - /* - * The memcg destruction path is acquiring the split queue lock for - * reparenting. Once you have it locked, it's safe to drop the rcu lock. 
- */ - rcu_read_unlock(); - - return queue; -} - -static struct deferred_split * -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) -{ - struct deferred_split *queue; - - rcu_read_lock(); - queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); - rcu_read_unlock(); - - return queue; -} - -static inline void split_queue_unlock(struct deferred_split *queue) -{ - spin_unlock(&queue->split_queue_lock); -} - -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue, - unsigned long flags) -{ - spin_unlock_irqrestore(&queue->split_queue_lock, flags); -} - static inline bool is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) @@ -1354,6 +1273,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return NULL; } + + if (folio_memcg_alloc_deferred(folio)) { + folio_put(folio); + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + folio_throttle_swaprate(folio, gfp); /* @@ -2638,6 +2565,8 @@ static void change_non_present_huge_pmd(struct mm_struct *mm, } else if (softleaf_is_device_private_write(entry)) { entry = make_readable_device_private_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_uffd_wp(*pmd)) + newpmd = pmd_swp_mkuffd_wp(newpmd); } else { newpmd = *pmd; } @@ -3890,34 +3819,43 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n struct folio *end_folio = folio_next(folio); struct folio *new_folio, *next; int old_order = folio_order(folio); + struct list_lru_one *lru; + bool dequeue_deferred; int ret = 0; - struct deferred_split *ds_queue; VM_WARN_ON_ONCE(!mapping && end); - /* Prevent deferred_split_scan() touching ->_refcount */ - ds_queue = folio_split_queue_lock(folio); + /* + * If this folio can be on the deferred split queue, lock out + * the shrinker before 
freezing the ref. If the shrinker sees + * a 0-ref folio, it assumes it beat folio_put() to the list + * lock and must clean up the LRU state - the same dequeue we + * will do below as part of the split. + */ + dequeue_deferred = folio_test_anon(folio) && old_order > 1; + if (dequeue_deferred) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock(&deferred_split_lru, + folio_nid(folio), &memcg); + } if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) { struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; - if (old_order > 1) { - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - /* - * Reinitialize page_deferred_list after removing the - * page from the split_queue, otherwise a subsequent - * split will see list corruption when checking the - * page_deferred_list. - */ - list_del_init(&folio->_deferred_list); - } + if (dequeue_deferred) { + __list_lru_del(&deferred_split_lru, lru, + &folio->_deferred_list, folio_nid(folio)); if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(old_order, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } + list_lru_unlock(lru); + rcu_read_unlock(); } - split_queue_unlock(ds_queue); + if (mapping) { int nr = folio_nr_pages(folio); @@ -4018,7 +3956,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n if (ci) swap_cluster_unlock(ci); } else { - split_queue_unlock(ds_queue); + if (dequeue_deferred) { + list_lru_unlock(lru); + rcu_read_unlock(); + } return -EAGAIN; } @@ -4193,11 +4134,10 @@ fail: folio_unlock(new_folio); /* - * Subpages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. 
+ * Subpages whose mapping has been zapped may be freed + * earlier, but freeing them requires taking the + * lru_lock, so we defer put_page() on tail pages until + * after the split completes. */ free_folio_and_swap_cache(new_folio); } @@ -4385,33 +4325,37 @@ int split_folio_to_list(struct folio *folio, struct list_head *list) * queueing THP splits, and that list is (racily observed to be) non-empty. * * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is - * zero: because even when split_queue_lock is held, a non-empty _deferred_list - * might be in use on deferred_split_scan()'s unlocked on-stack list. + * zero: because even when the list_lru lock is held, a non-empty + * _deferred_list might be in use on deferred_split_scan()'s unlocked + * on-stack list. * - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is - * therefore important to unqueue deferred split before changing folio memcg. + * The list_lru sublist is determined by folio's memcg: it is therefore + * important to unqueue deferred split before changing folio memcg. 
*/ bool __folio_unqueue_deferred_split(struct folio *folio) { - struct deferred_split *ds_queue; + struct mem_cgroup *memcg; + struct list_lru_one *lru; + int nid = folio_nid(folio); unsigned long flags; bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio)); - ds_queue = folio_split_queue_lock_irqsave(folio, &flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags); + if (__list_lru_del(&deferred_split_lru, lru, &folio->_deferred_list, nid)) { if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } - list_del_init(&folio->_deferred_list); unqueued = true; } - split_queue_unlock_irqrestore(ds_queue, flags); + list_lru_unlock_irqrestore(lru, &flags); + rcu_read_unlock(); return unqueued; /* useful for debug warnings */ } @@ -4419,7 +4363,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio) /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { - struct deferred_split *ds_queue; + struct list_lru_one *lru; + int nid; + struct mem_cgroup *memcg; unsigned long flags; /* @@ -4434,7 +4380,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) /* * Exclude swapcache: originally to avoid a corrupt deferred split - * queue. Nowadays that is fully prevented by memcg1_swapout(); + * queue. Nowadays that is fully prevented by __memcg1_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization. 
@@ -4442,7 +4388,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) if (folio_test_swapcache(folio)) return; - ds_queue = folio_split_queue_lock_irqsave(folio, &flags); + nid = folio_nid(folio); + + rcu_read_lock(); + memcg = folio_memcg(folio); + lru = list_lru_lock_irqsave(&deferred_split_lru, nid, &memcg, &flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); @@ -4450,36 +4400,23 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); - } } else { /* partially mapped folios cannot become non-partially mapped */ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } - if (list_empty(&folio->_deferred_list)) { - struct mem_cgroup *memcg; - - memcg = folio_split_queue_memcg(folio, ds_queue); - list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); - ds_queue->split_queue_len++; - if (memcg) - set_shrinker_bit(memcg, folio_nid(folio), - shrinker_id(deferred_split_shrinker)); - } - split_queue_unlock_irqrestore(ds_queue, flags); + __list_lru_add(&deferred_split_lru, lru, &folio->_deferred_list, nid, memcg); + list_lru_unlock_irqrestore(lru, &flags); + rcu_read_unlock(); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { - struct pglist_data *pgdata = NODE_DATA(sc->nid); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + unsigned long count; -#ifdef CONFIG_MEMCG - if (sc->memcg) - ds_queue = &sc->memcg->deferred_split_queue; -#endif - return READ_ONCE(ds_queue->split_queue_len); + count = list_lru_shrink_count(&deferred_split_lru, sc); + return count ?: SHRINK_EMPTY; } static bool thp_underused(struct folio *folio) @@ -4509,45 +4446,49 @@ static bool thp_underused(struct folio *folio) return false; } +static enum 
lru_status deferred_split_isolate(struct list_head *item, + struct list_lru_one *lru, + void *cb_arg) +{ + struct folio *folio = container_of(item, struct folio, _deferred_list); + struct list_head *freeable = cb_arg; + + if (folio_try_get(folio)) { + list_lru_isolate_move(lru, item, freeable); + return LRU_REMOVED; + } + + /* + * We lost race with folio_put(). Read folio state before the + * isolate: folio_unqueue_deferred_split() checks list_empty() + * locklessly, so once removed the folio can be freed any time. + */ + if (folio_test_partially_mapped(folio)) { + folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } + list_lru_isolate(lru, item); + return LRU_REMOVED; +} + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct deferred_split *ds_queue; - unsigned long flags; + LIST_HEAD(dispose); struct folio *folio, *next; - int split = 0, i; - struct folio_batch fbatch; - - folio_batch_init(&fbatch); + int split = 0; + unsigned long isolated; -retry: - ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags); - /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_entry_safe(folio, next, &ds_queue->split_queue, - _deferred_list) { - if (folio_try_get(folio)) { - folio_batch_add(&fbatch, folio); - } else if (folio_test_partially_mapped(folio)) { - /* We lost race with folio_put() */ - folio_clear_partially_mapped(folio); - mod_mthp_stat(folio_order(folio), - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); - } - list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; - if (!--sc->nr_to_scan) - break; - if (!folio_batch_space(&fbatch)) - break; - } - split_queue_unlock_irqrestore(ds_queue, flags); + isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc, + deferred_split_isolate, &dispose); - for (i = 0; i < folio_batch_count(&fbatch); i++) { + list_for_each_entry_safe(folio, next, &dispose, _deferred_list) 
{ bool did_split = false; bool underused = false; - struct deferred_split *fqueue; - folio = fbatch.folios[i]; + list_del_init(&folio->_deferred_list); + if (!folio_test_partially_mapped(folio)) { /* * See try_to_map_unused_to_zeropage(): we cannot @@ -4576,63 +4517,23 @@ next: * underused, then consider it used and don't add it back to * split_queue. */ - if (did_split || !folio_test_partially_mapped(folio)) - continue; + if (!did_split && folio_test_partially_mapped(folio)) { requeue: - /* - * Add back partially mapped folios, or underused folios that - * we could not lock this round. - */ - fqueue = folio_split_queue_lock_irqsave(folio, &flags); - if (list_empty(&folio->_deferred_list)) { - list_add_tail(&folio->_deferred_list, &fqueue->split_queue); - fqueue->split_queue_len++; + rcu_read_lock(); + list_lru_add_irq(&deferred_split_lru, + &folio->_deferred_list, + folio_nid(folio), + folio_memcg(folio)); + rcu_read_unlock(); } - split_queue_unlock_irqrestore(fqueue, flags); - } - folios_put(&fbatch); - - if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) { - cond_resched(); - goto retry; + folio_put(folio); } - /* - * Stop shrinker if we didn't split any page, but the queue is empty. - * This can happen if pages were freed under us. 
- */ - if (!split && list_empty(&ds_queue->split_queue)) + if (!split && !isolated) return SHRINK_STOP; return split; } -#ifdef CONFIG_MEMCG -void reparent_deferred_split_queue(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct deferred_split *ds_queue = &memcg->deferred_split_queue; - struct deferred_split *parent_ds_queue = &parent->deferred_split_queue; - int nid; - - spin_lock_irq(&ds_queue->split_queue_lock); - spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING); - - if (!ds_queue->split_queue_len) - goto unlock; - - list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue); - parent_ds_queue->split_queue_len += ds_queue->split_queue_len; - ds_queue->split_queue_len = 0; - - for_each_node(nid) - set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker)); - -unlock: - spin_unlock(&parent_ds_queue->split_queue_lock); - spin_unlock_irq(&ds_queue->split_queue_lock); -} -#endif - #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c921287489de..571212b80835 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2862,6 +2862,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct hugetlb_cgroup *h_cg_rsvd = NULL; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; idx = hstate_index(h); @@ -2912,7 +2913,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( - idx, pages_per_huge_page(h), &h_cg); + idx, pages_per_huge_page(h), &h_cg_rsvd); if (ret) goto out_subpool_put; } @@ -2954,7 +2955,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, folio); + h_cg_rsvd, folio); } spin_unlock_irq(&hugetlb_lock); @@ -3006,7 +3007,7 @@ out_uncharge_cgroup: 
out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), - h_cg); + h_cg_rsvd); out_subpool_put: /* * put page to subpool iff the quota of subpool's rsv_hpages is used diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index 7693ccefd0c6..39344d6c78d8 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -142,7 +142,7 @@ unsigned int __weak arch_hugetlb_cma_order(void) void __init hugetlb_cma_reserve(void) { - unsigned long size, reserved, per_node, order; + unsigned long size, reserved, per_node, order, gigantic_page_size; bool node_specific_cma_alloc = false; int nid; @@ -162,37 +162,36 @@ void __init hugetlb_cma_reserve(void) * breaking this assumption. */ VM_WARN_ON(order <= MAX_PAGE_ORDER); + gigantic_page_size = PAGE_SIZE << order; hugetlb_bootmem_set_nodes(); for (nid = 0; nid < MAX_NUMNODES; nid++) { - if (hugetlb_cma_size_in_node[nid] == 0) + size = hugetlb_cma_size_in_node[nid]; + if (size == 0) continue; if (!node_isset(nid, hugetlb_bootmem_nodes)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - continue; - } - - if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", - nid, (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; + } else if (!IS_ALIGNED(size, gigantic_page_size)) { + pr_warn("hugetlb_cma: cma area of node %d must be a multiple of %lu MiB\n", + nid, gigantic_page_size / SZ_1M); } else { node_specific_cma_alloc = true; + continue; } + + hugetlb_cma_size -= size; + hugetlb_cma_size_in_node[nid] = 0; } /* Validate the CMA size again in case some invalid nodes specified. 
*/ if (!hugetlb_cma_size) return; - if (hugetlb_cma_size < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", - (PAGE_SIZE << order) / SZ_1M); + if (!IS_ALIGNED(hugetlb_cma_size, gigantic_page_size)) { + pr_warn("hugetlb_cma: cma area must be a multiple of %lu MiB\n", + gigantic_page_size / SZ_1M); hugetlb_cma_size = 0; return; } @@ -204,7 +203,7 @@ void __init hugetlb_cma_reserve(void) */ per_node = DIV_ROUND_UP(hugetlb_cma_size, nodes_weight(hugetlb_bootmem_nodes)); - per_node = round_up(per_node, PAGE_SIZE << order); + per_node = round_up(per_node, gigantic_page_size); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } @@ -223,15 +222,13 @@ void __init hugetlb_cma_reserve(void) size = min(per_node, hugetlb_cma_size - reserved); } - size = round_up(size, PAGE_SIZE << order); - snprintf(name, sizeof(name), "hugetlb%d", nid); /* * Note that 'order per bit' is based on smallest size that * may be returned to CMA allocator in the case of * huge page demotion. */ - res = cma_declare_contiguous_multi(size, PAGE_SIZE << order, + res = cma_declare_contiguous_multi(size, gigantic_page_size, HUGETLB_PAGE_ORDER, name, &hugetlb_cma[nid], nid); if (res) { diff --git a/mm/internal.h b/mm/internal.h index 5a2ddcf68e0b..181e79f1d6a2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -17,7 +17,6 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/leafops.h> -#include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. 
*/ @@ -451,24 +450,16 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; - const softleaf_t entry = softleaf_from_pte(pte); pte_t *ptep = start_ptep + 1; - unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); - VM_WARN_ON(!softleaf_is_swap(entry)); + VM_WARN_ON(!softleaf_is_swap(softleaf_from_pte(pte))); - cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { - softleaf_t entry; - pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - entry = softleaf_from_pte(pte); - if (lookup_swap_cgroup_id(entry) != cgroup_id) - break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } @@ -861,7 +852,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio) /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe - * to check without acquiring the split_queue_lock. + * to check without acquiring the list_lru lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; @@ -1104,9 +1095,17 @@ static inline void init_cma_pageblock(struct page *page) } #endif - -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claimable); +enum fallback_result { + /* Found suitable migratetype, *mt_out is valid. */ + FALLBACK_FOUND, + /* No fallback found in requested order. */ + FALLBACK_EMPTY, + /* Passed @claimable, but claiming whole block is a bad idea. 
*/ + FALLBACK_NOCLAIM, +}; +enum fallback_result +find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool claimable, int *mt_out); static inline bool free_area_empty(struct free_area *area, int migratetype) { diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index e41ba69592ef..b9e167ed5be3 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -874,6 +874,16 @@ static void kmalloc_double_kzfree(struct kunit *test) char *ptr; size_t size = 16; + /* + * With the tag-based KASAN modes, if the memory happens to be + * reallocated between the two frees and the new allocation tag happens + * to match the old one, the second free will cause a memory corruption. + * Resolving https://bugzilla.kernel.org/show_bug.cgi?id=212177 would + * help to deal with this. With Generic KASAN, it's effectively + * impossible for the memory to get reallocated due to the quarantine. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index c6048f57bae9..de2d0f7d62b1 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -263,7 +263,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat break; } - kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + kunit_info(test, "%s: size=%zu, gfp=%pGg, policy=%s, cache=%i\n", __func__, size, &gfp, policy_name, !!test_cache); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b8452dbdb043..73e262cb30dd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -437,13 +437,16 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(collapse_test_exit(mm), mm); - if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) - return; slot = mm_slot_alloc(mm_slot_cache); if (!slot) return; + if 
(unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) { + mm_slot_free(mm_slot_cache, slot); + return; + } + spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* @@ -1120,6 +1123,11 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a if (result != SCAN_SUCCEED) goto out_nolock; + if (folio_memcg_alloc_deferred(folio)) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } + mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { @@ -2528,8 +2536,8 @@ static void collapse_scan_mm_slot(unsigned int progress_max, cc->progress++; continue; } - hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); - hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); + hstart = ALIGN(vma->vm_start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { cc->progress++; continue; @@ -2808,6 +2816,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_HAS_PRIVATE: case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* @@ -2845,8 +2854,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, mmgrab(mm); lru_add_drain_all(); - hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = end & HPAGE_PMD_MASK; + hstart = ALIGN(start, HPAGE_PMD_SIZE); + hend = ALIGN_DOWN(end, HPAGE_PMD_SIZE); for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2eff0d6b622b..7c7ba17ce7af 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,6 +92,7 @@ #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/workqueue.h> +#include <linux/xarray.h> #include <linux/crc32.h> #include <asm/sections.h> @@ -157,6 +158,8 @@ struct kmemleak_object { struct hlist_head area_list; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current 
task */ + /* per-scan dedup count, valid only while in scan-local dedup xarray */ + unsigned int dup_count; char comm[TASK_COMM_LEN]; /* executable name */ }; @@ -360,8 +363,9 @@ static const char *__object_type_str(struct kmemleak_object *object) * Printing of the unreferenced objects information to the seq file. The * print_unreferenced function must be called with the object->lock held. */ -static void print_unreferenced(struct seq_file *seq, - struct kmemleak_object *object) +static void __print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object, + bool hex_dump) { int i; unsigned long *entries; @@ -373,7 +377,8 @@ static void print_unreferenced(struct seq_file *seq, object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); - hex_dump_object(seq, object); + if (hex_dump) + hex_dump_object(seq, object); warn_or_seq_printf(seq, " backtrace (crc %x):\n", object->checksum); for (i = 0; i < nr_entries; i++) { @@ -382,6 +387,12 @@ static void print_unreferenced(struct seq_file *seq, } } +static void print_unreferenced(struct seq_file *seq, + struct kmemleak_object *object) +{ + __print_unreferenced(seq, object, true); +} + /* * Print the kmemleak_object information. This function is used mainly for * debugging special cases when kmemleak operations. It must be called with @@ -1685,6 +1696,103 @@ unlock_put: } /* + * Print one leak inline. The hex dump is gated on OBJECT_ALLOCATED so it + * does not touch user memory that was freed concurrently; the rest of the + * report (backtrace, comm, pid) is always emitted since the kmemleak_object + * metadata is pinned by the caller. 
+ */ +static void print_leak_locked(struct kmemleak_object *object, bool hex_dump) +{ + raw_spin_lock_irq(&object->lock); + __print_unreferenced(NULL, object, + hex_dump && (object->flags & OBJECT_ALLOCATED)); + raw_spin_unlock_irq(&object->lock); +} + +/* + * Per-scan dedup table for verbose leak printing. The xarray is keyed by + * stackdepot trace_handle and stores a pointer to the representative + * kmemleak_object. The per-scan repeat count lives in object->dup_count. + * + * dedup_record() must run outside object->lock: xa_store() may take + * mutexes (xa_node slab allocation) which lockdep would flag against the + * raw spinlock object->lock. + */ +static void dedup_record(struct xarray *dedup, struct kmemleak_object *object, + depot_stack_handle_t trace_handle) +{ + struct kmemleak_object *rep; + void *old; + + /* + * No stack trace to dedup against: early-boot allocation tracked + * before kmemleak_init() set up object_cache, or stack_depot_save() + * failure under memory pressure. + */ + if (!trace_handle) { + print_leak_locked(object, true); + return; + } + + /* stack is available, now we can de-dup */ + rep = xa_load(dedup, trace_handle); + if (rep) { + rep->dup_count++; + return; + } + + /* + * Object is being torn down (use_count already hit zero); the + * tracked memory at object->pointer is unsafe to read, so skip. + */ + if (!get_object(object)) + return; + + object->dup_count = 1; + old = xa_store(dedup, trace_handle, object, GFP_ATOMIC); + if (xa_is_err(old)) { + /* xa_node allocation failed; fall back to inline print. */ + print_leak_locked(object, true); + put_object(object); + return; + } + /* + * scan_mutex serialises all writers to the dedup xarray, so xa_store() + * after a NULL xa_load() must always overwrite an empty slot. + */ + WARN_ON_ONCE(old); +} + +/* + * Drain the dedup table. 
Re-acquires object->lock and re-checks + * OBJECT_ALLOCATED before printing: while get_object() pins the + * kmemleak_object metadata, the underlying tracked allocation may have + * been freed since the scan walked it (kmemleak_free clears + * OBJECT_ALLOCATED under object->lock before the user memory goes away). + * The hex dump is skipped for coalesced entries since the bytes would + * differ across objects anyway. + */ +static void dedup_flush(struct xarray *dedup) +{ + struct kmemleak_object *object; + unsigned long idx; + unsigned int dup; + bool coalesced; + + xa_for_each(dedup, idx, object) { + dup = object->dup_count; + coalesced = dup > 1; + + print_leak_locked(object, !coalesced); + if (coalesced) + pr_warn(" ... and %u more object(s) with the same backtrace\n", + dup - 1); + put_object(object); + xa_erase(dedup, idx); + } +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1694,6 +1802,7 @@ static void kmemleak_scan(void) struct kmemleak_object *object; struct zone *zone; int __maybe_unused i; + struct xarray dedup; int new_leaks = 0; jiffies_last_scan = jiffies; @@ -1834,10 +1943,18 @@ static void kmemleak_scan(void) return; /* - * Scanning result reporting. + * Scanning result reporting. When verbose printing is enabled, dedupe + * by stackdepot trace_handle so each unique backtrace is logged once + * per scan, annotated with the number of objects that share it. The + * per-leak count below still reflects every object, and + * /sys/kernel/debug/kmemleak still lists them individually. 
*/ + xa_init(&dedup); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + depot_stack_handle_t trace_handle; + bool dedup_print; + if (need_resched()) kmemleak_cond_resched(object); @@ -1849,18 +1966,33 @@ static void kmemleak_scan(void) if (!color_white(object)) continue; raw_spin_lock_irq(&object->lock); + trace_handle = 0; + dedup_print = false; if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; - - if (kmemleak_verbose) - print_unreferenced(NULL, object); - + if (kmemleak_verbose) { + trace_handle = object->trace_handle; + dedup_print = true; + } new_leaks++; } raw_spin_unlock_irq(&object->lock); + + /* + * Defer the verbose print outside object->lock: xa_store() + * may take xa_node slab locks at a higher wait-context level + * which lockdep would flag against the raw_spinlock_t + * object->lock. rcu_read_lock() keeps the kmemleak_object + * alive across the call. + */ + if (dedup_print) + dedup_record(&dedup, object, trace_handle); } rcu_read_unlock(); + /* Flush'em all */ + dedup_flush(&dedup); + xa_destroy(&dedup); if (new_leaks) { kmemleak_found_leaks = true; diff --git a/mm/list_lru.c b/mm/list_lru.c index 9bf7f524796b..36662d02ff96 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,6 +15,28 @@ #include "slab.h" #include "internal.h" +static inline void lock_list_lru(struct list_lru_one *l, bool irq, + unsigned long *irq_flags) +{ + if (irq_flags) + spin_lock_irqsave(&l->lock, *irq_flags); + else if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); +} + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off, + unsigned long *irq_flags) +{ + if (irq_flags) + spin_unlock_irqrestore(&l->lock, *irq_flags); + else if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} + #ifdef CONFIG_MEMCG static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -60,34 +82,23 @@ list_lru_from_memcg_idx(struct list_lru 
*lru, int nid, int idx) return &lru->node[nid].lru; } -static inline bool lock_list_lru(struct list_lru_one *l, bool irq) -{ - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); - if (unlikely(READ_ONCE(l->nr_items) == LONG_MIN)) { - if (irq) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); - return false; - } - return true; -} - static inline struct list_lru_one * -lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, - bool irq, bool skip_empty) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, bool irq, + unsigned long *irq_flags, bool skip_empty) { struct list_lru_one *l; rcu_read_lock(); again: - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - if (likely(l) && lock_list_lru(l, irq)) { - rcu_read_unlock(); - return l; + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(*memcg)); + if (likely(l)) { + lock_list_lru(l, irq, irq_flags); + if (likely(READ_ONCE(l->nr_items) != LONG_MIN)) { + rcu_read_unlock(); + return l; + } + unlock_list_lru(l, irq, irq_flags); } /* * Caller may simply bail out if raced with reparenting or @@ -97,18 +108,10 @@ again: rcu_read_unlock(); return NULL; } - VM_WARN_ON(!css_is_dying(&memcg->css)); - memcg = parent_mem_cgroup(memcg); + VM_WARN_ON(!css_is_dying(&(*memcg)->css)); + *memcg = parent_mem_cgroup(*memcg); goto again; } - -static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) -{ - if (irq_off) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); -} #else static void list_lru_register(struct list_lru *lru) { @@ -135,52 +138,112 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, - bool irq, bool skip_empty) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, bool irq, + unsigned long *irq_flags, bool skip_empty) { struct list_lru_one *l = 
&lru->node[nid].lru; - if (irq) - spin_lock_irq(&l->lock); - else - spin_lock(&l->lock); + lock_list_lru(l, irq, irq_flags); return l; } +#endif /* CONFIG_MEMCG */ -static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +struct list_lru_one *list_lru_lock(struct list_lru *lru, int nid, + struct mem_cgroup **memcg) { - if (irq_off) - spin_unlock_irq(&l->lock); - else - spin_unlock(&l->lock); + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/false, + /*irq_flags=*/NULL, /*skip_empty=*/false); } -#endif /* CONFIG_MEMCG */ -/* The caller must ensure the memcg lifetime. */ -bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) +void list_lru_unlock(struct list_lru_one *l) { - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; + unlock_list_lru(l, /*irq_off=*/false, /*irq_flags=*/NULL); +} - l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); - if (!l) - return false; +struct list_lru_one *list_lru_lock_irq(struct list_lru *lru, int nid, + struct mem_cgroup **memcg) +{ + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, + /*irq_flags=*/NULL, /*skip_empty=*/false); +} + +void list_lru_unlock_irq(struct list_lru_one *l) +{ + unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/NULL); +} + +struct list_lru_one *list_lru_lock_irqsave(struct list_lru *lru, int nid, + struct mem_cgroup **memcg, + unsigned long *flags) +{ + return lock_list_lru_of_memcg(lru, nid, memcg, /*irq=*/true, + /*irq_flags=*/flags, /*skip_empty=*/false); +} + +void list_lru_unlock_irqrestore(struct list_lru_one *l, unsigned long *flags) +{ + unlock_list_lru(l, /*irq_off=*/true, /*irq_flags=*/flags); +} + +bool __list_lru_add(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ if (list_empty(item)) { list_add_tail(item, &l->list); - /* Set shrinker bit if the first element was added */ + /* + * Set shrinker bit on the memcg that owns the 
locked + * sublist - lock_list_lru_of_memcg() may have walked up + * past a dying memcg, and the bit must be set there. + */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - unlock_list_lru(l, false); - atomic_long_inc(&nlru->nr_items); + atomic_long_inc(&lru->node[nid].nr_items); return true; } - unlock_list_lru(l, false); return false; } EXPORT_SYMBOL_GPL(list_lru_add); +bool __list_lru_del(struct list_lru *lru, struct list_lru_one *l, + struct list_head *item, int nid) +{ + if (!list_empty(item)) { + list_del_init(item); + l->nr_items--; + atomic_long_dec(&lru->node[nid].nr_items); + return true; + } + return false; +} + +/* The caller must ensure the memcg lifetime. */ +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ + struct list_lru_one *l; + bool ret; + + l = list_lru_lock(lru, nid, &memcg); + ret = __list_lru_add(lru, l, item, nid, memcg); + list_lru_unlock(l); + return ret; +} + +bool list_lru_add_irq(struct list_lru *lru, struct list_head *item, + int nid, struct mem_cgroup *memcg) +{ + struct list_lru_one *l; + bool ret; + + l = list_lru_lock_irq(lru, nid, &memcg); + ret = __list_lru_add(lru, l, item, nid, memcg); + list_lru_unlock_irq(l); + return ret; +} + bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { bool ret; @@ -202,20 +265,13 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); - if (!l) - return false; - if (!list_empty(item)) { - list_del_init(item); - l->nr_items--; - unlock_list_lru(l, false); - atomic_long_dec(&nlru->nr_items); - return true; - } - unlock_list_lru(l, false); - return false; + bool ret; + + l = list_lru_lock(lru, nid, &memcg); + ret = __list_lru_del(lru, l, item, nid); + list_lru_unlock(l); + return ret; } 
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) @@ -288,7 +344,8 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, unsigned long isolated = 0; restart: - l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); + l = lock_list_lru_of_memcg(lru, nid, &memcg, /*irq=*/irq_off, + /*irq_flags=*/NULL, /*skip_empty=*/true); if (!l) return isolated; list_for_each_safe(item, n, &l->list) { @@ -329,7 +386,7 @@ restart: BUG(); } } - unlock_list_lru(l, irq_off); + unlock_list_lru(l, irq_off, NULL); out: return isolated; } @@ -514,17 +571,14 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, return idx < 0 || xa_load(&lru->xa, idx); } -int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, - gfp_t gfp) +static int __memcg_list_lru_alloc(struct mem_cgroup *memcg, + struct list_lru *lru, gfp_t gfp) { unsigned long flags; struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); - if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) - return 0; - gfp &= GFP_RECLAIM_MASK; /* * Because the list_lru can be reparented to the parent cgroup's @@ -565,6 +619,38 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, return xas_error(&xas); } + +int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, + gfp_t gfp) +{ + if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) + return 0; + return __memcg_list_lru_alloc(memcg, lru, gfp); +} + +int folio_memcg_list_lru_alloc(struct folio *folio, struct list_lru *lru, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int res; + + if (!list_lru_memcg_aware(lru)) + return 0; + + /* Fast path when list_lru heads already exist */ + rcu_read_lock(); + memcg = folio_memcg(folio); + res = memcg_list_lru_allocated(memcg, lru); + rcu_read_unlock(); + if (likely(res)) + return 0; + + /* Allocation may block, pin the memcg */ + memcg = 
get_mem_cgroup_from_folio(folio); + res = __memcg_list_lru_alloc(memcg, lru, gfp); + mem_cgroup_put(memcg); + return res; +} #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { diff --git a/mm/madvise.c b/mm/madvise.c index 69708e953cf5..cd9bb077072c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1834,50 +1834,29 @@ static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) tlb_finish_mmu(madv_behavior->tlb); } -static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +/** + * check_input_range() - Check if the requested range is valid. + * @start: Start address of madvise-requested address range. + * @len_in: Length of madvise-requested address range. + * + * Returns: 0 if the input range is valid, otherwise an error code. + */ +static int check_input_range(unsigned long start, size_t len_in) { size_t len; - if (!madvise_behavior_valid(behavior)) - return false; - if (!PAGE_ALIGNED(start)) - return false; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return false; + return -EINVAL; if (start + len < start) - return false; - - return true; -} + return -EINVAL; -/* - * madvise_should_skip() - Return if the request is invalid or nothing. - * @start: Start address of madvise-requested address range. - * @len_in: Length of madvise-requested address range. - * @behavior: Requested madvise behavior. - * @err: Pointer to store an error code from the check. - * - * If the specified behaviour is invalid or nothing would occur, we skip the - * operation. This function returns true in the cases, otherwise false. In - * the former case we store an error on @err. 
- */ -static bool madvise_should_skip(unsigned long start, size_t len_in, - int behavior, int *err) -{ - if (!is_valid_madvise(start, len_in, behavior)) { - *err = -EINVAL; - return true; - } - if (start + PAGE_ALIGN(len_in) == start) { - *err = 0; - return true; - } - return false; + return 0; } static bool is_madvise_populate(struct madvise_behavior *madv_behavior) @@ -2013,8 +1992,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh .tlb = &tlb, }; - if (madvise_should_skip(start, len_in, behavior, &error)) + if (!madvise_behavior_valid(behavior)) + return -EINVAL; + + error = check_input_range(start, len_in); + if (error || !len_in) return error; + error = madvise_lock(&madv_behavior); if (error) return error; @@ -2056,7 +2040,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, size_t len_in = iter_iov_len(iter); int error; - if (madvise_should_skip(start, len_in, behavior, &error)) + error = check_input_range(start, len_in); + if (error || !len_in) ret = error; else ret = madvise_do_behavior(start, len_in, &madv_behavior); @@ -2131,6 +2116,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + if (!madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. 
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 433bba9dfe71..765069211567 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -5,7 +5,6 @@ #include <linux/mm_inline.h> #include <linux/pagewalk.h> #include <linux/backing-dev.h> -#include <linux/swap_cgroup.h> #include <linux/eventfd.h> #include <linux/poll.h> #include <linux/sort.h> @@ -14,6 +13,7 @@ #include "internal.h" #include "swap.h" +#include "swap_table.h" #include "memcontrol-v1.h" /* @@ -603,19 +603,26 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) local_irq_restore(flags); } +#ifdef CONFIG_SWAP /** - * memcg1_swapout - transfer a memsw charge to swap + * __memcg1_swapout - transfer a memsw charge to swap * @folio: folio whose memsw charge to transfer - * @entry: swap entry to move the charge to + * @ci: the locked swap cluster holding the swap entries + * + * Transfer the memsw charge of @folio to the swap entry stored in + * folio->swap. * - * Transfer the memsw charge of @folio to @entry. + * Context: folio must be isolated, unmapped, locked and is just about to + * be freed, and caller must disable IRQs and hold the swap cluster lock. 
*/ -void memcg1_swapout(struct folio *folio, swp_entry_t entry) +void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci) { struct mem_cgroup *memcg, *swap_memcg; struct obj_cgroup *objcg; unsigned int nr_entries; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); @@ -641,7 +648,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries, + mem_cgroup_private_id(swap_memcg)); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -656,8 +664,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) } /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is + * The caller must hold the swap cluster lock with IRQ off. It is * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ @@ -671,18 +678,24 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) obj_cgroup_put(objcg); } -/* - * memcg1_swapin - uncharge swap slot - * @entry: the first swap entry for which the pages are charged - * @nr_pages: number of pages which will be uncharged +/** + * memcg1_swapin - uncharge swap slot on swapin + * @folio: folio being swapped in * - * Call this function after successfully adding the charged page to swapcache. + * Call this function after successfully adding the charged + * folio to swapcache. * - * Note: This function assumes the page for which swap slot is being uncharged - * is order 0 page. + * Context: The folio has to be in swap cache and locked. 
*/ -void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +void memcg1_swapin(struct folio *folio) { + struct swap_cluster_info *ci; + unsigned long nr_pages; + unsigned short id; + + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -695,15 +708,22 @@ void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account()) { - /* - * The swap entry might not get freed for a long time, - * let's not wait for it. The page already received a - * memory+swap charge, drop the swap entry duplicate. - */ - mem_cgroup_uncharge_swap(entry, nr_pages); - } + if (!do_memsw_account()) + return; + + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. 
+ */ + nr_pages = folio_nr_pages(folio); + ci = swap_cluster_get_and_lock(folio); + id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap), + nr_pages); + swap_cluster_unlock(ci); + mem_cgroup_uncharge_swap(id, nr_pages); } +#endif void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1a4fd2504bcd..56cd4af08232 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,7 +54,6 @@ #include <linux/vmpressure.h> #include <linux/memremap.h> #include <linux/mm_inline.h> -#include <linux/swap_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> @@ -64,6 +63,7 @@ #include <linux/sched/isolation.h> #include <linux/kmemleak.h> #include "internal.h" +#include "swap_table.h" #include <net/sock.h> #include <net/ip.h> #include "slab.h" @@ -150,15 +150,15 @@ static void obj_cgroup_release(struct percpu_ref *ref) * However, it can be PAGE_SIZE or (x * PAGE_SIZE). * * The following sequence can lead to it: - * 1) CPU0: objcg == stock->cached_objcg + * 1) CPU0: objcg cached in one of stock->cached[i] * 2) CPU1: we do a small allocation (e.g. 92 bytes), * PAGE_SIZE bytes are charged * 3) CPU1: a process from another memcg is allocating something, * the stock if flushed, * objcg->nr_charged_bytes = PAGE_SIZE - 92 - * 5) CPU0: we do release this object, - * 92 bytes are added to stock->nr_bytes - * 6) CPU0: stock is flushed, + * 4) CPU0: we do release this object, + * 92 bytes are added to stock->nr_bytes[i] + * 5) CPU0: stock is flushed, * 92 bytes are added to objcg->nr_charged_bytes * * In the result, nr_charged_bytes == PAGE_SIZE. 
@@ -2018,24 +2018,49 @@ static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { .lock = INIT_LOCAL_TRYLOCK(lock), }; +/* + * NR_OBJ_STOCK is sized so the entire hot path of obj_stock_pcp + * (lock, accounting metadata, nr_bytes[] and cached[]) fits within a + * single 64-byte cache line on non-debug 64-bit builds. With 5 slots: + * lock(1) + index(1) + node_id(2) + slab stats(4) + nr_bytes(10) + * + pad(6) + cached(40) == 64 bytes. + * A CPU can thus consume/refill/account against five different objcgs + * (typically per-node variants of the same memcg) while incurring at + * most one cache miss on the stock. + */ +#define NR_OBJ_STOCK 5 struct obj_stock_pcp { local_trylock_t lock; - unsigned int nr_bytes; - struct obj_cgroup *cached_objcg; - struct pglist_data *cached_pgdat; - int nr_slab_reclaimable_b; - int nr_slab_unreclaimable_b; + int8_t index; + int16_t node_id; + int16_t nr_slab_reclaimable_b; + int16_t nr_slab_unreclaimable_b; +#if PAGE_SHIFT > 16 + /* + * On rare archs with 256KiB base page size (hexagon and powerpc 44x) + * keep nr_bytes to unsigned int as uint16_t cannot represent the full + * sub-page remainder. Such archs are not cacheline optimization target. 
+ */ + unsigned int nr_bytes[NR_OBJ_STOCK]; +#else + uint16_t nr_bytes[NR_OBJ_STOCK]; +#endif + struct obj_cgroup *cached[NR_OBJ_STOCK]; struct work_struct work; unsigned long flags; + uint8_t drain_idx; }; static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { .lock = INIT_LOCAL_TRYLOCK(lock), + .index = -1, + .node_id = NUMA_NO_NODE, }; static DEFINE_MUTEX(percpu_charge_mutex); +static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i); static void drain_obj_stock(struct obj_stock_pcp *stock); static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg); @@ -3155,54 +3180,73 @@ static void unlock_stock(struct obj_stock_pcp *stock) local_unlock(&obj_stock.lock); } -/* Call after __refill_obj_stock() to ensure stock->cached_objg == objcg */ +/* Call after __refill_obj_stock() so a slot for objcg exists in the stock */ static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) { - int *bytes; + int16_t *bytes; + int i; - if (!stock || READ_ONCE(stock->cached_objcg) != objcg) + /* + * Though at the moment MAX_NUMNODES <= 1024 in all archs but let's make + * sure it does not exceed S16_MAX otherwise we need to fix node_id type + * in struct obj_stock_pcp. + */ + BUILD_BUG_ON(MAX_NUMNODES >= S16_MAX); + + if (!stock) + goto direct; + + for (i = 0; i < NR_OBJ_STOCK; ++i) { + if (READ_ONCE(stock->cached[i]) == objcg) + break; + } + if (i == NR_OBJ_STOCK) goto direct; /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat changes. + * accumulating over a page of vmstat data or when the objcg slot or + * pgdat the stats belong to changes. 
*/ - if (stock->cached_pgdat != pgdat) { - /* Flush the existing cached vmstat data */ - struct pglist_data *oldpg = stock->cached_pgdat; + if (stock->index < 0) { + stock->index = i; + stock->node_id = pgdat->node_id; + } else if (stock->index != i || stock->node_id != pgdat->node_id) { + struct obj_cgroup *old = READ_ONCE(stock->cached[stock->index]); + struct pglist_data *oldpg = NODE_DATA(stock->node_id); if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } - stock->cached_pgdat = pgdat; + stock->index = i; + stock->node_id = pgdat->node_id; } bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b : &stock->nr_slab_unreclaimable_b; + /* - * Even for large object >= PAGE_SIZE, the vmstat data will still be - * cached locally at least once before pushing it out. + * Fold @nr into the cached value and decide whether to keep it cached + * or flush it directly. Cache the combined value when it fits in the + * int16_t storage and either the cache was empty (so even a value + * above PAGE_SIZE gets a chance to be canceled by a paired delta) or + * the combined value is within the PAGE_SIZE flush threshold. 
*/ - if (!*bytes) { + nr += *bytes; + if (abs(nr) <= S16_MAX && (!*bytes || abs(nr) <= PAGE_SIZE)) { *bytes = nr; nr = 0; } else { - *bytes += nr; - if (abs(*bytes) > PAGE_SIZE) { - nr = *bytes; - *bytes = 0; - } else { - nr = 0; - } + *bytes = 0; } direct: if (nr) @@ -3213,10 +3257,16 @@ static bool __consume_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, unsigned int nr_bytes) { - if (objcg == READ_ONCE(stock->cached_objcg) && - stock->nr_bytes >= nr_bytes) { - stock->nr_bytes -= nr_bytes; - return true; + int i; + + for (i = 0; i < NR_OBJ_STOCK; ++i) { + if (READ_ONCE(stock->cached[i]) != objcg) + continue; + if (stock->nr_bytes[i] >= nr_bytes) { + stock->nr_bytes[i] -= nr_bytes; + return true; + } + return false; } return false; @@ -3237,16 +3287,42 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) return ret; } -static void drain_obj_stock(struct obj_stock_pcp *stock) +/* Flush the cached slab stats (if any) back to their owning objcg/pgdat. 
*/ +static void drain_obj_stock_stats(struct obj_stock_pcp *stock) { - struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); + struct obj_cgroup *old; + struct pglist_data *oldpg; + + if (stock->index < 0) + return; + + old = READ_ONCE(stock->cached[stock->index]); + oldpg = NODE_DATA(stock->node_id); + + if (stock->nr_slab_reclaimable_b) { + mod_objcg_mlstate(old, oldpg, NR_SLAB_RECLAIMABLE_B, + stock->nr_slab_reclaimable_b); + stock->nr_slab_reclaimable_b = 0; + } + if (stock->nr_slab_unreclaimable_b) { + mod_objcg_mlstate(old, oldpg, NR_SLAB_UNRECLAIMABLE_B, + stock->nr_slab_unreclaimable_b); + stock->nr_slab_unreclaimable_b = 0; + } + stock->index = -1; + stock->node_id = NUMA_NO_NODE; +} + +static void drain_obj_stock_slot(struct obj_stock_pcp *stock, int i) +{ + struct obj_cgroup *old = READ_ONCE(stock->cached[i]); if (!old) return; - if (stock->nr_bytes) { - unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; - unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + if (stock->nr_bytes[i]) { + unsigned int nr_pages = stock->nr_bytes[i] >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes[i] & (PAGE_SIZE - 1); if (nr_pages) { struct mem_cgroup *memcg; @@ -3272,44 +3348,43 @@ static void drain_obj_stock(struct obj_stock_pcp *stock) * so it might be changed in the future. 
*/ atomic_add(nr_bytes, &old->nr_charged_bytes); - stock->nr_bytes = 0; + stock->nr_bytes[i] = 0; } - /* - * Flush the vmstat data in current stock - */ - if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { - if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, - NR_SLAB_RECLAIMABLE_B, - stock->nr_slab_reclaimable_b); - stock->nr_slab_reclaimable_b = 0; - } - if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, - NR_SLAB_UNRECLAIMABLE_B, - stock->nr_slab_unreclaimable_b); - stock->nr_slab_unreclaimable_b = 0; - } - stock->cached_pgdat = NULL; - } + /* Flush vmstat data when its owning slot is being drained. */ + if (stock->index == i) + drain_obj_stock_stats(stock); - WRITE_ONCE(stock->cached_objcg, NULL); + WRITE_ONCE(stock->cached[i], NULL); obj_cgroup_put(old); } +static void drain_obj_stock(struct obj_stock_pcp *stock) +{ + int i; + + for (i = 0; i < NR_OBJ_STOCK; ++i) + drain_obj_stock_slot(stock, i); +} + static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg) { - struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); + struct obj_cgroup *objcg; struct mem_cgroup *memcg; bool flush = false; + int i; rcu_read_lock(); - if (objcg) { + for (i = 0; i < NR_OBJ_STOCK; ++i) { + objcg = READ_ONCE(stock->cached[i]); + if (!objcg) + continue; memcg = obj_cgroup_memcg(objcg); - if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) { flush = true; + break; + } } rcu_read_unlock(); @@ -3322,6 +3397,8 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, bool allow_uncharge) { unsigned int nr_pages = 0; + unsigned int stock_nr_bytes; + int i, slot = -1, empty_slot = -1; if (!stock) { nr_pages = nr_bytes >> PAGE_SHIFT; @@ -3330,21 +3407,52 @@ static void __refill_obj_stock(struct obj_cgroup *objcg, goto out; } - if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - 
drain_obj_stock(stock); + for (i = 0; i < NR_OBJ_STOCK; ++i) { + struct obj_cgroup *cached = READ_ONCE(stock->cached[i]); + + if (!cached) { + if (empty_slot == -1) + empty_slot = i; + continue; + } + if (cached == objcg) { + slot = i; + break; + } + } + + if (slot == -1) { + slot = empty_slot; + if (slot == -1) { + slot = stock->drain_idx++; + if (stock->drain_idx == NR_OBJ_STOCK) + stock->drain_idx = 0; + drain_obj_stock_slot(stock, slot); + } obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + /* + * Keep the xchg result in the unsigned int local; storing + * it directly into stock->nr_bytes[slot] (uint16_t) would + * silently truncate values >= U16_MAX and bypass the flush + * guard below, leaking page-counter charges. + */ + stock_nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); + WRITE_ONCE(stock->cached[slot], objcg); allow_uncharge = true; /* Allow uncharge when objcg changes */ + } else { + stock_nr_bytes = stock->nr_bytes[slot]; } - stock->nr_bytes += nr_bytes; - if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { - nr_pages = stock->nr_bytes >> PAGE_SHIFT; - stock->nr_bytes &= (PAGE_SIZE - 1); + stock_nr_bytes += nr_bytes; + + if ((allow_uncharge && (stock_nr_bytes > PAGE_SIZE)) || + stock_nr_bytes > U16_MAX) { + nr_pages = stock_nr_bytes >> PAGE_SHIFT; + stock_nr_bytes &= (PAGE_SIZE - 1); } + stock->nr_bytes[slot] = stock_nr_bytes; out: if (nr_pages) @@ -4005,11 +4113,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg1_alloc_events(memcg)) goto fail; + pstatc_pcpu = parent ? parent->vmstats_percpu : NULL; for_each_possible_cpu(cpu) { - if (parent) - pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent_pcpu = parent ? 
pstatc_pcpu : NULL; + statc->parent_pcpu = pstatc_pcpu; statc->vmstats = memcg->vmstats; } @@ -4037,11 +4144,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); - INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); - memcg->deferred_split_queue.split_queue_len = 0; -#endif lru_gen_init_memcg(memcg); return memcg; fail: @@ -4192,11 +4294,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); memcg_offline_kmem(memcg); - reparent_deferred_split_queue(memcg); /* - * The reparenting of objcg must be after the reparenting of the - * list_lru and deferred_split_queue above, which ensures that they will - * not mistakenly get the parent list_lru and deferred_split_queue. + * The reparenting of objcg must be after the reparenting of + * the list_lru in memcg_offline_kmem(), which ensures that + * they will not mistakenly get the parent list_lru. */ memcg_reparent_objcgs(memcg); reparent_shrinker_deferred(memcg); @@ -5080,27 +5181,25 @@ out: /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. - * @folio: folio to charge. + * @folio: the folio to charge + * @id: memory cgroup id * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the folio is allocated * * This function charges a folio allocated for swapin. Please call this before * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. 
*/ -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, - gfp_t gfp, swp_entry_t entry) +int mem_cgroup_swapin_charge_folio(struct folio *folio, unsigned short id, + struct mm_struct *mm, gfp_t gfp) { struct mem_cgroup *memcg; - unsigned short id; int ret; if (mem_cgroup_disabled()) return 0; - id = lookup_swap_cgroup_id(entry); rcu_read_lock(); memcg = mem_cgroup_from_private_id(id); if (!memcg || !css_tryget_online(&memcg->css)) @@ -5474,15 +5573,15 @@ int __init mem_cgroup_init(void) /** * __mem_cgroup_try_charge_swap - try charging swap space for a folio * @folio: folio being added to swap - * @entry: swap entry to charge * - * Try to charge @folio's memcg for the swap space at @entry. + * Try to charge @folio's memcg for the swap space at folio->swap. * * Returns 0 on success, -ENOMEM on failure. */ -int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio) { unsigned int nr_pages = folio_nr_pages(folio); + struct swap_cluster_info *ci; struct page_counter *counter; struct mem_cgroup *memcg; struct obj_cgroup *objcg; @@ -5497,7 +5596,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - if (!entry.val) { + if (!folio_test_swapcache(folio)) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); rcu_read_unlock(); return 0; @@ -5516,22 +5615,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) } mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); + ci = swap_cluster_get_and_lock(folio); + __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages, + mem_cgroup_private_id(memcg)); + swap_cluster_unlock(ci); return 0; } /** * __mem_cgroup_uncharge_swap - uncharge swap space - * @entry: swap entry to uncharge + * @id: cgroup id to uncharge * @nr_pages: the amount of swap space to uncharge */ -void 
__mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages) { struct mem_cgroup *memcg; - unsigned short id; - id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_private_id(id); if (memcg) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d47aef256a32..51508a55c405 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -172,23 +172,11 @@ static int __page_handle_poison(struct page *page) { int ret; - /* - * zone_pcp_disable() can't be used here. It will - * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold - * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap - * optimization is enabled. This will break current lock dependency - * chain and leads to deadlock. - * Disabling pcp before dissolving the page was a deterministic - * approach because we made sure that those pages cannot end up in any - * PCP list. Draining PCP lists expels those pages to the buddy system, - * but nothing guarantees that those pages do not get back to a PCP - * queue if we need to refill those. - */ + zone_pcp_disable(page_zone(page)); ret = dissolve_free_hugetlb_folio(page_folio(page)); - if (!ret) { - drain_all_pages(page_zone(page)); + if (!ret) ret = take_page_off_buddy(page); - } + zone_pcp_enable(page_zone(page)); return ret; } @@ -459,7 +447,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, * Only do anything when FORCEKILL is set, otherwise just free the * list (this is used for clean pages which do not need killing) */ -static void kill_procs(struct list_head *to_kill, int forcekill, +static void kill_procs(struct list_head *to_kill, bool forcekill, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -1418,7 +1406,7 @@ try_again: * We raced with (possibly temporary) unhandlable * page, retry. 
*/ - if (pass++ < 3) { + if (pass++ < GET_PAGE_MAX_RETRY_NUM) { shake_page(p); goto try_again; } @@ -1582,7 +1570,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, { LIST_HEAD(tokill); bool unmap_success; - int forcekill; + bool forcekill; bool mlocked = folio_test_mlocked(folio); /* @@ -1703,7 +1691,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, unmap_mapping_range(mapping, start, size, 0); } - kill_procs(to_kill, flags & MF_MUST_KILL, pfn, flags); + kill_procs(to_kill, !!(flags & MF_MUST_KILL), pfn, flags); } /* @@ -2027,13 +2015,14 @@ out_unlock: * So some of prechecks for hwpoison (pinning, and testing/setting * PageHWPoison) should be done in single hugetlb_lock range. * Returns: - * 0 - not hugetlb, or recovered + * 0 - recovered + * -ENOENT - no hugetlb page * -EBUSY - not recovered * -EOPNOTSUPP - hwpoison_filter'ed * -EHWPOISON - folio or exact page already poisoned * -EFAULT - kill_accessing_process finds current->mm null */ -static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +static int try_memory_failure_hugetlb(unsigned long pfn, int flags) { int res, rv; struct page *p = pfn_to_page(pfn); @@ -2041,13 +2030,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb unsigned long page_flags; bool migratable_cleared = false; - *hugetlb = 1; retry: res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); switch (res) { case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */ - *hugetlb = 0; - return 0; + return -ENOENT; case MF_HUGETLB_RETRY: if (!(flags & MF_NO_RETRY)) { flags |= MF_NO_RETRY; @@ -2108,9 +2095,9 @@ retry: } #else -static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags) { - return 0; + return -ENOENT; } static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) @@ -2348,7 
+2335,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -2387,8 +2373,11 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); - if (hugetlb) + res = try_memory_failure_hugetlb(pfn, flags); + /* + * -ENOENT means the page we found is not hugetlb, so proceed with normal page handling + */ + if (res != -ENOENT) goto unlock_mutex; if (TestSetPageHWPoison(p)) { diff --git a/mm/memory.c b/mm/memory.c index 86a973119bd4..56be920c56d7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3837,8 +3837,8 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. * - * Called with mmap_lock locked and the old page referenced, but - * without the ptl held. + * Called with either the VMA lock or the mmap_lock held (see FAULT_FLAG_VMA_LOCK) + * and the old page referenced, but without the ptl held. * * High level logic flow: * @@ -4237,9 +4237,9 @@ static bool wp_can_reuse_anon_folio(struct folio *folio, * though the page will change only once the write actually happens. This * avoids a few races, and potentially makes it more efficient. * - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK) and pte both mapped and locked. We return with + * the same lock still held, but pte unmapped and unlocked. 
*/ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) @@ -4609,35 +4609,13 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -static struct folio *__alloc_swap_folio(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - softleaf_t entry; - - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); - if (!folio) - return NULL; - - entry = softleaf_from_pte(vmf->orig_pte); - if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - GFP_KERNEL, entry)) { - folio_put(folio); - return NULL; - } - - return folio; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Check if the PTEs within a range are contiguous swap entries - * and have consistent swapcache, zeromap. + * Check if the PTEs within a range are contiguous swap entries. */ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - softleaf_t entry; int idx; pte_t pte; @@ -4647,20 +4625,13 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = softleaf_from_pte(pte); - if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) - return false; - /* * swap_read_folio() can't handle the case a large folio is hybridly * from different backends. And they are likely corner cases. Similar * things might be added once zswap support large folios. 
*/ - if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) - return false; - if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; - return true; } @@ -4687,16 +4658,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, return orders; } -static struct folio *alloc_swap_folio(struct vm_fault *vmf) +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long orders; - struct folio *folio; unsigned long addr; softleaf_t entry; spinlock_t *ptl; pte_t *pte; - gfp_t gfp; int order; /* @@ -4704,7 +4673,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) - goto fallback; + return 0; /* * A large swapped out folio could be partially or fully in zswap. We @@ -4712,7 +4681,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * folio. */ if (!zswap_never_enabled()) - goto fallback; + return 0; entry = softleaf_from_pte(vmf->orig_pte); /* @@ -4726,12 +4695,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) vmf->address, orders); if (!orders) - goto fallback; + return 0; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) - goto fallback; + return 0; /* * For do_swap_page, find the highest order where the aligned range is @@ -4747,29 +4716,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) pte_unmap_unlock(pte, ptl); - /* Try allocating the highest of the remaining orders. 
*/ - gfp = vma_thp_gfp_mask(vma); - while (orders) { - addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr); - if (folio) { - if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, - gfp, entry)) - return folio; - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); - folio_put(folio); - } - count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); - order = next_order(&orders, order); - } - -fallback: - return __alloc_swap_folio(vmf); + return orders; } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static struct folio *alloc_swap_folio(struct vm_fault *vmf) +static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf) { - return __alloc_swap_folio(vmf); + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4785,12 +4737,12 @@ static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * - * We return with the mmap_lock locked or unlocked in the same cases - * as does filemap_fault(). + * When returning, the lock may have been released in the same cases + * as done by filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { @@ -4875,23 +4827,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (folio) swap_update_readahead(folio, vma, vmf->address); if (!folio) { - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { - folio = alloc_swap_folio(vmf); - if (folio) { - /* - * folio is charged, so swapin can only fail due - * to raced swapin and return NULL. 
- */ - swapcache = swapin_folio(entry, folio); - if (swapcache != folio) - folio_put(folio); - folio = swapcache; - } - } else { + /* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) + folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE, + thp_swapin_suitable_orders(vmf) | BIT(0), + vmf, NULL, 0); + else folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - } - if (!folio) { + if (IS_ERR_OR_NULL(folio)) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. @@ -4901,6 +4845,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; + folio = NULL; goto unlock; } @@ -5270,24 +5215,28 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); - if (folio) { - if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { - count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - folio_put(folio); - goto next; - } - folio_throttle_swaprate(folio, gfp); - /* - * When a folio is not zeroed during allocation - * (__GFP_ZERO not used) or user folios require special - * handling, folio_zero_user() is used to make sure - * that the page corresponding to the faulting address - * will be hot in the cache after zeroing. 
- */ - if (user_alloc_needs_zeroing()) - folio_zero_user(folio, vmf->address); - return folio; + if (!folio) + goto next; + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + folio_put(folio); + goto next; } + if (order > 1 && folio_memcg_alloc_deferred(folio)) { + folio_put(folio); + goto fallback; + } + folio_throttle_swaprate(folio, gfp); + /* + * When a folio is not zeroed during allocation + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. + */ + if (user_alloc_needs_zeroing()) + folio_zero_user(folio, vmf->address); + return folio; next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); @@ -5330,9 +5279,10 @@ static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_lock still held, but pte unmapped and unlocked. + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK), and pte unmapped and unlocked. + * We return with the lock still held, but pte unmapped and unlocked. + * If VM_FAULT_RETRY is returned, the lock may have been released. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { @@ -5440,9 +5390,10 @@ oom: } /* - * The mmap_lock must have been held on entry, and may have been - * released depending on flags and vma->vm_ops->fault() return value. - * See filemap_fault() and __lock_page_retry(). + * Either the VMA lock or the mmap_lock must have been held on entry + * (see FAULT_FLAG_VMA_LOCK) and may have been released depending on + * flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __folio_lock_or_retry(). 
*/ static vm_fault_t __do_fault(struct vm_fault *vmf) { @@ -5451,18 +5402,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) vm_fault_t ret; /* - * Preallocate pte before we take page_lock because this might lead to - * deadlocks for memcg reclaim which waits for pages under writeback: - * lock_page(A) - * SetPageWriteback(A) - * unlock_page(A) - * lock_page(B) - * lock_page(B) + * Preallocate pte before we take folio lock because this might lead to + * deadlocks for memcg reclaim which waits for folios under writeback: + * folio_lock(A) + * folio_set_writeback(A) + * folio_unlock(A) + * folio_lock(B) + * folio_lock(B) * pte_alloc_one * shrink_folio_list - * wait_on_page_writeback(A) - * SetPageWriteback(B) - * unlock_page(B) + * folio_wait_writeback(A) + * folio_set_writeback(B) + * folio_unlock(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { @@ -5480,7 +5431,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) if (unlikely(PageHWPoison(vmf->page))) { vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { - if (page_mapped(vmf->page)) + if (folio_mapped(folio)) unmap_mapping_folio(folio); /* Retry if a clean folio was removed from the cache. */ if (mapping_evict_folio(folio->mapping, folio)) @@ -6003,11 +5954,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) } /* - * We enter with non-exclusive mmap_lock (to exclude vma changes, - * but allow concurrent faults). - * The mmap_lock may have been released depending on flags and our + * We enter with either the VMA lock or the mmap_lock held (see + * FAULT_FLAG_VMA_LOCK). + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). - * If mmap_lock is released, vma may become invalid (for example + * If the lock is released, vma may become invalid (for example * by other thread calling munmap()). 
*/ static vm_fault_t do_fault(struct vm_fault *vmf) @@ -6374,10 +6325,11 @@ static void fix_spurious_fault(struct vm_fault *vmf, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow - * concurrent faults). + * On entry, we hold either the VMA lock or the mmap_lock + * (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our return value. + * The mmap_lock or VMA lock may have been released depending on flags + * and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) @@ -6458,8 +6410,8 @@ unlock: /* * On entry, we hold either the VMA lock or the mmap_lock - * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in - * the result, the mmap_lock is not held on exit. See filemap_fault() + * (see FAULT_FLAG_VMA_LOCK). If VM_FAULT_RETRY is set in + * the result, the lock is not held on exit. See filemap_fault() * and __folio_lock_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, @@ -6691,9 +6643,9 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, /* * By the time we get here, we already hold either the VMA lock or the - * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). + * mmap_lock (see FAULT_FLAG_VMA_LOCK). * - * The mmap_lock may have been released depending on flags and our + * The lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). 
*/ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 40c7915dabe0..7ac19fab2263 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -576,6 +576,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used + * @pgmap: device page map or %NULL if not ZONE_DEVICE * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make @@ -583,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone, * calling offline_pages(). */ void __remove_pages(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; @@ -598,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - sparse_remove_section(pfn, cur_nr_pages, altmap); + sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap); } } @@ -1402,6 +1403,12 @@ bool mhp_supports_memmap_on_memory(void) } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); +static void altmap_free(struct vmem_altmap *altmap) +{ + WARN_ONCE(altmap->alloc, "Altmap not fully unmapped"); + kfree(altmap); +} + static void remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); @@ -1416,22 +1423,17 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) struct vmem_altmap *altmap = NULL; struct memory_block *mem; - mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); + mem = 
memory_block_get(phys_to_block_id(cur_start)); if (WARN_ON_ONCE(!mem)) continue; altmap = mem->altmap; mem->altmap = NULL; - /* drop the ref. we got via find_memory_block() */ - put_device(&mem->dev); + memory_block_put(mem); remove_memory_block_devices(cur_start, memblock_size); - - arch_remove_memory(cur_start, memblock_size, altmap); - - /* Verify that all vmemmap pages have actually been freed. */ - WARN(altmap->alloc, "Altmap not fully unmapped"); - kfree(altmap); + arch_remove_memory(cur_start, memblock_size, altmap, NULL); + altmap_free(altmap); } } @@ -1462,7 +1464,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, /* call arch's memory hotadd */ ret = arch_add_memory(nid, cur_start, memblock_size, &params); if (ret < 0) { - kfree(params.altmap); + altmap_free(params.altmap); goto out; } @@ -1470,8 +1472,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { - arch_remove_memory(cur_start, memblock_size, NULL); - kfree(params.altmap); + arch_remove_memory(cur_start, memblock_size, params.altmap, NULL); + altmap_free(params.altmap); goto out; } } @@ -1556,7 +1558,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { - arch_remove_memory(start, size, params.altmap); + arch_remove_memory(start, size, params.altmap, NULL); goto error; } } @@ -2268,7 +2270,7 @@ static int try_remove_memory(u64 start, u64 size) * No altmaps present, do the removal directly */ remove_memory_block_devices(start, size); - arch_remove_memory(start, size, NULL); + arch_remove_memory(start, size, NULL, NULL); } else { /* all memblocks in the range have altmaps */ remove_memory_blocks_and_altmaps(start, size); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index
4e4421b22b59..36699fabd3c2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2865,7 +2865,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: - return !!nodes_equal(a->nodes, b->nodes); + return nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: diff --git a/mm/memremap.c b/mm/memremap.c index 053842d45cb1..81766d822400 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { __remove_pages(PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), NULL); + PHYS_PFN(range_len(range)), NULL, pgmap); } else { arch_remove_memory(range->start, range_len(range), - pgmap_altmap(pgmap)); + pgmap_altmap(pgmap), pgmap); kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); diff --git a/mm/migrate.c b/mm/migrate.c index 8a64291ab5b4..d9b23909d716 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1135,26 +1135,24 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, * This is safe because nobody is using it except us. 
*/ enum { - PAGE_WAS_MAPPED = BIT(0), - PAGE_WAS_MLOCKED = BIT(1), - PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, + FOLIO_WAS_MAPPED = BIT(0), + FOLIO_WAS_MLOCKED = BIT(1), + FOLIO_OLD_STATES = FOLIO_WAS_MAPPED | FOLIO_WAS_MLOCKED, }; static void __migrate_folio_record(struct folio *dst, - int old_page_state, - struct anon_vma *anon_vma) + int old_folio_state, struct anon_vma *anon_vma) { - dst->private = (void *)anon_vma + old_page_state; + dst->private = (void *)anon_vma + old_folio_state; } static void __migrate_folio_extract(struct folio *dst, - int *old_page_state, - struct anon_vma **anon_vmap) + int *old_folio_state, struct anon_vma **anon_vmap) { unsigned long private = (unsigned long)dst->private; - *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); - *old_page_state = private & PAGE_OLD_STATES; + *anon_vmap = (struct anon_vma *)(private & ~FOLIO_OLD_STATES); + *old_folio_state = private & FOLIO_OLD_STATES; dst->private = NULL; } @@ -1209,7 +1207,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, { struct folio *dst; int rc = -EAGAIN; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool locked = false; bool dst_locked = false; @@ -1253,12 +1251,12 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, } locked = true; if (folio_test_mlocked(src)) - old_page_state |= PAGE_WAS_MLOCKED; + old_folio_state |= FOLIO_WAS_MLOCKED; if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it - * necessary to wait for PageWriteback. In the async case, + * necessary to wait for writeback. 
In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ @@ -1302,7 +1300,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, dst_locked = true; if (unlikely(page_has_movable_ops(&src->page))) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1328,11 +1326,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); - old_page_state |= PAGE_WAS_MAPPED; + old_folio_state |= FOLIO_WAS_MAPPED; } if (!folio_mapped(src)) { - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return 0; } @@ -1344,7 +1342,7 @@ out: if (rc == -EAGAIN) ret = NULL; - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); @@ -1358,13 +1356,13 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct list_head *ret) { int rc; - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; bool src_deferred_split = false; bool src_partially_mapped = false; struct list_head *prev; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); @@ -1404,10 +1402,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, * isolated from the unevictable LRU: but this case is the easiest. 
*/ folio_add_lru(dst); - if (old_page_state & PAGE_WAS_MLOCKED) + if (old_folio_state & FOLIO_WAS_MLOCKED) lru_add_drain(); - if (old_page_state & PAGE_WAS_MAPPED) + if (old_folio_state & FOLIO_WAS_MAPPED) remove_migration_ptes(src, dst, 0); out_unlock_both: @@ -1439,11 +1437,11 @@ out: */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); - __migrate_folio_record(dst, old_page_state, anon_vma); + __migrate_folio_record(dst, old_folio_state, anon_vma); return rc; } - migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + migrate_folio_undo_src(src, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); @@ -1777,11 +1775,11 @@ static void migrate_folios_undo(struct list_head *src_folios, dst = list_first_entry(dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, src_folios, lru) { - int old_page_state = 0; + int old_folio_state = 0; struct anon_vma *anon_vma = NULL; - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + __migrate_folio_extract(dst, &old_folio_state, &anon_vma); + migrate_folio_undo_src(folio, old_folio_state & FOLIO_WAS_MAPPED, anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); @@ -2557,24 +2555,29 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) } task = find_get_task_by_vpid(pid); - if (!task) { + if (!task) return ERR_PTR(-ESRCH); - } + if (down_read_killable(&task->signal->exec_update_lock)) { + mm = ERR_PTR(-EINTR); + goto out; + } /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. 
*/ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { mm = ERR_PTR(-EPERM); - goto out; + goto unlock; } mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) - goto out; + goto unlock; *mem_nodes = cpuset_mems_allowed(task); mm = get_task_mm(task); +unlock: + up_read(&task->signal->exec_update_lock); out: put_task_struct(task); if (!mm) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 19cd14b34114..554754eb26ff 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -801,8 +801,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, bool flush = false; unsigned long i; - VM_WARN_ON_FOLIO(!folio, folio); - VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp)); + VM_WARN_ON_ONCE(!folio); if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER)) return -EINVAL; @@ -859,11 +858,9 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, if (userfaultfd_missing(vma)) goto unlock_abort; - if (!pmd_none(*pmdp)) { - if (!is_huge_zero_pmd(*pmdp)) - goto unlock_abort; + if (is_huge_zero_pmd(*pmdp)) flush = true; - } else if (!pmd_none(*pmdp)) + else if (!pmd_none(*pmdp)) goto unlock_abort; add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); diff --git a/mm/mm_init.c b/mm/mm_init.c index dc5d93125cdd..65623f95bec3 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -674,6 +674,20 @@ static inline void fixup_hashdist(void) static inline void fixup_hashdist(void) {} #endif /* CONFIG_NUMA */ +#ifdef CONFIG_ZONE_DEVICE +static __meminit void pageblock_migratetype_init_range(unsigned long pfn, + unsigned long nr_pages, int migratetype) +{ + const unsigned long end = pfn + nr_pages; + + for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) { + init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + } +} +#endif + /* * Initialize a reserved page unconditionally, finding its zone first. 
*/ @@ -1012,21 +1026,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, page->zone_device_data = NULL; /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (pageblock_aligned(pfn)) { - init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); - cond_resched(); - } - - /* * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released * directly to the driver page allocator which will set the page count * to 1 when allocating the page. @@ -1056,10 +1055,17 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * of how the sparse_vmemmap internals handle compound pages in the lack * of an altmap. See vmemmap_populate_compound_pages(). */ -static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, +static inline unsigned long compound_nr_pages(unsigned long pfn, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - if (!vmemmap_can_optimize(altmap, pgmap)) + /* + * If DAX memory is hot-plugged into an unoccupied subsection + * of an early section, the unoptimized boot memmap is reused. + * See section_activate(). 
+ */ + if (early_section(__pfn_to_section(pfn)) || + !vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); @@ -1122,13 +1128,18 @@ void __ref memmap_init_zone_device(struct zone *zone, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) + cond_resched(); + if (pfns_per_compound == 1) continue; memmap_init_compound(page, pfn, zone_idx, nid, pgmap, - compound_nr_pages(altmap, pgmap)); + compound_nr_pages(pfn, altmap, pgmap)); } + pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE); + pr_debug("%s initialised %lu pages in %ums\n", __func__, nr_pages, jiffies_to_msecs(jiffies - start)); } @@ -1362,19 +1373,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void pgdat_init_split_queue(struct pglist_data *pgdat) -{ - struct deferred_split *ds_queue = &pgdat->deferred_split_queue; - - spin_lock_init(&ds_queue->split_queue_lock); - INIT_LIST_HEAD(&ds_queue->split_queue); - ds_queue->split_queue_len = 0; -} -#else -static void pgdat_init_split_queue(struct pglist_data *pgdat) {} -#endif - #ifdef CONFIG_COMPACTION static void pgdat_init_kcompactd(struct pglist_data *pgdat) { @@ -1390,8 +1388,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_resize_init(pgdat); pgdat_kswapd_lock_init(pgdat); - - pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); init_waitqueue_head(&pgdat->kswapd_wait); @@ -1418,11 +1414,14 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, static void __meminit zone_init_free_lists(struct zone *zone) { - unsigned int order, t; - for_each_migratetype_order(order, t) { - INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + struct list_head *list; + unsigned int order; + + for_each_free_list(list, 
zone, order) + INIT_LIST_HEAD(list); + + for (order = 0; order < NR_PAGE_ORDERS; order++) zone->free_area[order].nr_free = 0; - } #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); diff --git a/mm/mmap.c b/mm/mmap.c index 5754d1c36462..2311ae7c2ff4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -504,7 +504,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * A locked or stack area makes no sense to be droppable. * diff --git a/mm/mseal.c b/mm/mseal.c index e2093ae3d25c..9781647483d1 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -8,6 +8,7 @@ */ #include <linux/mempolicy.h> +#include <linux/minmax.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> @@ -65,8 +66,8 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { - const unsigned long curr_start = MAX(vma->vm_start, start); - const unsigned long curr_end = MIN(vma->vm_end, end); + const unsigned long curr_start = max(vma->vm_start, start); + const unsigned long curr_end = min(vma->vm_end, end); if (!vma_test(vma, VMA_SEALED_BIT)) { vma_flags_t vma_flags = vma->flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d49c254174da..f7db8f049bd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,6 +90,9 @@ typedef int __bitwise fpi_t; /* Free the page without taking locks. Rely on trylock only. */ #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) +/* free_pages_prepare() has already been called for page(s) being freed. 
*/ +#define FPI_PREPARED ((__force fpi_t)BIT(3)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -282,6 +285,14 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +/* + * When page allocations stall for longer than a threshold, + * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning + * will be printed during this duration for the entire system. + */ +#define ALLOC_STALL_WARN_MSECS (10 * 1000UL) +static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES; + static bool page_contains_unaccepted(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order, int alloc_flags); @@ -353,7 +364,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); + BUILD_BUG_ON(__MIGRATE_TYPE_END > PAGEBLOCK_MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); @@ -423,10 +434,10 @@ bool get_pfnblock_bit(const struct page *page, unsigned long pfn, * Use get_pfnblock_migratetype() if caller already has both @page and @pfn * to save a call to page_to_pfn(). 
*/ -__always_inline enum migratetype +enum migratetype get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - unsigned long mask = MIGRATETYPE_AND_ISO_MASK; + unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK; unsigned long flags; flags = __get_pfnblock_flags_mask(page, pfn, mask); @@ -435,7 +446,7 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn) if (flags & BIT(PB_migrate_isolate)) return MIGRATE_ISOLATE; #endif - return flags & MIGRATETYPE_MASK; + return flags & PAGEBLOCK_MIGRATETYPE_MASK; } /** @@ -523,11 +534,11 @@ static void set_pageblock_migratetype(struct page *page, } VM_WARN_ONCE(get_pageblock_isolate(page), "Use clear_pageblock_isolate() to unisolate pageblock"); - /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ + /* PAGEBLOCK_ISO_MASK clears PB_migrate_isolate if it is set */ #endif __set_pfnblock_flags_mask(page, page_to_pfn(page), (unsigned long)migratetype, - MIGRATETYPE_AND_ISO_MASK); + PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); } void __meminit init_pageblock_migratetype(struct page *page, @@ -553,7 +564,7 @@ void __meminit init_pageblock_migratetype(struct page *page, flags |= BIT(PB_migrate_isolate); #endif __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, - MIGRATETYPE_AND_ISO_MASK); + PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); } #ifdef CONFIG_DEBUG_VM @@ -639,19 +650,12 @@ out: static inline unsigned int order_to_pindex(int migratetype, int order) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + bool movable = migratetype == MIGRATE_MOVABLE; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool movable; - if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(!is_pmd_order(order)); - - movable = migratetype == MIGRATE_MOVABLE; - - return NR_LOWORDER_PCP_LISTS + movable; + if (order > PAGE_ALLOC_COSTLY_ORDER) + return NR_LOWORDER_PCP_LISTS + movable; } -#else - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -#endif return (MIGRATE_PCPTYPES * order) + 
migratetype; } @@ -660,12 +664,10 @@ static inline int pindex_to_order(unsigned int pindex) { int order = pindex / MIGRATE_PCPTYPES; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pindex >= NR_LOWORDER_PCP_LISTS) - order = HPAGE_PMD_ORDER; -#else - VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -#endif + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pindex >= NR_LOWORDER_PCP_LISTS) + order = HPAGE_PMD_ORDER; + } return order; } @@ -1211,14 +1213,18 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +static void clear_highpages_kasan_tagged(struct page *page, int numpages) { - int i; - /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) - clear_highpage_kasan_tagged(page + i); + if (!IS_ENABLED(CONFIG_HIGHMEM)) { + clear_pages(kasan_reset_tag(page_address(page)), numpages); + } else { + int i; + + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); + } kasan_enable_current(); } @@ -1303,8 +1309,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #endif /* CONFIG_MEM_ALLOC_PROFILING */ -__always_inline bool __free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) +static __always_inline bool __free_pages_prepare(struct page *page, + unsigned int order, fpi_t fpi_flags) { int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page); @@ -1312,6 +1318,9 @@ __always_inline bool __free_pages_prepare(struct page *page, bool compound = PageCompound(page); struct folio *folio = page_folio(page); + if (fpi_flags & FPI_PREPARED) + return true; + VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); @@ -1423,7 +1432,7 @@ __always_inline bool __free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << 
order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1451,7 +1460,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { - unsigned long flags; unsigned int order; struct page *page; @@ -1464,7 +1472,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (count > 0) { struct list_head *list; @@ -1496,8 +1504,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } - - spin_unlock_irqrestore(&zone->lock, flags); } /* Split a multi-block free page into its individual pageblocks. */ @@ -1848,7 +1854,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } /* If memory is still not initialized, initialize it now. */ if (init) - kernel_init_pages(page, 1 << order); + clear_highpages_kasan_tagged(page, 1 << order); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); @@ -2125,15 +2131,15 @@ static bool __move_freepages_block_isolate(struct zone *zone, } move: - /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ + /* Use PAGEBLOCK_MIGRATETYPE_MASK to get non-isolate migratetype */ if (isolate) { from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), - MIGRATETYPE_MASK); + PAGEBLOCK_MIGRATETYPE_MASK); to_mt = MIGRATE_ISOLATE; } else { from_mt = MIGRATE_ISOLATE; to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), - MIGRATETYPE_MASK); + PAGEBLOCK_MIGRATETYPE_MASK); } __move_freepages_block(zone, start_pfn, from_mt, to_mt); @@ -2244,25 +2250,29 @@ static bool should_try_claim_block(unsigned int order, int start_mt) * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
*/ -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claimable) +enum fallback_result +find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool claimable, int *mt_out) { int i; if (claimable && !should_try_claim_block(order, migratetype)) - return -2; + return FALLBACK_NOCLAIM; if (area->nr_free == 0) - return -1; + return FALLBACK_EMPTY; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { int fallback_mt = fallbacks[migratetype][i]; - if (!free_area_empty(area, fallback_mt)) - return fallback_mt; + if (!free_area_empty(area, fallback_mt)) { + if (mt_out) + *mt_out = fallback_mt; + return FALLBACK_FOUND; + } } - return -1; + return FALLBACK_EMPTY; } /* @@ -2372,16 +2382,16 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, */ for (current_order = MAX_PAGE_ORDER; current_order >= min_order; --current_order) { + enum fallback_result result; + area = &(zone->free_area[current_order]); - fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, true); + result = find_suitable_fallback(area, current_order, + start_migratetype, true, &fallback_mt); - /* No block in that order */ - if (fallback_mt == -1) + if (result == FALLBACK_EMPTY) continue; - /* Advanced into orders too low to claim, abort */ - if (fallback_mt == -2) + if (result == FALLBACK_NOCLAIM) break; page = get_page_from_free_area(area, fallback_mt); @@ -2411,10 +2421,12 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype) int fallback_mt; for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { + enum fallback_result result; + area = &(zone->free_area[current_order]); - fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false); - if (fallback_mt == -1) + result = find_suitable_fallback(area, current_order, start_migratetype, + false, &fallback_mt); + if (result == FALLBACK_EMPTY) continue; page = get_page_from_free_area(area, 
fallback_mt); @@ -3424,7 +3436,7 @@ static void reserve_highatomic_pageblock(struct page *page, int order, struct zone *zone) { int mt; - unsigned long max_managed, flags; + unsigned long max_managed; /* * The number reserved as: minimum is 1 pageblock, maximum is @@ -3438,29 +3450,26 @@ static void reserve_highatomic_pageblock(struct page *page, int order, if (zone->nr_reserved_highatomic >= max_managed) return; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); /* Recheck the nr_reserved_highatomic limit under the lock */ if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; + return; /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (!migratetype_is_mergeable(mt)) - goto out_unlock; + return; if (order < pageblock_order) { if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; + return; zone->nr_reserved_highatomic += pageblock_nr_pages; } else { change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); zone->nr_reserved_highatomic += 1 << order; } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); } /* @@ -3476,7 +3485,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, bool force) { struct zonelist *zonelist = ac->zonelist; - unsigned long flags; struct zoneref *z; struct zone *zone; struct page *page; @@ -3493,7 +3501,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, pageblock_nr_pages) continue; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); unsigned long size; @@ -3540,12 +3548,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * so this should not fail on zone boundaries. 
*/ WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); + if (ret > 0) return ret; - } } - spin_unlock_irqrestore(&zone->lock, flags); } return false; @@ -4156,7 +4161,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, psi_memstall_leave(&pflags); delayacct_compact_end(); - if (*compact_result == COMPACT_SKIPPED) + if (*compact_result == COMPACT_SKIPPED || + *compact_result == COMPACT_DEFERRED) return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -4193,7 +4199,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, +should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, + int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4215,7 +4222,8 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * migration targets. Continue if reclaim can help. 
*/ if (compact_result == COMPACT_SKIPPED) { - ret = compaction_zonelist_suitable(ac, order, alloc_flags); + ret = compaction_zonelist_suitable(ac, order, alloc_flags, + gfp_mask); goto out; } @@ -4268,7 +4276,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, +should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, + int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) @@ -4678,6 +4687,40 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask, + unsigned int order, unsigned long alloc_start_time) +{ + static DEFINE_SPINLOCK(alloc_stall_lock); + unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time); + + if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS)) + return; + if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies))) + return; + if (gfp_mask & __GFP_NOWARN) + return; + + if (!spin_trylock(&alloc_stall_lock)) + return; + + /* Check again, this time under the lock */ + if (time_is_after_jiffies(alloc_stall_warn_jiffies)) { + spin_unlock(&alloc_stall_lock); + return; + } + + WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS)); + spin_unlock(&alloc_stall_lock); + + pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl", + current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); + cpuset_print_current_mems_allowed(); + pr_cont("\n"); + dump_stack(); + warn_alloc_show_mem(gfp_mask, nodemask); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4698,6 +4741,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; bool compact_first = 
false; bool can_retry_reserves = true; + unsigned long alloc_start_time = jiffies; if (unlikely(nofail)) { /* @@ -4806,13 +4850,27 @@ retry: } /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) + if (!can_direct_reclaim) { + /* + * Reclaim/compaction cannot run, so defrag_mode's strategy + * of enforcing ALLOC_NOFRAGMENT cannot be fulfilled. Allow + * fallbacks rather than failing the allocation outright. + */ + if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT) && + (gfp_mask & __GFP_KSWAPD_RECLAIM)) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; + /* If allocation has taken excessively long, warn about it */ + check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time); + /* Try direct reclaim and then allocating */ if (!compact_first) { page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, @@ -4886,9 +4944,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && can_compact && - should_compact_retry(ac, order, alloc_flags, - compact_result, &compact_priority, - &compaction_retries)) + should_compact_retry(gfp_mask, ac, order, alloc_flags, + compact_result, &compact_priority, + &compaction_retries)) goto retry; /* Reclaim/compaction failed to prevent the fallback */ @@ -5044,7 +5102,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct per_cpu_pages *pcp; struct list_head *pcp_list; struct alloc_context ac; - gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0, nr_account = 0; @@ -5085,10 +5142,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. 
*/ gfp &= gfp_allowed_mask; - alloc_gfp = gfp; - if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags)) goto out; - gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. */ z = ac.preferred_zoneref; @@ -5180,6 +5235,34 @@ failed: EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* + * free_pages_bulk - Free an array of order-0 pages + * @page_array: Array of pages to free + * @nr_pages: The number of pages in the array + * + * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous + * run are released with a single __free_contig_range() call. + * + * This assumes page_array is sorted in ascending PFN order. Without that, + * the function still frees all pages, but contiguous runs may not be + * detected and the freeing pattern can degrade to freeing one page at a + * time. + * + * Context: Sleepable process context only; calls cond_resched() + */ +void free_pages_bulk(struct page **page_array, unsigned long nr_pages) +{ + while (nr_pages) { + unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages); + + __free_contig_range(page_to_pfn(*page_array), nr_contig); + + nr_pages -= nr_contig; + page_array += nr_contig; + cond_resched(); + } +} + +/* * This is the 'heart' of the zoned buddy allocator. */ struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, @@ -6758,6 +6841,105 @@ void __init page_alloc_sysctl_init(void) register_sysctl_init("vm", page_alloc_sysctl_table); } +static void free_prepared_contig_range(struct page *page, + unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + while (nr_pages) { + unsigned int order; + + /* We are limited by the largest buddy order. */ + order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER; + /* Don't exceed the number of pages to free. 
*/ + order = min_t(unsigned int, order, ilog2(nr_pages)); + order = min_t(unsigned int, order, MAX_PAGE_ORDER); + + /* + * Free the chunk as a single block. Our caller has already + * called free_pages_prepare() for each order-0 page. + */ + __free_frozen_pages(page, order, FPI_PREPARED); + + pfn += 1UL << order; + page += 1UL << order; + nr_pages -= 1UL << order; + } +} + +static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages, + bool is_frozen) +{ + struct page *page, *start = NULL; + unsigned long nr_start = 0; + unsigned long start_sec; + unsigned long i; + + for (i = 0; i < nr_pages; i++) { + bool can_free = true; + + /* + * Contiguous PFNs might not have contiguous "struct pages" + * in some kernel configs: page++ across a section boundary + * is undefined. Use pfn_to_page() for each PFN. + */ + page = pfn_to_page(pfn + i); + + VM_WARN_ON_ONCE(PageHead(page)); + VM_WARN_ON_ONCE(PageTail(page)); + + if (!is_frozen) + can_free = put_page_testzero(page); + + if (can_free) + can_free = free_pages_prepare(page, 0); + + if (!can_free) { + if (start) { + free_prepared_contig_range(start, i - nr_start); + start = NULL; + } + continue; + } + + if (start && memdesc_section(page->flags) != start_sec) { + free_prepared_contig_range(start, i - nr_start); + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } else if (!start) { + start = page; + nr_start = i; + start_sec = memdesc_section(page->flags); + } + } + + if (start) + free_prepared_contig_range(start, nr_pages - nr_start); +} + +/** + * __free_contig_range - Free contiguous range of order-0 pages. + * @pfn: Page frame number of the first page in the range. + * @nr_pages: Number of pages to free. + * + * For each order-0 struct page in the physically contiguous range, put a + * reference. Free any page who's reference count falls to zero. 
The + * implementation is functionally equivalent to, but significantly faster than + * calling __free_page() for each struct page in a loop. + * + * Memory allocated with alloc_pages(order>=1) then subsequently split to + * order-0 with split_page() is an example of appropriate contiguous pages that + * can be freed with this API. + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. + */ +void __free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false); +} + #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) @@ -6895,8 +7077,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) { - for (; nr_pages--; pfn++) - free_frozen_pages(pfn_to_page(pfn), 0); + __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true); } /** @@ -7304,8 +7485,7 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) return; - for (; nr_pages--; pfn++) - __free_page(pfn_to_page(pfn)); + __free_contig_range(pfn, nr_pages); } EXPORT_SYMBOL(free_contig_range); #endif /* CONFIG_CONTIG_ALLOC */ @@ -7363,7 +7543,7 @@ void zone_pcp_reset(struct zone *zone) unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long already_offline = 0, flags; + unsigned long already_offline = 0; unsigned long pfn = start_pfn; struct page *page; struct zone *zone; @@ -7371,7 +7551,7 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); while (pfn < end_pfn) { page = 
pfn_to_page(pfn); /* @@ -7401,7 +7581,6 @@ unsigned long __offline_isolated_pages(unsigned long start_pfn, del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } - spin_unlock_irqrestore(&zone->lock, flags); return end_pfn - start_pfn - already_offline; } @@ -7473,11 +7652,9 @@ bool take_page_off_buddy(struct page *page) { struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - bool ret = false; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); @@ -7492,14 +7669,12 @@ bool take_page_off_buddy(struct page *page) break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - ret = true; - break; + return true; } if (page_count(page_head) > 0) break; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } /* @@ -7508,23 +7683,19 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long flags; - bool ret = false; - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if (put_page_testzero(page)) { unsigned long pfn = page_to_pfn(page); int migratetype = get_pfnblock_migratetype(page, pfn); ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); - if (TestClearPageHWPoison(page)) { - ret = true; - } + if (TestClearPageHWPoison(page)) + return true; } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + return false; } #endif @@ -7774,8 +7945,8 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned * @order: allocation order size * * Allocates pages of a given order from the given node. 
This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * call from any context where RCU is watching (from atomic, NMI, and also + * reentrant allocator -> tracepoint -> alloc_pages_nolock_noprof). * Allocation is best effort and to be expected to fail easily so nobody should * rely on the success. Failures are not reported via warn_alloc(). * See always fail conditions below. diff --git a/mm/page_io.c b/mm/page_io.c index a59b73f8bdd9..60977c970cdf 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,6 +26,7 @@ #include <linux/delayacct.h> #include <linux/zswap.h> #include "swap.h" +#include "swap_table.h" static void __end_swap_bio_write(struct bio *bio) { @@ -204,15 +205,20 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci = swap_cluster_get_and_lock(folio); for (i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); - set_bit(swp_offset(entry), sis->zeromap); + __swap_table_set_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); count_vm_events(SWPOUT_ZERO, nr_pages); if (objcg) { @@ -223,14 +229,19 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); + struct swap_cluster_info *ci; swp_entry_t entry; unsigned int i; + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + + ci = swap_cluster_get_and_lock(folio); for 
(i = 0; i < folio_nr_pages(folio); i++) { entry = page_swap_entry(folio_page(folio, i)); - clear_bit(swp_offset(entry), sis->zeromap); + __swap_table_clear_zero(ci, swp_cluster_offset(entry)); } + swap_cluster_unlock(ci); } /* @@ -255,10 +266,9 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) } /* - * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. - * The bits in zeromap are protected by the locked swapcache folio - * and atomic updates are used to protect against read-modify-write - * corruption due to other zero swap entries seeing concurrent updates. + * Use the swap table zero mark to avoid doing IO for zero-filled + * pages. The zero mark is protected by the cluster lock, which is + * acquired internally by swap_zeromap_folio_set/clear. */ if (is_folio_zero_filled(folio)) { swap_zeromap_folio_set(folio); @@ -326,8 +336,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct swap_iocb { struct kiocb iocb; - struct bio_vec bvec[SWAP_CLUSTER_MAX]; - int pages; + struct bio_vec bvecs[SWAP_CLUSTER_MAX]; + int nr_bvecs; int len; }; static mempool_t *sio_pool; @@ -348,7 +358,7 @@ int sio_pool_init(void) static void sio_write_complete(struct kiocb *iocb, long ret) { struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); - struct page *page = sio->bvec[0].bv_page; + struct page *page = sio->bvecs[0].bv_page; int p; if (ret != sio->len) { @@ -362,15 +372,15 @@ static void sio_write_complete(struct kiocb *iocb, long ret) */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); - for (p = 0; p < sio->pages; p++) { - page = sio->bvec[p].bv_page; + for (p = 0; p < sio->nr_bvecs; p++) { + page = sio->bvecs[p].bv_page; set_page_dirty(page); ClearPageReclaim(page); } } - for (p = 0; p < sio->pages; p++) - end_page_writeback(sio->bvec[p].bv_page); + for (p = 0; p < sio->nr_bvecs; p++) + end_page_writeback(sio->bvecs[p].bv_page); 
mempool_free(sio, sio_pool); } @@ -397,13 +407,13 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) init_sync_kiocb(&sio->iocb, swap_file); sio->iocb.ki_complete = sio_write_complete; sio->iocb.ki_pos = pos; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) { swap_write_unplug(sio); sio = NULL; } @@ -477,7 +487,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -489,8 +499,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) int p; if (ret == sio->len) { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = bvec_folio(&sio->bvec[p]); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = bvec_folio(&sio->bvecs[p]); count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); @@ -499,8 +509,8 @@ static void sio_read_complete(struct kiocb *iocb, long ret) } count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { - for (p = 0; p < sio->pages; p++) { - struct folio *folio = bvec_folio(&sio->bvec[p]); + for (p = 0; p < sio->nr_bvecs; p++) { + struct folio *folio = bvec_folio(&sio->bvecs[p]); folio_unlock(folio); } @@ -509,19 +519,52 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +/* + * Return the count of contiguous swap entries that share the 
same + * zeromap status as the starting entry. If is_zerop is not NULL, + * it will return the zeromap status of the starting entry. + * + * Context: Caller must ensure the cluster containing the entries + * that will be checked won't be freed. + */ +static int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zerop) +{ + int i; + bool is_zero; + unsigned int ci_start = swp_cluster_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + + VM_WARN_ON_ONCE(ci_start + max_nr > SWAPFILE_CLUSTER); + + rcu_read_lock(); + is_zero = __swap_table_test_zero(ci, ci_start); + for (i = 1; i < max_nr; i++) + if (is_zero != __swap_table_test_zero(ci, ci_start + i)) + break; + rcu_read_unlock(); + if (is_zerop) + *is_zerop = is_zero; + + return i; +} + static bool swap_read_folio_zeromap(struct folio *folio) { int nr_pages = folio_nr_pages(folio); struct obj_cgroup *objcg; bool is_zeromap; + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). + * Folio lock stabilizes the cluster and map, so the check is safe. 
*/ if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, - &is_zeromap) != nr_pages)) + &is_zeromap) != nr_pages)) return true; if (!is_zeromap) @@ -559,13 +602,13 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) init_sync_kiocb(&sio->iocb, sis->swap_file); sio->iocb.ki_pos = pos; sio->iocb.ki_complete = sio_read_complete; - sio->pages = 0; + sio->nr_bvecs = 0; sio->len = 0; } - bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0); sio->len += folio_size(folio); - sio->pages += 1; - if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { + sio->nr_bvecs += 1; + if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) { swap_read_unplug(sio); sio = NULL; } @@ -666,7 +709,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c48ff5c00244..7a9d631945a3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -167,48 +167,40 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, { struct zone *zone = page_zone(page); struct page *unmovable; - unsigned long flags; unsigned long check_unmovable_start, check_unmovable_end; if (PageUnaccepted(page)) accept_page(page); - spin_lock_irqsave(&zone->lock, flags); - - /* - * We assume the caller intended to SET migrate type to isolate. - * If it is already set, then someone else must have raced and - * set it before us. - */ - if (is_migrate_isolate_page(page)) { - spin_unlock_irqrestore(&zone->lock, flags); - return -EBUSY; - } - - /* - * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 
- * We just check MOVABLE pages. - * - * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock - * to avoid redundant checks. - */ - check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), - end_pfn); - - unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, - mode); - if (!unmovable) { - if (!pageblock_isolate_and_move_free_pages(zone, page)) { - spin_unlock_irqrestore(&zone->lock, flags); + scoped_guard(spinlock_irqsave, &zone->lock) { + /* + * We assume the caller intended to SET migrate type to + * isolate. If it is already set, then someone else must have + * raced and set it before us. + */ + if (is_migrate_isolate_page(page)) return -EBUSY; + + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by + * itself. We just check MOVABLE pages. + * + * Pass the intersection of [start_pfn, end_pfn) and the page's + * pageblock to avoid redundant checks. + */ + check_unmovable_start = max(page_to_pfn(page), start_pfn); + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), + end_pfn); + + unmovable = has_unmovable_pages(check_unmovable_start, + check_unmovable_end, mode); + if (!unmovable) { + if (!pageblock_isolate_and_move_free_pages(zone, page)) + return -EBUSY; + zone->nr_isolate_pageblock++; + return 0; } - zone->nr_isolate_pageblock++; - spin_unlock_irqrestore(&zone->lock, flags); - return 0; } - - spin_unlock_irqrestore(&zone->lock, flags); if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) { /* * printk() with zone->lock held will likely trigger a @@ -223,15 +215,14 @@ static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode, static void unset_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); + guard(spinlock_irqsave)(&zone->lock); if 
(!is_migrate_isolate_page(page)) - goto out; + return; /* * Because freepage with more than pageblock_order on isolated @@ -279,8 +270,6 @@ static void unset_migratetype_isolate(struct page *page) __putback_isolated_page(page, order, get_pageblock_migratetype(page)); } zone->nr_isolate_pageblock--; -out: - spin_unlock_irqrestore(&zone->lock, flags); } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c index 8178e0be557f..2dddcb6510aa 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -573,7 +573,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], - &page->flags); + &page->flags.f); ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4d52fdb3056..2ccbabfb2cc1 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -41,7 +41,7 @@ again: if (!pvmw->pte) return false; - ptent = ptep_get(pvmw->pte); + ptent = ptep_get_lockless(pvmw->pte); if (pte_none(ptent)) { return false; @@ -183,6 +183,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) struct mm_struct *mm = vma->vm_mm; unsigned long end; spinlock_t *ptl; + pte_t pteval; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -310,7 +311,11 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(ptep_get(pvmw->pte))); + if (!pvmw->ptl) + pteval = ptep_get_lockless(pvmw->pte); + else + pteval = ptep_get(pvmw->pte); + } while (pte_none(pteval)); if (!pvmw->ptl) { spin_lock(ptl); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 4b3d6ec43703..8cbe039bf847 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -77,13 +77,13 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ + int nr_pages; /* # of pages served by this chunk */ + int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of 
empty populated pages */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif - int nr_pages; /* # of pages served by this chunk */ - int nr_populated; /* # of populated pages */ - int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; diff --git a/mm/readahead.c b/mm/readahead.c index 7b05082c89ea..38ce16e3fcbd 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -146,6 +146,17 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); +/** + * read_pages() - Start IO for a contiguous range of allocated folios in the + * page cache. + * @rac: Readahead control. + * + * When read_pages() returns, it is guaranteed that all of the folios will have + * been processed or removed so that ``readahead_count(rac) == 0``. However, + * that does not imply that ``readahead_index(rac)`` will be updated to point + * to the end of the originally requested range because, for example, the + * filesystem may expand the range upwards. 
+ */ static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; @@ -270,7 +281,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ read_pages(ractl); ractl->_index += min_nrpages; - i = ractl->_index + ractl->_nr_pages - index; + i = ractl->_index - index; continue; } @@ -286,7 +297,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, break; read_pages(ractl); ractl->_index += min_nrpages; - i = ractl->_index + ractl->_nr_pages - index; + i = ractl->_index - index; continue; } if (i == mark) @@ -324,11 +335,16 @@ static void do_page_cache_ra(struct readahead_control *ractl, return; end_index = (isize - 1) >> PAGE_SHIFT; + if (end_index > ractl->_max_index) + end_index = ractl->_max_index; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ - if (nr_to_read > end_index - index) + if (nr_to_read > end_index - index) { nr_to_read = end_index - index + 1; + /* We've reached the end, so don't set a readahead marker. 
*/ + lookahead_size = 0; + } filemap_invalidate_lock_shared(mapping); page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); @@ -471,8 +487,8 @@ void page_cache_ra_order(struct readahead_control *ractl, pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); - pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; - pgoff_t mark = index + ra->size - ra->async_size; + pgoff_t limit; + pgoff_t mark; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); @@ -484,7 +500,15 @@ void page_cache_ra_order(struct readahead_control *ractl, goto fallback; } - limit = min(limit, index + ra->size - 1); + limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + limit = min(limit, ractl->_max_index); + if (limit > index + ra->size - 1) { + limit = index + ra->size - 1; + mark = index + ra->size - ra->async_size; + } else { + /* We've reached the end, so don't set a readahead marker. */ + mark = ULONG_MAX; + } new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); diff --git a/mm/rmap.c b/mm/rmap.c index 99e1b3dc390b..1c77d5dc06e9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -571,7 +571,7 @@ void __init anon_vma_init(void) * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as - * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * long as we observe folio_mapped() [ hence all those folio_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it @@ -1999,7 +1999,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. 
zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_unmap() may return before page_mapped() has become false, + * try_to_unmap() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) @@ -2428,7 +2428,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), - * try_to_migrate() may return before page_mapped() has become false, + * try_to_migrate() may return before folio_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) @@ -2929,7 +2929,7 @@ static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() - * because that depends on page_mapped(); but not all its usages + * because that depends on folio_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ diff --git a/mm/shmem.c b/mm/shmem.c index 7b1ea9fb598f..b51f83c970bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, vm_fault_t *fault_type); + struct vm_fault *vmf, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -1789,30 +1789,6 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, return folio; } -/* - * Make sure huge_gfp is always more limited than limit_gfp. - * Some of the flags set permissions, while others set limitations. 
- */ -static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) -{ - gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; - gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; - gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; - gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); - - /* Allow allocations only from the originally specified zones. */ - result |= zoneflags; - - /* - * Minimize the result gfp by taking the union with the deny flags, - * and the intersection of the allow flags. - */ - result |= (limit_gfp & denyflags); - result |= (huge_gfp & limit_gfp) & allowflags; - - return result; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { @@ -2039,68 +2015,32 @@ unlock: } static struct folio *shmem_swap_alloc_folio(struct inode *inode, - struct vm_area_struct *vma, pgoff_t index, + struct vm_fault *vmf, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { + pgoff_t ilx; + struct folio *folio; + struct mempolicy *mpol; struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *new, *swapcache; - int nr_pages = 1 << order; - gfp_t alloc_gfp = gfp; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (WARN_ON_ONCE(order)) - return ERR_PTR(-EINVAL); - } else if (order) { - /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. - */ - if ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(entry, nr_pages) != nr_pages) - goto fallback; - alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); - } -retry: - new = shmem_alloc_folio(alloc_gfp, order, info, index); - if (!new) { - new = ERR_PTR(-ENOMEM); - goto fallback; - } + if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) || + !zswap_never_enabled()) + order = 0; - if (mem_cgroup_swapin_charge_folio(new, vma ? 
vma->vm_mm : NULL, - alloc_gfp, entry)) { - folio_put(new); - new = ERR_PTR(-ENOMEM); - goto fallback; - } +again: + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = swapin_sync(entry, gfp, BIT(order), vmf, mpol, ilx); + mpol_cond_put(mpol); - swapcache = swapin_folio(entry, new); - if (swapcache != new) { - folio_put(new); - if (!swapcache) { - /* - * The new folio is charged already, swapin can - * only fail due to another raced swapin. - */ - new = ERR_PTR(-EEXIST); - goto fallback; - } + if (!IS_ERR(folio)) + return folio; + + if (order) { + order = 0; + goto again; } - return swapcache; -fallback: - /* Order 0 swapin failed, nothing to fallback to, abort */ - if (!order) - return new; - entry.val += index - round_down(index, nr_pages); - alloc_gfp = gfp; - nr_pages = 1; - order = 0; - goto retry; + + return folio; } /* @@ -2139,7 +2079,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); - gfp = limit_gfp_mask(huge_gfp, gfp); + gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp); } #endif @@ -2287,11 +2227,12 @@ unlock: */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; - struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm = vmf ? 
vmf->vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap; softleaf_t index_entry; @@ -2332,20 +2273,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ - folio = shmem_swap_alloc_folio(inode, vma, index, - index_entry, order, gfp); - if (IS_ERR(folio)) { - error = PTR_ERR(folio); - folio = NULL; - goto failed; - } + folio = shmem_swap_alloc_folio(inode, vmf, index, + swap, order, gfp); } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { + } + if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) + error = PTR_ERR(folio); + else error = -ENOMEM; - goto failed; - } + folio = NULL; + goto failed; } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -2493,7 +2433,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, vmf, fault_type); if (error == -EEXIST) goto repeat; @@ -2546,7 +2486,7 @@ repeat: gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); + huge_gfp = thp_shmem_limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { @@ -3100,10 +3040,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - - /* Don't consider 'deny' for emergencies and 'force' for testing */ - if (sbinfo->huge) - mapping_set_large_folios(inode->i_mapping); + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: @@ -5510,24 +5447,74 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); +enum huge_mode { + 
HUGE_SHMEM_ENABLED_ALWAYS = 0, + HUGE_SHMEM_ENABLED_INHERIT, + HUGE_SHMEM_ENABLED_WITHIN_SIZE, + HUGE_SHMEM_ENABLED_ADVISE, + HUGE_SHMEM_ENABLED_NEVER, +}; + +static const char * const huge_mode_strings[] = { + [HUGE_SHMEM_ENABLED_ALWAYS] = "always", + [HUGE_SHMEM_ENABLED_INHERIT] = "inherit", + [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = "within_size", + [HUGE_SHMEM_ENABLED_ADVISE] = "advise", + [HUGE_SHMEM_ENABLED_NEVER] = "never", +}; + +static unsigned long * const huge_mode_orders[] = { + [HUGE_SHMEM_ENABLED_ALWAYS] = &huge_shmem_orders_always, + [HUGE_SHMEM_ENABLED_INHERIT] = &huge_shmem_orders_inherit, + [HUGE_SHMEM_ENABLED_WITHIN_SIZE] = &huge_shmem_orders_within_size, + [HUGE_SHMEM_ENABLED_ADVISE] = &huge_shmem_orders_madvise, +}; + static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; - const char *output; - - if (test_bit(order, &huge_shmem_orders_always)) - output = "[always] inherit within_size advise never"; - else if (test_bit(order, &huge_shmem_orders_inherit)) - output = "always [inherit] within_size advise never"; - else if (test_bit(order, &huge_shmem_orders_within_size)) - output = "always inherit [within_size] advise never"; - else if (test_bit(order, &huge_shmem_orders_madvise)) - output = "always inherit within_size [advise] never"; - else - output = "always inherit within_size advise [never]"; + int active = HUGE_SHMEM_ENABLED_NEVER; + int len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(huge_mode_orders); i++) { + if (test_bit(order, huge_mode_orders[i])) { + active = i; + break; + } + } + + for (i = 0; i < ARRAY_SIZE(huge_mode_strings); i++) { + if (i == active) + len += sysfs_emit_at(buf, len, "[%s] ", + huge_mode_strings[i]); + else + len += sysfs_emit_at(buf, len, "%s ", + huge_mode_strings[i]); + } + + /* Replace trailing space with newline */ + buf[len - 1] = '\n'; + + return len; +} + +static bool set_shmem_enabled_mode(int order, enum huge_mode mode) +{ 
+ bool changed = false; + enum huge_mode idx; + + spin_lock(&huge_shmem_orders_lock); + for (idx = 0; idx < ARRAY_SIZE(huge_mode_orders); idx++) { + if (idx == mode) + changed |= !__test_and_set_bit(order, huge_mode_orders[idx]); + else + changed |= __test_and_clear_bit(order, huge_mode_orders[idx]); + } + spin_unlock(&huge_shmem_orders_lock); - return sysfs_emit(buf, "%s\n", output); + return changed; } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, @@ -5535,58 +5522,31 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; - ssize_t ret = count; - - if (sysfs_streq(buf, "always")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_madvise); - clear_bit(order, &huge_shmem_orders_within_size); - set_bit(order, &huge_shmem_orders_always); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "inherit")) { - /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) - return -EINVAL; + int mode; - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_madvise); - clear_bit(order, &huge_shmem_orders_within_size); - set_bit(order, &huge_shmem_orders_inherit); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "within_size")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_madvise); - set_bit(order, &huge_shmem_orders_within_size); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "advise")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_within_size); - 
set_bit(order, &huge_shmem_orders_madvise); - spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "never")) { - spin_lock(&huge_shmem_orders_lock); - clear_bit(order, &huge_shmem_orders_always); - clear_bit(order, &huge_shmem_orders_inherit); - clear_bit(order, &huge_shmem_orders_within_size); - clear_bit(order, &huge_shmem_orders_madvise); - spin_unlock(&huge_shmem_orders_lock); - } else { - ret = -EINVAL; - } + mode = sysfs_match_string(huge_mode_strings, buf); + if (mode < 0) + return mode; - if (ret > 0) { - int err = start_stop_khugepaged(); + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (mode == HUGE_SHMEM_ENABLED_INHERIT && + shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) + return -EINVAL; + if (set_shmem_enabled_mode(order, mode)) { + int err = start_stop_khugepaged(); if (err) - ret = err; + return err; + } else { + /* + * Recalculate watermarks even when the mode hasn't changed + * to preserve the legacy behavior, as this is always called + * inside start_stop_khugepaged(). 
+ */ + set_recommended_min_free_kbytes(); } - return ret; + + return count; } struct kobj_attribute thpsize_shmem_enabled_attr = diff --git a/mm/shrinker.c b/mm/shrinker.c index 76b3f750cf65..7082d01c8c9d 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -197,12 +197,13 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { + struct shrinker_info_unit *unit; + + unit = info->unit[shrinker_id_to_index(shrinker_id)]; /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id_to_offset(shrinker_id), unit->map); @@ -215,29 +216,26 @@ static DEFINE_IDR(shrinker_idr); static int shrinker_memcg_alloc(struct shrinker *shrinker) { - int id, ret = -ENOMEM; + int id; if (mem_cgroup_disabled()) return -ENOSYS; if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB)) return -ENOSYS; - mutex_lock(&shrinker_mutex); + guard(mutex)(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) - goto unlock; + return id; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); - goto unlock; + return -ENOMEM; } } shrinker->id = id; - ret = 0; -unlock: - mutex_unlock(&shrinker_mutex); - return ret; + return 0; } static void shrinker_memcg_remove(struct shrinker *shrinker) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 6eadb9d116e4..99e2be39671b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap) { - void *ptr; - if (altmap) return 
altmap_alloc_block_buf(size, altmap); - ptr = sparse_buffer_alloc(size); - if (!ptr) - ptr = vmemmap_alloc_block(size, node); - return ptr; + return vmemmap_alloc_block(size, node); } static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) @@ -151,7 +146,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node, start, end - 1); } -pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, +static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, unsigned long ptpfn, unsigned long flags) { @@ -195,7 +190,7 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) return p; } -pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +static pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { @@ -208,7 +203,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } -pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) +static pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { @@ -221,7 +216,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) return pud; } -p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +static p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { @@ -234,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) return p4d; } -pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { @@ -391,12 +386,17 @@ int __meminit 
vmemmap_populate_hvo(unsigned long addr, unsigned long end, void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) { + WARN_ON_ONCE(!pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL)); } int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { - return 0; + if (!pmd_leaf(pmdp_get(pmd))) + return 0; + vmemmap_verify((pte_t *)pmd, node, addr, next); + + return 1; } int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, @@ -652,26 +652,61 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } +static int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + const unsigned int order = pgmap ? pgmap->vmemmap_shift : 0; + const unsigned long pages_per_compound = 1UL << order; + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION)); + VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION); + + if (!vmemmap_can_optimize(altmap, pgmap)) + return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE); + + if (order < PFN_SECTION_SHIFT) { + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound)); + return VMEMMAP_RESERVE_NR * nr_pages / pages_per_compound; + } + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)); + + if (IS_ALIGNED(pfn, pages_per_compound)) + return VMEMMAP_RESERVE_NR; + + return 0; +} + static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap, + pgmap); + + memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); + + return page; } static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap 
*altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); + memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap)); vmemmap_free(start, end, altmap); } + static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long pfn = page_to_pfn(memmap); + memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION, + NULL, NULL)); vmemmap_free(start, end, NULL); } @@ -737,7 +772,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) * usage map, but still need to free the vmemmap range. */ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); @@ -774,14 +809,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . 
*/ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); - depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); + if (!section_is_early) + depopulate_section_memmap(pfn, nr_pages, altmap, pgmap); + else if (memmap) free_map_bootmem(memmap); - } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -823,10 +854,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); return ERR_PTR(-ENOMEM); } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } @@ -885,13 +915,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, } void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) + struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; - section_deactivate(pfn, nr_pages, altmap); + section_deactivate(pfn, nr_pages, altmap, pgmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/sparse.c b/mm/sparse.c index effdac6b0ab1..16ac6df3c89f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -201,13 +201,11 @@ static void __init memblocks_present(void) int i, nid; #ifdef CONFIG_SPARSEMEM_EXTREME - if (unlikely(!mem_section)) { - unsigned long size, align; + unsigned long size, align; - size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc_or_panic(size, align); - } + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc_or_panic(size, align); #endif 
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) @@ -241,12 +239,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn, struct dev_pagemap *pgmap) { unsigned long size = section_map_size(); - struct page *map = sparse_buffer_alloc(size); + struct page *map; phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - if (map) - return map; - map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -256,55 +251,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn, } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -static void *sparsemap_buf __meminitdata; -static void *sparsemap_buf_end __meminitdata; - -static inline void __meminit sparse_buffer_free(unsigned long size) -{ - WARN_ON(!sparsemap_buf || size == 0); - memblock_free(sparsemap_buf, size); -} - -static void __init sparse_buffer_init(unsigned long size, int nid) -{ - phys_addr_t addr = __pa(MAX_DMA_ADDRESS); - WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
*/ - /* - * Pre-allocated buffer is mainly used by __populate_section_memmap - * and we want it to be properly aligned to the section size - this is - * especially the case for VMEMMAP which maps memmap to PMDs - */ - sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); - sparsemap_buf_end = sparsemap_buf + size; -} - -static void __init sparse_buffer_fini(void) -{ - unsigned long size = sparsemap_buf_end - sparsemap_buf; - - if (sparsemap_buf && size > 0) - sparse_buffer_free(size); - sparsemap_buf = NULL; -} - -void * __meminit sparse_buffer_alloc(unsigned long size) -{ - void *ptr = NULL; - - if (sparsemap_buf) { - ptr = (void *) roundup((unsigned long)sparsemap_buf, size); - if (ptr + size > sparsemap_buf_end) - ptr = NULL; - else { - /* Free redundant aligned space */ - if ((unsigned long)(ptr - sparsemap_buf) > 0) - sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); - sparsemap_buf = ptr + size; - } - } - return ptr; -} - void __weak __meminit vmemmap_populate_print_last(void) { } @@ -362,8 +308,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, goto failed; } - sparse_buffer_init(map_count * section_map_size(), nid); - sparse_vmemmap_init_nid_early(nid); for_each_present_section_nr(pnum_begin, pnum) { @@ -381,7 +325,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, __func__, nid); pnum_begin = pnum; sparse_usage_fini(); - sparse_buffer_fini(); goto failed; } memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), @@ -390,7 +333,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, } } sparse_usage_fini(); - sparse_buffer_fini(); return; failed: /* diff --git a/mm/swap.c b/mm/swap.c index 5cc44f0de987..588f50d8f1a8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -160,13 +160,41 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; + struct folio_batch 
free_fbatch; + bool is_lru_add = (move_fn == lru_add); + + /* + * If we're adding to the LRU, preemptively filter dead folios. Use + * this dedicated folio batch for temp storage and deferred cleanup. + */ + if (is_lru_add) + folio_batch_init(&free_fbatch); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ - if (move_fn != lru_add && !folio_test_clear_lru(folio)) + if (!is_lru_add && !folio_test_clear_lru(folio)) + continue; + + /* + * Filter dead folios by moving them from the add batch to the temp + * batch for freeing after this loop. + * + * We're bypassing normal cleanup. Clear flags that are not + * applicable to dead folios. + * + * Since the folio may be part of a huge page, unqueue from + * deferred split list to avoid a dangling list entry. + */ + if (is_lru_add && folio_ref_freeze(folio, 1)) { + __folio_clear_active(folio); + __folio_clear_unevictable(folio); + folio_unqueue_deferred_split(folio); + fbatch->folios[i] = NULL; + folio_batch_add(&free_fbatch, folio); continue; + } folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,6 +204,13 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) lruvec_unlock_irqrestore(lruvec, flags); + + /* Cleanup filtered dead folios. */ + if (is_lru_add) { + mem_cgroup_uncharge_folios(&free_fbatch); + free_unref_folios(&free_fbatch); + } + folios_put(fbatch); } @@ -509,10 +544,20 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_folio_seq() */ + /* + * For refaulted workingset folios, set PG_active so they + * can be added to active generations. + * For prefaulted file folios, folio_mark_accessed() sets + * PG_referenced so lru_gen_folio_seq() places them into + * the second oldest generation. 
+ */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && - lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) - folio_set_active(folio); + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) { + if (folio_test_workingset(folio)) + folio_set_active(folio); + else if (!folio_test_referenced(folio)) + folio_mark_accessed(folio); + } folio_batch_add_and_move(folio, lru_add); } @@ -964,6 +1009,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; + /* Folio batch entry may have been preemptively removed during drain. */ + if (!folio) + continue; + if (is_huge_zero_folio(folio)) continue; diff --git a/mm/swap.h b/mm/swap.h index a77016f2423b..77d2d14eda42 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,11 +3,29 @@ #define _MM_SWAP_H #include <linux/atomic.h> /* for atomic_long_t */ +#include <linux/mm.h> /* for PAGE_SHIFT */ struct mempolicy; struct swap_iocb; +struct swap_memcg_table; extern int page_cluster; +#if defined(MAX_POSSIBLE_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) +#elif defined(MAX_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else +#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif + +/* Swap table marker, 0x1 means shadow, 0x2 means PFN (SWP_TB_PFN_MARK) */ +#define SWAP_CACHE_PFN_MARK_BITS 2 +/* At least 2 bits are needed to distinguish SWP_TB_COUNT_MAX, 1 and 0 */ +#define SWAP_COUNT_MIN_BITS 2 +/* If there are enough bits besides PFN and marker, store zero flag inline */ +#define SWAP_TABLE_HAS_ZEROFLAG ((BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - \ + SWAP_CACHE_PFN_BITS) > SWAP_COUNT_MIN_BITS) + #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) @@ -38,6 +56,12 @@ struct swap_cluster_info { u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ unsigned int *extend_table; /* 
For large swap count, protected by ci->lock */ +#ifdef CONFIG_MEMCG + struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */ +#endif +#if !SWAP_TABLE_HAS_ZEROFLAG + unsigned long *zero_bitmap; +#endif struct list_head list; }; @@ -280,9 +304,9 @@ bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); void swap_cache_del_folio(struct folio *folio); -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, - struct mempolicy *mpol, pgoff_t ilx, - bool *alloced); +struct folio *swap_cache_alloc_folio(swp_entry_t target_entry, gfp_t gfp_mask, + unsigned long orders, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx); /* Below helpers require the caller to lock and pass in the swap cluster. */ void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry); @@ -300,7 +324,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); +struct folio *swapin_sync(swp_entry_t entry, gfp_t flag, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); @@ -309,49 +334,6 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return __swap_entry_to_info(folio->swap)->flags; } -/* - * Return the count of contiguous swap entries that share the same - * zeromap status as the starting entry. If is_zeromap is not NULL, - * it will return the zeromap status of the starting entry. 
- */ -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *is_zeromap) -{ - struct swap_info_struct *sis = __swap_entry_to_info(entry); - unsigned long start = swp_offset(entry); - unsigned long end = start + max_nr; - bool first_bit; - - first_bit = test_bit(start, sis->zeromap); - if (is_zeromap) - *is_zeromap = first_bit; - - if (max_nr <= 1) - return max_nr; - if (first_bit) - return find_next_zero_bit(sis->zeromap, end, start) - start; - else - return find_next_bit(sis->zeromap, end, start) - start; -} - -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - int i; - - /* - * While allocating a large folio and doing mTHP swapin, we need to - * ensure all entries are not cached, otherwise, the mTHP folio will - * be in conflict with the folio in swap cache. - */ - for (i = 0; i < max_nr; i++) { - if (swap_cache_has_folio(entry)) - return i; - entry.val++; - } - - return i; -} - #else /* CONFIG_SWAP */ struct swap_iocb; static inline struct swap_cluster_info *swap_cluster_lock( @@ -433,7 +415,9 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +static inline struct folio *swapin_sync( + swp_entry_t entry, gfp_t flag, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } @@ -488,15 +472,5 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return 0; } -static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, - bool *has_zeromap) -{ - return 0; -} - -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - return 0; -} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c deleted file mode 100644 index de779fed8c21..000000000000 --- a/mm/swap_cgroup.c +++ /dev/null @@ -1,172 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/swap_cgroup.h> -#include 
<linux/vmalloc.h> -#include <linux/mm.h> - -#include <linux/swapops.h> /* depends on mm.h include */ - -static DEFINE_MUTEX(swap_cgroup_mutex); - -/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ -#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) -#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) -#define ID_MASK (BIT(ID_SHIFT) - 1) -struct swap_cgroup { - atomic_t ids; -}; - -struct swap_cgroup_ctrl { - struct swap_cgroup *map; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, - pgoff_t offset) -{ - unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; - unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); - - BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); - BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - - return (old_ids >> shift) & ID_MASK; -} - -static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, - pgoff_t offset, - unsigned short new_id) -{ - unsigned short old_id; - struct swap_cgroup *sc = &map[offset / ID_PER_SC]; - unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; - unsigned int new_ids, old_ids = atomic_read(&sc->ids); - - do { - old_id = (old_ids >> shift) & ID_MASK; - new_ids = (old_ids & ~(ID_MASK << shift)); - new_ids |= ((unsigned int)new_id) << shift; - } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); - - return old_id; -} - -/** - * swap_cgroup_record - record mem_cgroup for a set of swap entries. 
- * These entries must belong to one single folio, and that folio - * must be being charged for swap space (swap out), and these - * entries must not have been charged - * - * @folio: the folio that the swap entry belongs to - * @id: mem_cgroup ID to be recorded - * @ent: the first swap entry to be recorded - */ -void swap_cgroup_record(struct folio *folio, unsigned short id, - swp_entry_t ent) -{ - unsigned int nr_ents = folio_nr_pages(folio); - struct swap_cgroup *map; - pgoff_t offset, end; - unsigned short old; - - offset = swp_offset(ent); - end = offset + nr_ents; - map = swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old = __swap_cgroup_id_xchg(map, offset, id); - VM_BUG_ON(old); - } while (++offset != end); -} - -/** - * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. - * These entries must be being uncharged from swap. They either - * belongs to one single folio in the swap cache (swap in for - * cgroup v1), or no longer have any users (slot freeing). - * - * @ent: the first swap entry to be recorded into - * @nr_ents: number of swap entries to be recorded - * - * Returns the existing old value. - */ -unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) -{ - pgoff_t offset, end; - struct swap_cgroup *map; - unsigned short old, iter = 0; - - offset = swp_offset(ent); - end = offset + nr_ents; - map = swap_cgroup_ctrl[swp_type(ent)].map; - - do { - old = __swap_cgroup_id_xchg(map, offset, 0); - if (!iter) - iter = old; - VM_BUG_ON(iter != old); - } while (++offset != end); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return 0; - - BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != - sizeof(struct swap_cgroup)); - map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * - sizeof(struct swap_cgroup)); - if (!map) - goto nomem; - - ctrl = &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->map = map; - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - pr_info("couldn't allocate enough memory for swap_cgroup\n"); - pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct swap_cgroup *map; - struct swap_cgroup_ctrl *ctrl; - - if (mem_cgroup_disabled()) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl = &swap_cgroup_ctrl[type]; - map = ctrl->map; - ctrl->map = NULL; - mutex_unlock(&swap_cgroup_mutex); - - vfree(map); -} diff --git a/mm/swap_state.c b/mm/swap_state.c index 1415a5c54a43..9c3a5cf99778 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -137,8 +137,71 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -void __swap_cache_add_folio(struct swap_cluster_info *ci, - struct folio *folio, swp_entry_t entry) +/** + * __swap_cache_add_check - Check if a range is suitable for adding a folio. 
+ * @ci: The locked swap cluster + * @targ_entry: The target swap entry to check, will be rounded down by @nr + * @nr: Number of slots to check, must be a power of 2 + * @shadowp: Returns the shadow value if one exists in the range + * @memcg_id: Returns the memory cgroup id, NULL to ignore cgroup check + * + * Check if all slots covered by given range have a swap count >= 1. + * Retrieves the shadow if there is one. If @memcg_id is not NULL, also + * checks if all slots belong to the same cgroup and return the cgroup + * private id. + * + * Context: Caller must lock the cluster. + * Return: 0 if success, error code if failed. + */ +static int __swap_cache_add_check(struct swap_cluster_info *ci, + swp_entry_t targ_entry, + unsigned long nr, void **shadowp, + unsigned short *memcg_id) +{ + unsigned int ci_off, ci_end; + unsigned long old_tb; + bool is_zero; + + lockdep_assert_held(&ci->lock); + + /* + * If the target slot is not swapped out or already cached, return + * -ENOENT or -EEXIST. If the batch is not suitable, could be a + * race with concurrent free or cache add, return -EBUSY. 
+ */ + if (unlikely(!ci->table)) + return -ENOENT; + ci_off = swp_cluster_offset(targ_entry); + old_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_folio(old_tb)) + return -EEXIST; + if (!__swp_tb_get_count(old_tb)) + return -ENOENT; + if (shadowp && swp_tb_is_shadow(old_tb)) + *shadowp = swp_tb_to_shadow(old_tb); + if (memcg_id) + *memcg_id = __swap_cgroup_get(ci, ci_off); + + if (nr == 1) + return 0; + + is_zero = __swap_table_test_zero(ci, ci_off); + ci_off = round_down(ci_off, nr); + ci_end = ci_off + nr; + do { + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb) || + !__swp_tb_get_count(old_tb) || + is_zero != __swap_table_test_zero(ci, ci_off) || + (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off)))) + return -EBUSY; + } while (++ci_off < ci_end); + + return 0; +} + +static void __swap_cache_do_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) { unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); @@ -153,88 +216,42 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci, do { old_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; - - node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } /** - * swap_cache_add_folio - Add a folio into the swap cache. + * __swap_cache_add_folio - Add a folio to the swap cache and update stats. + * @ci: The locked swap cluster. * @folio: The folio to be added. * @entry: The swap entry corresponding to the folio. - * @gfp: gfp_mask for XArray node allocation. - * @shadowp: If a shadow is found, return the shadow. 
* - * Context: Caller must ensure @entry is valid and protect the swap device - * with reference count or locks. + * Unconditionally add a folio to the swap cache. The caller must ensure + * all slots are usable and have no conflicts. This assigns entry to + * @folio->swap, increases folio refcount by the number of pages, and + * updates swap cache stats. + * + * Context: Caller must ensure the folio is locked and lock the cluster + * that holds the entries. */ -static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - void **shadowp) +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) { - int err; - void *shadow = NULL; - unsigned long old_tb; - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); - si = __swap_entry_to_info(entry); - ci_start = swp_cluster_offset(entry); - ci_end = ci_start + nr_pages; - ci_off = ci_start; - ci = swap_cluster_lock(si, swp_offset(entry)); - if (unlikely(!ci->table)) { - err = -ENOENT; - goto failed; - } - do { - old_tb = __swap_table_get(ci, ci_off); - if (unlikely(swp_tb_is_folio(old_tb))) { - err = -EEXIST; - goto failed; - } - if (unlikely(!__swp_tb_get_count(old_tb))) { - err = -ENOENT; - goto failed; - } - if (swp_tb_is_shadow(old_tb)) - shadow = swp_tb_to_shadow(old_tb); - } while (++ci_off < ci_end); - __swap_cache_add_folio(ci, folio, entry); - swap_cluster_unlock(ci); - if (shadowp) - *shadowp = shadow; - return 0; - -failed: - swap_cluster_unlock(ci); - return err; + __swap_cache_do_add_folio(ci, folio, entry); + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); } -/** - * __swap_cache_del_folio - Removes a folio from the swap cache. - * @ci: The locked swap cluster. - * @folio: The folio. - * @entry: The first swap entry that the folio corresponds to. 
- * @shadow: shadow value to be filled in the swap cache. - * - * Removes a folio from the swap cache and fills a shadow in place. - * This won't put the folio's refcount. The caller has to do that. - * - * Context: Caller must ensure the folio is locked and in the swap cache - * using the index of @entry, and lock the cluster that holds the entries. - */ -void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, - swp_entry_t entry, void *shadow) +static void __swap_cache_do_del_folio(struct swap_cluster_info *ci, + struct folio *folio, + swp_entry_t entry, void *shadow) { - int count; unsigned long old_tb; struct swap_info_struct *si; unsigned int ci_start, ci_off, ci_end; @@ -254,19 +271,17 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - count = __swp_tb_get_count(old_tb); - if (count) + if (__swp_tb_get_count(old_tb)) folio_swapped = true; else need_free = true; - /* If shadow is NULL, we sets an empty shadow. */ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); + /* If shadow is NULL, we set an empty shadow. */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, + __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); - node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); - lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { __swap_cluster_free_entries(si, ci, ci_start, nr_pages); @@ -280,6 +295,29 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, } /** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. 
+ * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. The caller has to do that. + * + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. + */ +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, + swp_entry_t entry, void *shadow) +{ + unsigned long nr_pages = folio_nr_pages(folio); + + __swap_cache_do_del_folio(ci, folio, entry, shadow); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); +} + +/** * swap_cache_del_folio - Removes a folio from the swap cache. * @folio: The folio. * @@ -333,7 +371,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, do { old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); - __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_flags(old_tb))); } while (++ci_off < ci_end); /* @@ -351,6 +389,153 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, } /* + * Try to allocate a folio of given order in the swap cache. + * + * This helper resolves the potential races of swap allocation + * and prepares a folio to be used for swap IO. 
May return following + * value: + * + * -ENOMEM / -EBUSY: Order is too large or in conflict with sub slot, + * caller should shrink the order and retry + * -ENOENT / -EEXIST: Target swap entry is unavailable or cached, the caller + * should abort or try to use the cached folio instead + */ +static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci, + swp_entry_t targ_entry, gfp_t gfp, + unsigned int order, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int err; + swp_entry_t entry; + struct folio *folio; + void *shadow = NULL; + unsigned short memcg_id; + unsigned long address, nr_pages = 1UL << order; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + + VM_WARN_ON_ONCE(nr_pages > SWAPFILE_CLUSTER); + entry.val = round_down(targ_entry.val, nr_pages); + + /* Check if the slot and range are available, skip allocation if not */ + spin_lock(&ci->lock); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, NULL, NULL); + spin_unlock(&ci->lock); + if (unlikely(err)) + return ERR_PTR(err); + + /* + * Limit THP gfp. The limitation is a no-op for typical + * GFP_HIGHUSER_MOVABLE but matters for shmem. + */ + if (order) + gfp = thp_shmem_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + + if (mpol || !vmf) { + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + } else { + address = round_down(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vmf->vma, address); + } + if (unlikely(!folio)) + return ERR_PTR(-ENOMEM); + + /* Double check the range is still not in conflict */ + spin_lock(&ci->lock); + err = __swap_cache_add_check(ci, targ_entry, nr_pages, &shadow, &memcg_id); + if (unlikely(err)) { + spin_unlock(&ci->lock); + folio_put(folio); + return ERR_PTR(err); + } + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __swap_cache_do_add_folio(ci, folio, entry); + spin_unlock(&ci->lock); + + if (mem_cgroup_swapin_charge_folio(folio, memcg_id, + vmf ? 
vmf->vma->vm_mm : NULL, gfp)) { + spin_lock(&ci->lock); + __swap_cache_do_del_folio(ci, folio, entry, shadow); + spin_unlock(&ci->lock); + folio_unlock(folio); + /* nr_pages refs from swap cache, 1 from allocation */ + folio_put_refs(folio, nr_pages + 1); + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); + return ERR_PTR(-ENOMEM); + } + + if (order > 1 && folio_memcg_alloc_deferred(folio)) { + spin_lock(&ci->lock); + __swap_cache_do_del_folio(ci, folio, entry, shadow); + spin_unlock(&ci->lock); + folio_unlock(folio); + /* nr_pages refs from swap cache, 1 from allocation */ + folio_put_refs(folio, nr_pages + 1); + return ERR_PTR(-ENOMEM); + } + + /* memsw uncharges swap when folio is added to swap cache */ + memcg1_swapin(folio); + if (shadow) + workingset_refault(folio, shadow); + + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); + + /* Caller will initiate read into locked new_folio */ + folio_add_lru(folio); + return folio; +} + +/** + * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. + * @targ_entry: swap entry indicating the target slot + * @gfp: memory allocation flags + * @orders: allocation orders, must be non zero + * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE + * + * Allocate a folio in the swap cache for one swap slot, typically before + * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by + * @targ_entry must have a non-zero swap count (swapped out). + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the folio if allocation succeeded and folio is in the swap + * cache. Returns error code if failed due to race, OOM or invalid arguments. 
+ */ +struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp, + unsigned long orders, struct vm_fault *vmf, + struct mempolicy *mpol, pgoff_t ilx) +{ + int order, err; + struct folio *ret; + struct swap_cluster_info *ci; + + ci = __swap_entry_to_cluster(targ_entry); + order = highest_order(orders); + + /* orders must be non-zero, and must not exceed cluster size. */ + if (WARN_ON_ONCE(!orders || (1UL << order) > SWAPFILE_CLUSTER)) + return ERR_PTR(-EINVAL); + + do { + ret = __swap_cache_alloc(ci, targ_entry, gfp, order, + vmf, mpol, ilx); + if (!IS_ERR(ret)) + break; + err = PTR_ERR(ret); + if (!order || (err && err != -EBUSY && err != -ENOMEM)) + break; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); + order = next_order(&orders, order); + } while (orders); + + return ret; +} + +/* * If we are the only user, then try to free up the swap cache. * * Its ok to check the swapcache flag without the folio lock @@ -448,140 +633,64 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, } } -/** - * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. - * @entry: swap entry to be bound to the folio. - * @folio: folio to be added. - * @gfp: memory allocation flags for charge, can be 0 if @charged if true. - * @charged: if the folio is already charged. - * - * Update the swap_map and add folio as swap cache, typically before swapin. - * All swap slots covered by the folio must have a non-zero swap count. - * - * Context: Caller must protect the swap device with reference count or locks. - * Return: Returns the folio being added on success. Returns the existing folio - * if @entry is already cached. Returns NULL if raced with swapin or swapoff. 
- */ -static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, - struct folio *folio, - gfp_t gfp, bool charged) +static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp, + struct mempolicy *mpol, pgoff_t ilx, + struct swap_iocb **plug, bool readahead) { - struct folio *swapcache = NULL; - void *shadow; - int ret; - - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) - goto failed; + struct folio *folio; - for (;;) { - ret = swap_cache_add_folio(folio, entry, &shadow); - if (!ret) - break; + do { + folio = swap_cache_get_folio(entry); + if (folio) + return folio; + folio = swap_cache_alloc_folio(entry, gfp, BIT(0), NULL, mpol, ilx); + } while (PTR_ERR(folio) == -EEXIST); - /* - * Large order allocation needs special handling on - * race: if a smaller folio exists in cache, swapin needs - * to fallback to order 0, and doing a swap cache lookup - * might return a folio that is irrelevant to the faulting - * entry because @entry is aligned down. Just return NULL. - */ - if (ret != -EEXIST || folio_test_large(folio)) - goto failed; + if (IS_ERR_OR_NULL(folio)) + return NULL; - swapcache = swap_cache_get_folio(entry); - if (swapcache) - goto failed; + swap_read_folio(folio, plug); + if (readahead) { + folio_set_readahead(folio); + count_vm_event(SWAP_RA); } - memcg1_swapin(entry, folio_nr_pages(folio)); - if (shadow) - workingset_refault(folio, shadow); - - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); return folio; - -failed: - folio_unlock(folio); - return swapcache; } /** - * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. - * @entry: the swapped out swap entry to be binded to the folio. - * @gfp_mask: memory allocation flags + * swapin_sync - swap-in one or multiple entries skipping readahead. 
+ * @entry: swap entry indicating the target slot + * @gfp: memory allocation flags + * @orders: allocation orders + * @vmf: fault information * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE - * @new_page_allocated: sets true if allocation happened, false otherwise * - * Allocate a folio in the swap cache for one swap slot, typically before - * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by - * @entry must have a non-zero swap count (swapped out). - * Currently only supports order 0. + * This allocates a folio suitable for given @orders, or returns the + * existing folio in the swap cache for @entry. This initiates the IO, too, + * if needed. @entry is rounded down if @orders allow large allocation. * - * Context: Caller must protect the swap device with reference count or locks. - * Return: Returns the existing folio if @entry is cached already. Returns - * NULL if failed due to -ENOMEM or @entry have a swap count < 1. + * Context: Caller must ensure @entry is valid and pin the swap device with refcount. + * Return: Returns the folio on success, error code if failed. */ -struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) +struct folio *swapin_sync(swp_entry_t entry, gfp_t gfp, unsigned long orders, + struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx) { - struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; - struct folio *result = NULL; - *new_page_allocated = false; - /* Check the swap cache again for readahead path. */ - folio = swap_cache_get_folio(entry); - if (folio) - return folio; - - /* Skip allocation for unused and bad swap slot for readahead. */ - if (!swap_entry_swapped(si, entry)) - return NULL; - - /* Allocate a new folio to be added into the swap cache. 
*/ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - return NULL; - /* Try add the new folio, returns existing folio or NULL on failure. */ - result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); - if (result == folio) - *new_page_allocated = true; - else - folio_put(folio); - return result; -} + do { + folio = swap_cache_get_folio(entry); + if (folio) + return folio; + folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx); + } while (PTR_ERR(folio) == -EEXIST); -/** - * swapin_folio - swap-in one or multiple entries skipping readahead. - * @entry: starting swap entry to swap in - * @folio: a new allocated and charged folio - * - * Reads @entry into @folio, @folio will be added to the swap cache. - * If @folio is a large folio, the @entry will be rounded down to align - * with the folio size. - * - * Return: returns pointer to @folio on success. If folio is a large folio - * and this raced with another swapin, NULL will be returned to allow fallback - * to order 0. Else, if another folio was already added to the swap cache, - * return that swap cache folio instead. 
- */ -struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) -{ - struct folio *swapcache; - pgoff_t offset = swp_offset(entry); - unsigned long nr_pages = folio_nr_pages(folio); + if (IS_ERR(folio)) + return folio; - entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); - swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); - if (swapcache == folio) - swap_read_folio(folio, NULL); - return swapcache; + swap_read_folio(folio, NULL); + return folio; } /* @@ -595,7 +704,6 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug) { struct swap_info_struct *si; - bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; @@ -605,13 +713,9 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx, plug, false); mpol_cond_put(mpol); - if (page_allocated) - swap_read_folio(folio, plug); - put_swap_device(si); return folio; } @@ -696,7 +800,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * are fairly likely to have been swapped out from the same node. 
*/ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx) + struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); @@ -706,7 +810,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; - bool page_allocated; + swp_entry_t ra_entry; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -723,18 +827,11 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - folio = swap_cache_alloc_folio( - swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, - &page_allocated); + ra_entry = swp_entry(swp_type(entry), offset); + folio = swap_cache_read_folio(ra_entry, gfp_mask, mpol, ilx, + &splug, offset != entry_offset); if (!folio) continue; - if (page_allocated) { - swap_read_folio(folio, &splug); - if (offset != entry_offset) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } blk_finish_plug(&plug); @@ -742,11 +839,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); - return folio; + return swap_cache_read_folio(entry, gfp_mask, mpol, ilx, NULL, false); } static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, @@ -812,8 +905,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; - pgoff_t ilx; - bool page_allocated; + pgoff_t ilx = targ_ilx; win = 
swap_vma_ra_win(vmf, &start, &end); if (win == 1) @@ -847,19 +939,12 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!si) continue; } - folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, - &page_allocated); + folio = swap_cache_read_folio(entry, gfp_mask, mpol, ilx, + &splug, addr != vmf->address); if (si) put_swap_device(si); if (!folio) continue; - if (page_allocated) { - swap_read_folio(folio, &splug); - if (addr != vmf->address) { - folio_set_readahead(folio); - count_vm_event(SWAP_RA); - } - } folio_put(folio); } if (pte) @@ -869,10 +954,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ - folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); - if (unlikely(page_allocated)) - swap_read_folio(folio, NULL); + folio = swap_cache_read_folio(targ_entry, gfp_mask, mpol, targ_ilx, + NULL, false); return folio; } diff --git a/mm/swap_table.h b/mm/swap_table.h index 8415ffbe2b9c..e6613e62f8d0 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -11,6 +11,11 @@ struct swap_table { atomic_long_t entries[SWAPFILE_CLUSTER]; }; +/* For storing memcg private id */ +struct swap_memcg_table { + unsigned short id[SWAPFILE_CLUSTER]; +}; + #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) /* @@ -21,12 +26,14 @@ struct swap_table { * Swap table entry type and bits layouts: * * NULL: |---------------- 0 ---------------| - Free slot - * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot - * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot + * Shadow: |SWAP_COUNT|Z|---- SHADOW_VAL ---|1| - Swapped out slot + * PFN: |SWAP_COUNT|Z|------ PFN -------|10| - Cached slot * Pointer: |----------- Pointer ----------|100| - (Unused) * Bad: |------------- 1 -------------|1000| - Bad slot * - * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an 
atomic long. + * COUNT is `SWP_TB_COUNT_BITS` long, Z is the `SWP_TB_ZERO_FLAG` bit, + * and together they form the `SWP_TB_FLAGS_BITS` wide flags field. + * Each entry is an atomic long. * * Usages: * @@ -49,14 +56,6 @@ struct swap_table { * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. */ -#if defined(MAX_POSSIBLE_PHYSMEM_BITS) -#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -#elif defined(MAX_PHYSMEM_BITS) -#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#else -#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) -#endif - /* NULL Entry, all 0 */ #define SWP_TB_NULL 0UL @@ -64,22 +63,26 @@ struct swap_table { #define SWP_TB_SHADOW_MARK 0b1UL /* Cached: PFN */ -#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS) +#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWAP_CACHE_PFN_MARK_BITS) #define SWP_TB_PFN_MARK 0b10UL -#define SWP_TB_PFN_MARK_BITS 2 -#define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) +#define SWP_TB_PFN_MARK_MASK (BIT(SWAP_CACHE_PFN_MARK_BITS) - 1) -/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */ -#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) +/* Flags: For PFN or shadow, contains SWAP_COUNT, width changes */ +#define SWP_TB_FLAGS_BITS min(5, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_COUNT_BITS (SWP_TB_FLAGS_BITS - SWAP_TABLE_HAS_ZEROFLAG) +#define SWP_TB_FLAGS_MASK (~((~0UL) >> SWP_TB_FLAGS_BITS)) #define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) +#define SWP_TB_FLAGS_SHIFT (BITS_PER_LONG - SWP_TB_FLAGS_BITS) #define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) #define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) +/* The first flag is zero bit (SWAP_TABLE_HAS_ZEROFLAG) */ +#define SWP_TB_ZERO_FLAG BIT(BITS_PER_LONG - SWP_TB_FLAGS_BITS) /* Bad slot: ends with 0b1000 and rests of bits are all 1 */ #define SWP_TB_BAD ((~0UL) << 3) /* Macro for shadow offset calculation */ 
-#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS +#define SWAP_COUNT_SHIFT SWP_TB_FLAGS_BITS /* * Helpers for casting one type of info into a swap table entry. @@ -97,40 +100,47 @@ static inline unsigned long __count_to_swp_tb(unsigned char count) * used (count > 0 && count < SWP_TB_COUNT_MAX), and * overflow (count == SWP_TB_COUNT_MAX). */ - BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2); + BUILD_BUG_ON(SWP_TB_COUNT_BITS < SWAP_COUNT_MIN_BITS); VM_WARN_ON(count > SWP_TB_COUNT_MAX); return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; } -static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count) +static inline unsigned long __flags_to_swp_tb(unsigned char flags) +{ + BUILD_BUG_ON(SWP_TB_FLAGS_BITS > BITS_PER_BYTE); + VM_WARN_ON(flags >> SWP_TB_FLAGS_BITS); + return ((unsigned long)flags) << SWP_TB_FLAGS_SHIFT; +} + +static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned char flags) { unsigned long swp_tb; BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > - (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); + (BITS_PER_LONG - SWAP_CACHE_PFN_MARK_BITS - SWP_TB_FLAGS_BITS)); - swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; - VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); + swp_tb = (pfn << SWAP_CACHE_PFN_MARK_BITS) | SWP_TB_PFN_MARK; + VM_WARN_ON_ONCE(swp_tb & SWP_TB_FLAGS_MASK); - return swp_tb | __count_to_swp_tb(count); + return swp_tb | __flags_to_swp_tb(flags); } -static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count) +static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned char flags) { - return pfn_to_swp_tb(folio_pfn(folio), count); + return pfn_to_swp_tb(folio_pfn(folio), flags); } -static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count) +static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned char flags) { BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * 
sizeof(unsigned long)); BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); - VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_FLAGS_MASK)); - return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK; + return (unsigned long)shadow | SWP_TB_SHADOW_MARK | __flags_to_swp_tb(flags); } /* @@ -168,14 +178,14 @@ static inline bool swp_tb_is_countable(unsigned long swp_tb) static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); - return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); + return pfn_folio((swp_tb & ~SWP_TB_FLAGS_MASK) >> SWAP_CACHE_PFN_MARK_BITS); } static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); /* No shift needed, xa_value is stored as it is in the lower bits. */ - return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); + return (void *)(swp_tb & ~SWP_TB_FLAGS_MASK); } static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) @@ -184,6 +194,12 @@ static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); } +static inline unsigned char __swp_tb_get_flags(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return ((swp_tb & SWP_TB_FLAGS_MASK) >> SWP_TB_FLAGS_SHIFT); +} + static inline int swp_tb_get_count(unsigned long swp_tb) { if (swp_tb_is_countable(swp_tb)) @@ -247,4 +263,107 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci, return swp_tb; } + +static inline void __swap_table_set_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + BUILD_BUG_ON(SWP_TB_ZERO_FLAG & ~SWP_TB_FLAGS_MASK); + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb |= SWP_TB_ZERO_FLAG; + 
__swap_table_set(ci, ci_off, swp_tb); +#else + lockdep_assert_held(&ci->lock); + __set_bit(ci_off, ci->zero_bitmap); +#endif +} + +static inline bool __swap_table_test_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return !!(swp_tb & SWP_TB_ZERO_FLAG); +#else + return test_bit(ci_off, ci->zero_bitmap); +#endif +} + +static inline void __swap_table_clear_zero(struct swap_cluster_info *ci, + unsigned int ci_off) +{ +#if SWAP_TABLE_HAS_ZEROFLAG + unsigned long swp_tb = __swap_table_get(ci, ci_off); + + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + swp_tb &= ~SWP_TB_ZERO_FLAG; + __swap_table_set(ci, ci_off, swp_tb); +#else + lockdep_assert_held(&ci->lock); + __clear_bit(ci_off, ci->zero_bitmap); +#endif +} + +#ifdef CONFIG_MEMCG +static inline void __swap_cgroup_set(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned long nr, unsigned short id) +{ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); + if (WARN_ON_ONCE(!ci->memcg_table)) + return; + do { + ci->memcg_table->id[ci_off++] = id; + } while (--nr); +} + +static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER); + if (unlikely(!ci->memcg_table)) + return 0; + return ci->memcg_table->id[ci_off]; +} + +static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, + unsigned int ci_off, + unsigned long nr) +{ + unsigned short old = __swap_cgroup_get(ci, ci_off); + + if (!old) + return 0; + do { + VM_WARN_ON_ONCE(ci->memcg_table->id[ci_off] != old); + ci->memcg_table->id[ci_off++] = 0; + } while (--nr); + + return old; +} +#else +static inline void __swap_cgroup_set(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned long nr, unsigned short id) +{ +} + +static inline unsigned short 
__swap_cgroup_get(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + return 0; +} + +static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci, + unsigned int ci_off, + unsigned long nr) +{ + return 0; +} +#endif + #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 9174f1eeffb0..78b49b0658ad 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,6 @@ #include <asm/tlbflush.h> #include <linux/leafops.h> -#include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" #include "swap.h" @@ -133,7 +132,7 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { /* May return NULL on invalid type, caller must check for NULL return */ static struct swap_info_struct *swap_type_to_info(int type) { - if (type >= MAX_SWAPFILES) + if (type < 0 || type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } @@ -411,20 +410,7 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static struct swap_table *swap_table_alloc(gfp_t gfp) -{ - struct folio *folio; - - if (!SWP_TABLE_USE_PAGE) - return kmem_cache_zalloc(swap_table_cachep, gfp); - - folio = folio_alloc(gfp | __GFP_ZERO, 0); - if (folio) - return folio_address(folio); - return NULL; -} - -static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +static void swap_cluster_free_table_folio_rcu_cb(struct rcu_head *head) { struct folio *folio; @@ -432,15 +418,76 @@ static void swap_table_free_folio_rcu_cb(struct rcu_head *head) folio_put(folio); } -static void swap_table_free(struct swap_table *table) +static void swap_cluster_free_table(struct swap_cluster_info *ci) { + struct swap_table *table; + +#ifdef CONFIG_MEMCG + kfree(ci->memcg_table); + ci->memcg_table = NULL; +#endif + +#if !SWAP_TABLE_HAS_ZEROFLAG + kfree(ci->zero_bitmap); + ci->zero_bitmap = NULL; +#endif + + table = (struct swap_table *)rcu_access_pointer(ci->table); + if (!table) 
+ return; + + rcu_assign_pointer(ci->table, NULL); if (!SWP_TABLE_USE_PAGE) { kmem_cache_free(swap_table_cachep, table); return; } call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), - swap_table_free_folio_rcu_cb); + swap_cluster_free_table_folio_rcu_cb); +} + +static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp) +{ + struct swap_table *table = NULL; + struct folio *folio; + + /* The cluster must be empty and not on any list during allocation. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + if (rcu_access_pointer(ci->table)) + return 0; + + if (SWP_TABLE_USE_PAGE) { + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + table = folio_address(folio); + } else { + table = kmem_cache_zalloc(swap_table_cachep, gfp); + } + if (!table) + return -ENOMEM; + + rcu_assign_pointer(ci->table, table); + +#ifdef CONFIG_MEMCG + if (!mem_cgroup_disabled()) { + VM_WARN_ON_ONCE(ci->memcg_table); + ci->memcg_table = kzalloc_obj(*ci->memcg_table, gfp); + if (!ci->memcg_table) { + swap_cluster_free_table(ci); + return -ENOMEM; + } + } +#endif + +#if !SWAP_TABLE_HAS_ZEROFLAG + VM_WARN_ON_ONCE(ci->zero_bitmap); + ci->zero_bitmap = bitmap_zalloc(SWAPFILE_CLUSTER, gfp); + if (!ci->zero_bitmap) { + swap_cluster_free_table(ci); + return -ENOMEM; + } +#endif + return 0; } /* @@ -465,33 +512,22 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, bad_slots++; else WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off)); } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? 
ci->count : 0)); WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } -static void swap_cluster_free_table(struct swap_cluster_info *ci) -{ - struct swap_table *table; - - /* Only empty cluster's table is allow to be freed */ - lockdep_assert_held(&ci->lock); - table = (void *)rcu_dereference_protected(ci->table, true); - rcu_assign_pointer(ci->table, NULL); - - swap_table_free(table); -} - /* * Allocate swap table for one cluster. Attempt an atomic allocation first, * then fallback to sleeping allocation. */ static struct swap_cluster_info * -swap_cluster_alloc_table(struct swap_info_struct *si, +swap_cluster_populate(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_table *table; + int ret; /* * Only cluster isolation from the allocator does table allocation. @@ -502,14 +538,9 @@ swap_cluster_alloc_table(struct swap_info_struct *si, lockdep_assert_held(&si->global_cluster_lock); lockdep_assert_held(&ci->lock); - /* The cluster must be free and was just isolated from the free list. */ - VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); - - table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (table) { - rcu_assign_pointer(ci->table, table); + if (!swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC | + __GFP_NOWARN)) return ci; - } /* * Try a sleep allocation. Each isolated free cluster may cause @@ -521,7 +552,8 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_unlock(&si->global_cluster_lock); local_unlock(&percpu_swap_cluster.lock); - table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + ret = swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC | + GFP_KERNEL); /* * Back to atomic context. We might have migrated to a new CPU with a @@ -536,20 +568,11 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_lock(&si->global_cluster_lock); spin_lock(&ci->lock); - /* Nothing except this helper should touch a dangling empty cluster. 
*/ - if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { - if (table) - swap_table_free(table); - return ci; - } - - if (!table) { + if (ret) { move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); spin_unlock(&ci->lock); return NULL; } - - rcu_assign_pointer(ci->table, table); return ci; } @@ -621,12 +644,11 @@ static struct swap_cluster_info *isolate_lock_cluster( } spin_unlock(&si->lock); - if (found && !cluster_table_is_alloced(found)) { - /* Only an empty free cluster's swap table can be freed. */ - VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE); + /* Cluster's table is freed when and only when it's on the free list. */ + if (found && flags == CLUSTER_FLAG_FREE) { VM_WARN_ON_ONCE(list != &si->free_clusters); - VM_WARN_ON_ONCE(!cluster_is_empty(found)); - return swap_cluster_alloc_table(si, found); + VM_WARN_ON_ONCE(cluster_table_is_alloced(found)); + return swap_cluster_populate(si, found); } return found; @@ -769,7 +791,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - struct swap_table *table; int ret = 0; /* si->max may got shrunk by swap swap_activate() */ @@ -790,12 +811,9 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, } ci = cluster_info + idx; - if (!ci->table) { - table = swap_table_alloc(GFP_KERNEL); - if (!table) - return -ENOMEM; - rcu_assign_pointer(ci->table, table); - } + /* Need to allocate swap table first for initial bad slot marking. */ + if (!ci->count && swap_cluster_alloc_table(ci, GFP_KERNEL)) + return -ENOMEM; spin_lock(&ci->lock); /* Check for duplicated bad swap slots. 
*/ if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { @@ -922,8 +940,8 @@ static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, order = 0; nr_pages = 1; swap_cluster_assert_empty(ci, ci_off, 1, false); - /* Sets a fake shadow as placeholder */ - __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); + /* Fake shadow placeholder with no flag, hibernation does not use the zeromap */ + __swap_table_set(ci, ci_off, __swp_tb_mk_count(shadow_to_swp_tb(NULL, 0), 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -1054,6 +1072,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) swap_cluster_unlock(ci); if (to_scan <= 0) break; + cond_resched(); } } @@ -1295,14 +1314,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; - /* - * Use atomic clear_bit operations only on zeromap instead of non-atomic - * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 
- */ - for (i = 0; i < nr_entries; i++) { - clear_bit(offset + i, si->zeromap); + for (i = 0; i < nr_entries; i++) zswap_invalidate(swp_entry(si->type, offset + i)); - } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = @@ -1442,8 +1455,10 @@ start_over: } static int swap_extend_table_alloc(struct swap_info_struct *si, - struct swap_cluster_info *ci, gfp_t gfp) + struct swap_cluster_info *ci, + unsigned int ci_off, gfp_t gfp) { + int count; void *table; table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); @@ -1451,11 +1466,27 @@ static int swap_extend_table_alloc(struct swap_info_struct *si, return -ENOMEM; spin_lock(&ci->lock); - if (!ci->extend_table) - ci->extend_table = table; - else - kfree(table); + /* + * Extend table allocation requires releasing ci lock first so it's + * possible that the slot has been freed, no longer overflowed, or + * a concurrent extend table allocation has already succeeded, so + * the allocation is no longer needed. + */ + if (!cluster_table_is_alloced(ci)) + goto out_free; + count = swp_tb_get_count(__swap_table_get(ci, ci_off)); + if (count < (SWP_TB_COUNT_MAX - 1)) + goto out_free; + if (ci->extend_table) + goto out_free; + + ci->extend_table = table; + spin_unlock(&ci->lock); + return 0; + +out_free: spin_unlock(&ci->lock); + kfree(table); return 0; } @@ -1471,7 +1502,7 @@ int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) return 0; ci = __swap_offset_to_cluster(si, offset); - ret = swap_extend_table_alloc(si, ci, gfp); + ret = swap_extend_table_alloc(si, ci, swp_cluster_offset(entry), gfp); put_swap_device(si); return ret; @@ -1518,13 +1549,21 @@ static void __swap_cluster_put_entry(struct swap_cluster_info *ci, if (count == (SWP_TB_COUNT_MAX - 1)) { ci->extend_table[ci_off] = 0; __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); - swap_extend_table_try_free(ci); } else { ci->extend_table[ci_off] = count; } } else { __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); } + + /* + 
* `SWP_TB_COUNT_MAX - 1` triggers extend table allocation. If the + * count was above that, then the extend table is no longer needed, + * so free it. And if we just put the count value from MAX - 1, it's + * also possible that a pending dup just attached an extend table. + */ + if (unlikely(count == SWP_TB_COUNT_MAX - 2 || count == SWP_TB_COUNT_MAX - 1)) + swap_extend_table_try_free(ci); } /** @@ -1664,7 +1703,7 @@ restart: if (unlikely(err)) { if (err == -ENOMEM) { spin_unlock(&ci->lock); - err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + err = swap_extend_table_alloc(si, ci, ci_off, GFP_ATOMIC); spin_lock(&ci->lock); if (!err) goto restart; @@ -1730,7 +1769,7 @@ again: } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ - if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) + if (unlikely(mem_cgroup_try_charge_swap(folio))) swap_cache_del_folio(folio); if (unlikely(!folio_test_swapcache(folio))) @@ -1826,8 +1865,7 @@ void folio_put_swap(struct folio *folio, struct page *subpage) * do_swap_page() * ... 
swapoff+swapon * swap_cache_alloc_folio() - * swap_cache_add_folio() - * // check swap_map + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before @@ -1873,21 +1911,44 @@ void __swap_cluster_free_entries(struct swap_info_struct *si, unsigned int ci_start, unsigned int nr_pages) { unsigned long old_tb; + unsigned short batch_id = 0, id_cur; unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; - unsigned long offset = cluster_offset(si, ci) + ci_start; + unsigned long ci_head = cluster_offset(si, ci); + unsigned int batch_off = ci_off; VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { old_tb = __swap_table_get(ci, ci_off); - /* Release the last ref, or after swap cache is dropped */ + /* + * Freeing is done after release of the last swap count + * ref, or after swap cache is dropped + */ VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + + /* Resetting the slot to NULL also clears the inline flags. */ __swap_table_set(ci, ci_off, null_to_swp_tb()); + if (!SWAP_TABLE_HAS_ZEROFLAG) + __swap_table_clear_zero(ci, ci_off); + + /* + * Uncharge swap slots by memcg in batches. Consecutive + * slots with the same cgroup id are uncharged together. + */ + id_cur = __swap_cgroup_clear(ci, ci_off, 1); + if (batch_id != id_cur) { + if (batch_id) + mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off); + batch_id = id_cur; + batch_off = ci_off; + } } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); - swap_range_free(si, offset, nr_pages); + if (batch_id) + mem_cgroup_uncharge_swap(batch_id, ci_off - batch_off); + + swap_range_free(si, ci_head + ci_start, nr_pages); swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) @@ -2077,7 +2138,16 @@ out: } #ifdef CONFIG_HIBERNATION -/* Allocate a slot for hibernation */ +/** + * swap_alloc_hibernation_slot() - Allocate a swap slot for hibernation. 
+ * @type: swap device type index to allocate from. + * + * The caller must ensure the swap device is stable, either by pinning + * it (SWP_HIBERNATION) or by freezing user-space. + * + * Return: a valid swp_entry_t on success, or an empty entry (val == 0) + * on failure. + */ swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *pcp_si, *si = swap_type_to_info(type); @@ -2088,46 +2158,42 @@ swp_entry_t swap_alloc_hibernation_slot(int type) if (!si) goto fail; - /* This is called for allocating swap entry, not cache */ - if (get_swap_device_info(si)) { - if (si->flags & SWP_WRITEOK) { - /* - * Try the local cluster first if it matches the device. If - * not, try grab a new cluster and override local cluster. - */ - local_lock(&percpu_swap_cluster.lock); - pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); - pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); - if (pcp_si == si && pcp_offset) { - ci = swap_cluster_lock(si, pcp_offset); - if (cluster_is_usable(ci, 0)) - offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); - else - swap_cluster_unlock(ci); - } - if (!offset) - offset = cluster_alloc_swap_entry(si, NULL); - local_unlock(&percpu_swap_cluster.lock); - if (offset) - entry = swp_entry(si->type, offset); - } - put_swap_device(si); + /* + * Try the local cluster first if it matches the device. If + * not, try grab a new cluster and override local cluster. 
+ */ + local_lock(&percpu_swap_cluster.lock); + pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); + pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); + if (pcp_si == si && pcp_offset) { + ci = swap_cluster_lock(si, pcp_offset); + if (cluster_is_usable(ci, 0)) + offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); + else + swap_cluster_unlock(ci); } + if (!offset) + offset = cluster_alloc_swap_entry(si, NULL); + local_unlock(&percpu_swap_cluster.lock); + if (offset) + entry = swp_entry(si->type, offset); + fail: return entry; } -/* Free a slot allocated by swap_alloc_hibernation_slot */ +/** + * swap_free_hibernation_slot() - Free a swap slot allocated for hibernation. + * @entry: swap entry to free. + * + * The caller must ensure the swap device is stable. + */ void swap_free_hibernation_slot(swp_entry_t entry) { - struct swap_info_struct *si; + struct swap_info_struct *si = __swap_entry_to_info(entry); struct swap_cluster_info *ci; pgoff_t offset = swp_offset(entry); - si = get_swap_device(entry); - if (WARN_ON(!si)) - return; - ci = swap_cluster_lock(si, offset); __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); @@ -2135,25 +2201,17 @@ void swap_free_hibernation_slot(swp_entry_t entry) /* In theory readahead might add it to the swap cache by accident */ __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); - put_swap_device(si); } -/* - * Find the swap type that corresponds to given device (if any). - * - * @offset - number of the PAGE_SIZE-sized block of the device, starting - * from 0, in which the swap header is expected to be located. - * - * This is needed for the suspend to disk (aka swsusp). 
- */ -int swap_type_of(dev_t device, sector_t offset) +static int __find_hibernation_swap_type(dev_t device, sector_t offset) { int type; + lockdep_assert_held(&swap_lock); + if (!device) - return -1; + return -EINVAL; - spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; @@ -2163,16 +2221,118 @@ int swap_type_of(dev_t device, sector_t offset) if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); - if (se->start_block == offset) { - spin_unlock(&swap_lock); + if (se->start_block == offset) return type; - } } } - spin_unlock(&swap_lock); return -ENODEV; } +/** + * pin_hibernation_swap_type - Pin the swap device for hibernation + * @device: Block device containing the resume image + * @offset: Offset identifying the swap area + * + * Locate the swap device for @device/@offset and mark it as pinned + * for hibernation. While pinned, swapoff() is prevented. + * + * Only one uswsusp context may pin a swap device at a time. + * If already pinned, this function returns -EBUSY. + * + * Return: + * >= 0 on success (swap type). + * -EINVAL if @device is invalid. + * -ENODEV if the swap device is not found. + * -EBUSY if the device is already pinned for hibernation. + */ +int pin_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + struct swap_info_struct *si; + + spin_lock(&swap_lock); + + type = __find_hibernation_swap_type(device, offset); + if (type < 0) { + spin_unlock(&swap_lock); + return type; + } + + si = swap_type_to_info(type); + if (WARN_ON_ONCE(!si)) { + spin_unlock(&swap_lock); + return -ENODEV; + } + + /* + * hibernate_acquire() prevents concurrent hibernation sessions. + * This check additionally guards against double-pinning within + * the same session. 
+ */ + if (WARN_ON_ONCE(si->flags & SWP_HIBERNATION)) { + spin_unlock(&swap_lock); + return -EBUSY; + } + + si->flags |= SWP_HIBERNATION; + + spin_unlock(&swap_lock); + return type; +} + +/** + * unpin_hibernation_swap_type - Unpin the swap device for hibernation + * @type: Swap type previously returned by pin_hibernation_swap_type() + * + * Clear the hibernation pin on the given swap device, allowing + * swapoff() to proceed normally. + * + * If @type does not refer to a valid swap device, this function + * does nothing. + */ +void unpin_hibernation_swap_type(int type) +{ + struct swap_info_struct *si; + + spin_lock(&swap_lock); + si = swap_type_to_info(type); + if (!si) { + spin_unlock(&swap_lock); + return; + } + si->flags &= ~SWP_HIBERNATION; + spin_unlock(&swap_lock); +} + +/** + * find_hibernation_swap_type - Find swap type for hibernation + * @device: Block device containing the resume image + * @offset: Offset within the device identifying the swap area + * + * Locate the swap device corresponding to @device and @offset. + * + * Unlike pin_hibernation_swap_type(), this function only performs a + * lookup and does not mark the swap device as pinned for hibernation. + * + * This is safe in the sysfs-based hibernation path where user space + * is already frozen and swapoff() cannot run concurrently. + * + * Return: + * A non-negative swap type on success. + * -EINVAL if @device is invalid. + * -ENODEV if no matching swap device is found. 
+ */ +int find_hibernation_swap_type(dev_t device, sector_t offset) +{ + int type; + + spin_lock(&swap_lock); + type = __find_hibernation_swap_type(device, offset); + spin_unlock(&swap_lock); + + return type; +} + int find_first_swap(dev_t *device) { int type; @@ -2869,7 +3029,7 @@ static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, ci = cluster_info + i; /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); - if (rcu_dereference_protected(ci->table, true)) { + if (cluster_table_is_alloced(ci)) { swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true); swap_cluster_free_table(ci); } @@ -2903,7 +3063,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -2936,6 +3095,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } + + /* Refuse swapoff while the device is pinned for hibernation */ + if (p->flags & SWP_HIBERNATION) { + err = -EBUSY; + spin_unlock(&swap_lock); + goto out_dput; + } + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { @@ -2991,8 +3158,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - zeromap = p->zeromap; - p->zeromap = NULL; maxpages = p->max; cluster_info = p->cluster_info; p->max = 0; @@ -3004,10 +3169,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); - /* Destroy swap account information */ - swap_cgroup_swapoff(p->type); inode = mapping->host; @@ -3538,21 +3700,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, 
swap_flags) if (error) goto bad_swap_unlock_inode; - error = swap_cgroup_swapon(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - - /* - * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might - * be above MAX_PAGE_ORDER incase of a large swap file. - */ - si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), - GFP_KERNEL | __GFP_ZERO); - if (!si->zeromap) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; @@ -3652,11 +3799,8 @@ bad_swap: si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si, swap_file); - swap_cgroup_swapoff(si->type); free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; - kvfree(si->zeromap); - si->zeromap = NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 80cc8be5725f..246af12bf801 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -2,7 +2,12 @@ /* * mm/userfaultfd.c * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2008-2009 Red Hat, Inc. * Copyright (C) 2015 Red Hat, Inc. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). */ #include <linux/mm.h> @@ -14,6 +19,17 @@ #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> +#include <linux/list.h> +#include <linux/sched/mm.h> +#include <linux/mm_inline.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/bug.h> +#include <linux/anon_inodes.h> +#include <linux/syscalls.h> +#include <linux/miscdevice.h> +#include <linux/uio.h> #include <linux/file.h> #include <linux/cleanup.h> #include <asm/tlbflush.h> @@ -1017,7 +1033,7 @@ out: return copied ? 
copied : err; } -ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, +static ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { @@ -1025,7 +1041,7 @@ ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } -ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, +static ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len) { @@ -1033,7 +1049,7 @@ ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } -ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, +static ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { @@ -1049,7 +1065,7 @@ ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } -ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, +static ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, start, 0, len, @@ -1085,7 +1101,7 @@ long uffd_wp_range(struct vm_area_struct *dst_vma, return ret; } -int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, +static int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp) { struct mm_struct *dst_mm = ctx->mm; @@ -1915,7 +1931,7 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma, * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. 
*/ -ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, +static ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) { struct mm_struct *mm = ctx->mm; @@ -2090,7 +2106,7 @@ out: return moved ? moved : err; } -bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, +static bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { const struct vm_uffd_ops *ops = vma_uffd_ops(vma); @@ -2147,12 +2163,12 @@ static void userfaultfd_set_ctx(struct vm_area_struct *vma, (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } -void userfaultfd_reset_ctx(struct vm_area_struct *vma) +static void userfaultfd_reset_ctx(struct vm_area_struct *vma) { userfaultfd_set_ctx(vma, NULL, 0); } -struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, +static struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, @@ -2191,7 +2207,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, } /* Assumes mmap write lock taken, and mm_struct pinned. 
*/ -int userfaultfd_register_range(struct userfaultfd_ctx *ctx, +static int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, vm_flags_t vm_flags, unsigned long start, unsigned long end, @@ -2255,7 +2271,7 @@ skip: return 0; } -void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +static void userfaultfd_release_new(struct userfaultfd_ctx *ctx) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma; @@ -2270,7 +2286,7 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx) mmap_write_unlock(mm); } -void userfaultfd_release_all(struct mm_struct *mm, +static void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx) { struct vm_area_struct *vma, *prev; @@ -2305,3 +2321,2222 @@ void userfaultfd_release_all(struct mm_struct *mm, mmap_write_unlock(mm); mmput(mm); } + +static int sysctl_unprivileged_userfaultfd __read_mostly; + +#ifdef CONFIG_SYSCTL +static const struct ctl_table vm_userfaultfd_table[] = { + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; +#endif + +static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; + +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + +struct userfaultfd_unmap_ctx { + struct userfaultfd_ctx *ctx; + unsigned long start; + unsigned long end; + struct list_head list; +}; + +struct userfaultfd_wait_queue { + struct uffd_msg msg; + wait_queue_entry_t wq; + struct userfaultfd_ctx *ctx; + bool waken; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return 
ctx->features & UFFD_FEATURE_INITIALIZED; +} + +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. + */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) + goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. + */ + ret = wake_up_state(wq->private, mode); + if (ret) { + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). + */ + list_del_init(&wq->entry); + } +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. 
+ */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + refcount_inc(&ctx->refcount); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). + */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->refcount)) { + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); + mmdrop(ctx->mm); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); + } +} + +static inline void msg_init(struct uffd_msg *msg) +{ + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, + unsigned int flags, + unsigned long reason, + unsigned int features) +{ + struct uffd_msg msg; + + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + + /* + * These flags indicate why the userfault occurred: + * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. + * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. + * - Neither of these flags being set indicates a MISSING fault. + * + * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write + * fault. Otherwise, it was a read fault. 
+ */ + if (flags & FAULT_FLAG_WRITE) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; + if (reason & VM_UFFD_WP) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (reason & VM_UFFD_MINOR) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; + if (features & UFFD_FEATURE_THREAD_ID) + msg.arg.pagefault.feat.ptid = task_pid_vnr(current); + return msg; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Same functionality as userfaultfd_must_wait below with modifications for + * hugepmd ranges. + */ +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + struct vm_area_struct *vma = vmf->vma; + pte_t *ptep, pte; + + assert_fault_locked(vmf); + + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); + if (!ptep) + return true; + + pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(pte)) + return true; + /* + * Concurrent migration may have replaced the present PTE with a + * non-marker swap entry between fault delivery and this lockless + * re-check. huge_pte_write() on a swap entry decodes random offset + * bits, so gate it on pte_present(). The migration completion path + * will re-deliver the fault if it still needs userspace. + */ + if (!pte_present(pte)) + return false; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ + if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) + return true; + + return false; +} +#else +static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + /* Should never get here. 
*/ + VM_WARN_ON_ONCE(1); + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Verify the pagetables are still not ok after having registered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read_iter and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + struct vm_fault *vmf, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pte_t ptent; + bool ret; + + assert_fault_locked(vmf); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return true; + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return true; + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return true; + pmd = pmd_offset(pud, address); +again: + _pmd = pmdp_get_lockless(pmd); + if (pmd_none(_pmd)) + return true; + + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ + if (!pmd_present(_pmd)) + return false; + + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); + + pte = pte_offset_map(pmd, address); + if (!pte) + goto again; + + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + ptent = ptep_get(pte); + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(ptent)) + goto out; + /* + * Concurrent swap-out / migration may have replaced the present PTE + * with a non-marker swap entry between fault delivery and this + * lockless re-check. 
pte_write() on a swap entry decodes random + * offset bits, so gate it on pte_present(). The page-in path will + * re-deliver the fault if it still needs userspace. + */ + if (!pte_present(ptent)) { + ret = false; + goto out; + } + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) + goto out; + + ret = false; +out: + pte_unmap(pte); + return ret; +} + +static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return TASK_INTERRUPTIBLE; + + if (flags & FAULT_FLAG_KILLABLE) + return TASK_KILLABLE; + + return TASK_UNINTERRUPTIBLE; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_lock must be released before + * returning it. + */ +vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) +{ + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + vm_fault_t ret = VM_FAULT_SIGBUS; + bool must_wait; + unsigned int blocking_state; + + /* + * We don't do userfault handling for the final child pid update + * and when coredumping (faults triggered by get_dump_page()). 
+ */ + if (current->flags & (PF_EXITING|PF_DUMPCORE)) + goto out; + + assert_fault_locked(vmf); + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + goto out; + + VM_WARN_ON_ONCE(ctx->mm != mm); + + /* Any unrecognized flag is a bug. */ + VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); + /* 0 or > 1 flags set is a bug; we expect exactly 1. */ + VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); + + if (ctx->features & UFFD_FEATURE_SIGBUS) + goto out; + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) + goto out; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); + dump_stack(); + } +#endif + goto out; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. + */ + ret = VM_FAULT_RETRY; + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + goto out; + + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. 
+ * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. + * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. + */ + release_fault_lock(vmf); + goto out; + } + + /* take the reference before dropping the mmap_lock */ + userfaultfd_ctx_get(ctx); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); + uwq.ctx = ctx; + uwq.waken = false; + + blocking_state = userfaultfd_get_blocking_state(vmf->flags); + + /* + * Take the vma lock now, in order to safely call + * userfaultfd_huge_must_wait() later. Since acquiring the + * (sleepable) vma lock can modify the current task state, that + * must be before explicitly calling set_current_state(). + */ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); + + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); + /* + * The smp_mb() after __set_current_state prevents the reads + * following the spin_unlock to happen before the list_add in + * __add_wait_queue. 
+ */ + set_current_state(blocking_state); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + if (is_vm_hugetlb_page(vma)) { + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); + hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + + release_fault_lock(vmf); + + if (likely(must_wait && !READ_ONCE(ctx->released))) { + wake_up_poll(&ctx->fd_wqh, EPOLLIN); + schedule(); + } + + __set_current_state(TASK_RUNNING); + + /* + * Here we race with the list_del; list_add in + * userfaultfd_ctx_read(), however because we don't ever run + * list_del_init() to refile across the two lists, the prev + * and next pointers will never point to self. list_add also + * would never let any of the two pointers to point to + * self. So list_empty_careful won't risk to see both pointers + * pointing to self at any time during the list refile. The + * only case where list_del_init() is called is the full + * removal in the wake function and there we don't re-list_add + * and it's fine not to block on the spinlock. The uwq on this + * kernel stack can be released after the list_del_init. + */ + if (!list_empty_careful(&uwq.wq.entry)) { + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.entry); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. 
+ */ + userfaultfd_ctx_put(ctx); + +out: + return ret; +} + +static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + struct userfaultfd_ctx *release_new_ctx; + + if (WARN_ON_ONCE(current->flags & PF_EXITING)) + goto out; + + ewq->ctx = ctx; + init_waitqueue_entry(&ewq->wq, current); + release_new_ctx = NULL; + + spin_lock_irq(&ctx->event_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->event_wqh, &ewq->wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (ewq->msg.event == 0) + break; + if (READ_ONCE(ctx->released) || + fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. + */ + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + if (ewq->msg.event == UFFD_EVENT_FORK) { + struct userfaultfd_ctx *new; + + new = (struct userfaultfd_ctx *) + (unsigned long) + ewq->msg.arg.reserved.reserved1; + release_new_ctx = new; + } + break; + } + + spin_unlock_irq(&ctx->event_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, EPOLLIN); + schedule(); + + spin_lock_irq(&ctx->event_wqh.lock); + } + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&ctx->event_wqh.lock); + + if (release_new_ctx) { + userfaultfd_release_new(release_new_ctx); + userfaultfd_ctx_put(release_new_ctx); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. 
+ */ +out: + atomic_dec(&ctx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); + userfaultfd_ctx_put(ctx); +} + +static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) +{ + ewq->msg.event = 0; + wake_up_locked(&ctx->event_wqh); + __remove_wait_queue(&ctx->event_wqh, &ewq->wq); +} + +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx) + return 0; + + if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) { + userfaultfd_reset_ctx(vma); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc_obj(*fctx); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + refcount_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->features = octx->features; + ctx->released = false; + init_rwsem(&ctx->map_changing_lock); + atomic_set(&ctx->mmap_changing, 0); + ctx->mm = vma->vm_mm; + mmgrab(ctx->mm); + + userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); + atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static void dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + dup_fctx(fctx); + 
list_del(&fctx->list); + kfree(fctx); + } +} + +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. + */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); + } else { + /* Drop uffd context if remap feature not enabled */ + userfaultfd_reset_ctx(vma); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + +void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + + if (!ctx) + 
return; + + atomic_dec(&ctx->mmap_changing); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); + userfaultfd_ctx_put(ctx); +} + +bool userfaultfd_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) + return true; + + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); + mmap_read_unlock(mm); + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMOVE; + ewq.msg.arg.remove.start = start; + ewq.msg.arg.remove.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + return false; +} + +static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, + unsigned long start, unsigned long end) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + + list_for_each_entry(unmap_ctx, unmaps, list) + if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && + unmap_ctx->end == end) + return true; + + return false; +} + +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct list_head *unmaps) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; + + unmap_ctx = kzalloc_obj(*unmap_ctx); + if (!unmap_ctx) + return -ENOMEM; + + userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); + atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); + + return 0; +} + +void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) +{ + struct userfaultfd_unmap_ctx *ctx, *n; + struct 
userfaultfd_wait_queue ewq; + + list_for_each_entry_safe(ctx, n, uf, list) { + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_UNMAP; + ewq.msg.arg.remove.start = ctx->start; + ewq.msg.arg.remove.end = ctx->end; + + userfaultfd_event_wait_completion(ctx->ctx, &ewq); + + list_del(&ctx->list); + kfree(ctx); + } +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 0, }; + + WRITE_ONCE(ctx->released, true); + + userfaultfd_release_all(mm, ctx); + + /* + * After no new page faults can wait on this fault_*wqh, flush + * the last page faults that may have been already waiting on + * the fault_*wqh. + */ + spin_lock_irq(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + /* Flush pending events that may still wait on event_wqh */ + wake_up_all(&ctx->event_wqh); + + wake_up_poll(&ctx->fd_wqh, EPOLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault_in( + wait_queue_head_t *wqh) +{ + wait_queue_entry_t *wq; + struct userfaultfd_wait_queue *uwq; + + lockdep_assert_held(&wqh->lock); + + uwq = NULL; + if (!waitqueue_active(wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&wqh->head, typeof(*wq), entry); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; +} + +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + return find_userfault_in(&ctx->fault_pending_wqh); +} + +static inline struct userfaultfd_wait_queue *find_userfault_evt( + struct userfaultfd_ctx *ctx) +{ + return 
find_userfault_in(&ctx->event_wqh); +} + +static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + __poll_t ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + if (!userfaultfd_is_initialized(ctx)) + return EPOLLERR; + + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return EPOLLERR; + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; +} + +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *new, + struct inode *inode, + struct uffd_msg *msg) +{ + int fd; + + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + if (fd < 0) + return fd; + + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + return 0; +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + struct uffd_msg *msg, struct inode *inode) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. 
+ */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; + + /* always take the fd_wqh lock before the fault_pending_wqh lock */ + spin_lock_irq(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + + /* + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.head list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. + */ + list_del(&uwq->wq.entry); + add_wait_queue(&ctx->fault_wqh, &uwq->wq); + + write_seqcount_end(&ctx->refile_seq); + + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; + spin_unlock(&ctx->fault_pending_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + + spin_lock(&ctx->event_wqh.lock); + uwq = find_userfault_evt(ctx); + if (uwq) { + *msg = uwq->msg; + + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. 
+ */ + userfaultfd_ctx_get(fork_nctx); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + + userfaultfd_event_complete(ctx, uwq); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->event_wqh.lock); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock_irq(&ctx->fd_wqh.lock); + schedule(); + spin_lock_irq(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&ctx->fd_wqh.lock); + + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(fork_nctx, inode, msg); + spin_lock_irq(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); + + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) + userfaultfd_event_complete(ctx, uwq); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. 
+ */ + if (ret) + userfaultfd_ctx_put(fork_nctx); + } + spin_unlock_irq(&ctx->event_wqh.lock); + } + + return ret; +} + +static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + struct uffd_msg msg; + struct inode *inode = file_inode(file); + bool no_wait; + + if (!userfaultfd_is_initialized(ctx)) + return -EINVAL; + + no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT; + for (;;) { + if (iov_iter_count(to) < sizeof(msg)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); + if (_ret < 0) + return ret ? ret : _ret; + _ret = !copy_to_iter_full(&msg, sizeof(msg), to); + if (_ret) + return ret ? ret : -EFAULT; + ret += sizeof(msg); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = true; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + spin_lock_irq(&ctx->fault_pending_wqh.lock); + /* wake all in the range and autoremove */ + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned seq; + bool need_wakeup; + + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. PT lock release or mmap_read_unlock(mm) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. 
+ */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. + */ + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + if (start + len <= start) + return -EINVAL; + return 0; +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + vm_flags_t vm_flags; + bool found; + bool basic_ioctls; + unsigned long start, end; + struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & 
UFFDIO_REGISTER_MODE_WP) { + if (!pgtable_supports_uffd_wp()) + goto out; + + vm_flags |= VM_UFFD_WP; + } + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + goto out; +#endif + vm_flags |= VM_UFFD_MINOR; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + + ret = -EINVAL; + mmap_write_lock(mm); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) + goto out_unlock; + + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* + * Search for not compatible vmas. + */ + found = false; + basic_ioctls = false; + cur = vma; + do { + cond_resched(); + + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* check not compatible vmas */ + ret = -EINVAL; + if (!vma_can_userfault(cur, vm_flags, wp_async)) + goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + + /* + * If this vma contains ending address, and huge pages + * check alignment. 
+ */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur)) + basic_ioctls = true; + + found = true; + } for_each_vma_range(vmi, cur, end); + VM_WARN_ON_ONCE(!found); + + ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, + wp_async); + +out_unlock: + mmap_write_unlock(mm); + mmput(mm); + if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + + /* CONTINUE ioctl is only supported for MINOR ranges. */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); + + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. 
+ */ + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + + mmap_write_lock(mm); + ret = -EINVAL; + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) + goto out_unlock; + + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + + /* + * Search for not compatible vmas. + */ + found = false; + cur = vma; + do { + cond_resched(); + + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* + * Prevent unregistering through a different userfaultfd than + * the one used for registration. + */ + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. 
+ */ + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) + goto out_unlock; + + found = true; + } for_each_vma_range(vmi, cur, end); + VM_WARN_ON_ONCE(!found); + + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; + + ret = 0; + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + /* VMA not registered with userfaultfd. */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + if (userfaultfd_missing(vma)) { + /* + * Wake any concurrent pending userfault while + * we unregister, so they will not hang + * permanently and it avoids userland to call + * UFFDIO_WAKE explicitly. + */ + struct userfaultfd_wake_range range; + range.start = start; + range.len = vma_end - start; + wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + start, vma_end); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; + } + +skip: + prev = vma; + start = vma->vm_end; + } + +out_unlock: + mmap_write_unlock(mm); + mmput(mm); +out: + return ret; +} + +/* + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 
+ */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_WARN_ON_ONCE(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, flags); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + VM_WARN_ON_ONCE(!ret); + /* len == 0 would 
wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 
0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (atomic_read(&ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_continue uffdio_continue; + struct uffdio_continue __user *user_uffdio_continue; + struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; + + user_uffdio_continue = (struct uffdio_continue __user *)arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, + /* don't copy the output fields */ + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + + ret = 
validate_range(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) + goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len, flags); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { + range.start = uffdio_continue.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + goto out; + } + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, + uffdio_poison.range.len, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; 
+ if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON_ONCE(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + +static inline unsigned int uffd_ctx_features(__u64 user_features) +{ + /* + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. + */ + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; +} + +static int userfaultfd_move(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_move uffdio_move; + struct uffdio_move __user *user_uffdio_move; + struct userfaultfd_wake_range range; + struct mm_struct *mm = ctx->mm; + + user_uffdio_move = (struct uffdio_move __user *) arg; + + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + goto out; + } + + if (copy_from_user(&uffdio_move, user_uffdio_move, + /* don't copy "move" last field */ + sizeof(uffdio_move)-sizeof(__s64))) + return -EFAULT; + + /* Do not allow cross-mm moves. 
*/ + if (mm != current->mm) + return -EINVAL; + + ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); + if (ret) + return ret; + + ret = validate_range(mm, uffdio_move.src, uffdio_move.len); + if (ret) + return ret; + + if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| + UFFDIO_MOVE_MODE_DONTWAKE)) + return -EINVAL; + + if (mmget_not_zero(mm)) { + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); + mmput(mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON(!ret); + range.len = ret; + if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { + range.start = uffdio_move.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_move.len ? 0 : -EAGAIN; + +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. 
+ */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + unsigned int ctx_features; + int ret; + __u64 features; + + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) + goto out; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API) + goto err_out; + ret = -EPERM; + if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) + goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + + /* report all available features and ioctls to userland */ + uffdio_api.features = UFFD_API_FEATURES; +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); +#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } + + ret = -EINVAL; + if (features & ~uffdio_api.features) + goto err_out; + + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + + /* only enable the requested features for this uffd context */ + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + + ret = 0; +out: + return ret; +err_out: + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + ret = -EFAULT; + goto out; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + if (cmd != UFFDIO_API && 
!userfaultfd_is_initialized(ctx)) + return -EINVAL; + + switch (cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; + case UFFDIO_MOVE: + ret = userfaultfd_move(ctx, arg); + break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; + case UFFDIO_CONTINUE: + ret = userfaultfd_continue(ctx, arg); + break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_entry_t *wq; + unsigned long pending = 0, total = 0; + + spin_lock_irq(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { + pending++; + total++; + } + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { + total++; + } + spin_unlock_irq(&ctx->fault_pending_wqh.lock); + + /* + * If more protocols will be added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... 
+ */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, ctx->features, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read_iter = userfaultfd_read_iter, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->event_wqh); + init_waitqueue_head(&ctx->fd_wqh); + seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); +} + +static int new_userfaultfd(int flags) +{ + struct userfaultfd_ctx *ctx __free(kfree) = NULL; + + VM_WARN_ON_ONCE(!current->mm); + + /* Check the UFFD_* constants for consistency. 
*/ + BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); + + if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) + return -EINVAL; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refcount, 1); + ctx->flags = flags; + ctx->features = 0; + ctx->released = false; + init_rwsem(&ctx->map_changing_lock); + atomic_set(&ctx->mmap_changing, 0); + ctx->mm = current->mm; + + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; + + /* prevent the mm struct to be freed */ + mmgrab(ctx->mm); + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); +} + +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. 
*/ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + +static int __init userfaultfd_init(void) +{ + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", + sizeof(struct userfaultfd_ctx), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + init_once_userfaultfd_ctx); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_userfaultfd_table); +#endif + return 0; +} +__initcall(userfaultfd_init); diff --git a/mm/util.c b/mm/util.c index 3cc949a0b7ed..af2c2103f0d9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1192,6 +1192,7 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, desc->vm_file = vma->vm_file; desc->vma_flags = vma->flags; desc->page_prot = vma->vm_page_prot; + desc->vm_ops = vma->vm_ops; /* Default. */ desc->action.type = MMAP_NOTHING; @@ -1396,8 +1397,6 @@ static int mmap_action_finish(struct vm_area_struct *vma, if (!err) err = call_vma_mapped(vma); - if (!err && action->success_hook) - err = action->success_hook(vma); /* do_munmap() might take rmap lock, so release if held. 
*/ maybe_rmap_unlock_action(vma, action); @@ -1415,16 +1414,22 @@ static int mmap_action_finish(struct vm_area_struct *vma, */ len = vma_pages(vma) << PAGE_SHIFT; do_munmap(current->mm, vma->vm_start, len, NULL); - if (action->error_hook) { - /* We may want to filter the error. */ - err = action->error_hook(err); - /* The caller should not clear the error. */ - VM_WARN_ON_ONCE(!err); - } - return err; + + return action->error_override ?: err; } #ifdef CONFIG_MMU + +static int check_mmap_action(struct mmap_action *action) +{ + const unsigned long override = action->error_override; + + if (WARN_ON_ONCE(override && !IS_ERR_VALUE(override))) + return -EINVAL; + + return 0; +} + /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. @@ -1434,7 +1439,14 @@ static int mmap_action_finish(struct vm_area_struct *vma, */ int mmap_action_prepare(struct vm_area_desc *desc) { - switch (desc->action.type) { + struct mmap_action *action = &desc->action; + int err; + + err = check_mmap_action(action); + if (err) + return err; + + switch (action->type) { case MMAP_NOTHING: return 0; case MMAP_REMAP_PFN: @@ -2697,6 +2697,8 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma, { if (map->vm_ops) vma->vm_ops = map->vm_ops; + else /* Only /dev/zero should do this. */ + vma_set_anonymous(vma); vma->vm_private_data = map->vm_private_data; } @@ -2744,6 +2746,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, .action = { .type = MMAP_NOTHING, /* Default to no further action. 
*/ }, + .vm_ops = &vma_dummy_vm_ops, }; bool allocated_new = false; int error; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bb6ae08d18f5..1afca3568b9b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr) schedule_work(&p->wq); } +/* + * vm_area_free_pages - free a range of pages from a vmalloc allocation + * @vm: the vm_struct containing the pages + * @start_idx: first page index to free (inclusive) + * @end_idx: last page index to free (exclusive) + * + * Free pages [start_idx, end_idx) updating NR_VMALLOC stat accounting. + * Freed vm->pages[] entries are set to NULL. + * Caller is responsible for unmapping (vunmap_range) and KASAN + * poisoning before calling this. + */ +static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx, + unsigned int end_idx) +{ + unsigned int i; + + if (!(vm->flags & VM_MAP_PUT_PAGES)) { + for (i = start_idx; i < end_idx; i++) + mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1); + } + free_pages_bulk(vm->pages + start_idx, end_idx - start_idx); + + for (i = start_idx; i < end_idx; i++) + vm->pages[i] = NULL; +} + /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr) void vfree(const void *addr) { struct vm_struct *vm; - int i; if (unlikely(in_interrupt())) { vfree_atomic(addr); @@ -3459,19 +3484,8 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - for (i = 0; i < vm->nr_pages; i++) { - struct page *page = vm->pages[i]; - BUG_ON(!page); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_lruvec_page_state(page, NR_VMALLOC, -1); - __free_page(page); - cond_resched(); - } + vm_area_free_pages(vm, 0, vm->nr_pages); kvfree(vm->pages); kfree(vm); } @@ -3939,7 +3953,7 @@ fail: __GFP_NOFAIL | __GFP_ZERO |\ __GFP_NORETRY | 
__GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ - GFP_USER | __GFP_NOLOCKDEP) + GFP_USER | __GFP_NOLOCKDEP | __GFP_SKIP_KASAN) static gfp_t vmalloc_fix_flags(gfp_t flags) { @@ -3980,6 +3994,9 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * * %__GFP_NOWARN can be used to suppress failure messages. * + * %__GFP_SKIP_KASAN can be used to skip unpoisoning of mapped pages + * (when prot=%PAGE_KERNEL). + * * Can not be called from interrupt nor NMI contexts. * Return: the address of the area or %NULL on failure */ @@ -3993,6 +4010,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long original_align = align; unsigned int shift = PAGE_SHIFT; + bool skip_vmalloc_kasan = kasan_hw_tags_enabled() && (gfp_mask & __GFP_SKIP_KASAN); if (WARN_ON_ONCE(!size)) return NULL; @@ -4023,7 +4041,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, again: area = __get_vm_area_node(size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, - gfp_mask, caller); + gfp_mask & ~__GFP_SKIP_KASAN, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, @@ -4041,7 +4059,7 @@ again: * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { - if (kasan_hw_tags_enabled()) { + if (kasan_hw_tags_enabled() && !skip_vmalloc_kasan) { /* * Modify protection bits to allow tagging. * This must be done before mapping. @@ -4078,7 +4096,8 @@ again: (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. 
*/ - area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); + if (!skip_vmalloc_kasan) + area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -4324,16 +4343,70 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && nid != page_to_nid(vmalloc_to_page(p))) goto need_realloc; + } else { + /* + * If p is NULL, vrealloc behaves exactly like vmalloc. + * Skip the shrink and in-place grow paths. + */ + goto need_realloc; } - /* - * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What - * would be a good heuristic for when to shrink the vm_area? - */ if (size <= old_size) { + unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + /* Zero out "freed" memory, potentially for future realloc. */ if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + + /* + * Free tail pages when shrink crosses a page boundary. + * + * Skip huge page allocations (page_order > 0) as partial + * freeing would require splitting. + * + * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must + * be reset before pages are returned to the allocator. + * + * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates + * mapping requests against the unchanged vm->size; freeing + * tail pages would cause vmalloc_to_page() to return NULL for + * the unmapped range. + * + * Skip if either GFP_NOFS or GFP_NOIO are used. + * kmemleak_free_part() internally allocates with + * GFP_KERNEL, which could trigger a recursive deadlock + * if we are under filesystem or I/O reclaim. 
+ */ + if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) && + !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) && + gfp_has_io_fs(flags)) { + unsigned long addr = (unsigned long)kasan_reset_tag(p); + unsigned int old_nr_pages = vm->nr_pages; + + /* + * Use the node lock to synchronize with concurrent + * readers (vmalloc_info_show). + */ + struct vmap_node *vn = addr_to_node(addr); + + spin_lock(&vn->busy.lock); + vm->nr_pages = new_nr_pages; + spin_unlock(&vn->busy.lock); + + /* Notify kmemleak of the reduced allocation size before unmapping. */ + kmemleak_free_part( + (void *)addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + (unsigned long)(old_nr_pages - new_nr_pages) + << PAGE_SHIFT); + + vunmap_range(addr + ((unsigned long)new_nr_pages + << PAGE_SHIFT), + addr + ((unsigned long)old_nr_pages + << PAGE_SHIFT)); + + vm_area_free_pages(vm, new_nr_pages, old_nr_pages); + } vm->requested_size = size; kasan_vrealloc(p, old_size, size); return (void *)p; @@ -4342,7 +4415,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align /* * We already have the bytes available in the allocation; use them. */ - if (size <= alloced_size) { + if (size <= vm->nr_pages << PAGE_SHIFT) { /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during @@ -4641,7 +4714,18 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) smp_rmb(); vaddr = (char *) va->va_start; - size = vm ? get_vm_area_size(vm) : va_size(va); + if (vm) + /* + * For VM_ALLOC areas, use nr_pages rather than + * get_vm_area_size() because vrealloc() may shrink + * the mapping without updating area->size. Other + * mapping types (vmap, ioremap) don't set nr_pages. + */ + size = (vm->flags & VM_ALLOC && vm->nr_pages) ? 
+ (vm->nr_pages << PAGE_SHIFT) : + get_vm_area_size(vm); + else + size = va_size(va); if (addr >= vaddr + size) goto next_va; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 3fbb86996c4d..f053554e5826 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -218,6 +218,7 @@ static void vmpressure_work_fn(struct work_struct *work) /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask + * @order: allocation order being reclaimed for * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned @@ -236,7 +237,7 @@ static void vmpressure_work_fn(struct work_struct *work) * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; @@ -307,7 +308,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); - if (level > VMPRESSURE_LOW) { + /* + * Once we go above COSTLY_ORDER, reclaim relies heavily on + * compaction to make progress. Reclaim efficiency was never a + * great proxy for pressure to begin with, but it's outright + * misleading with these high orders. Don't throttle sockets + * because somebody is attempting something crazy like an order-7 + * and predictably struggling. + */ + if (level > VMPRESSURE_LOW && order <= PAGE_ALLOC_COSTLY_ORDER) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. @@ -348,7 +357,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. 
*/ - vmpressure(gfp, memcg, true, vmpressure_win, 0); + vmpressure(gfp, 0, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) diff --git a/mm/vmscan.c b/mm/vmscan.c index 67231d3189ef..b21a15f36cce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -109,7 +109,7 @@ struct scan_control { /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ + /* zone_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -169,11 +169,9 @@ struct scan_control { struct { unsigned int dirty; - unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; - unsigned int file_taken; unsigned int taken; } nr; @@ -739,7 +737,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - memcg1_swapout(folio, swap); + __memcg1_swapout(folio, ci); __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); } else { @@ -850,7 +848,11 @@ static bool lru_gen_set_refs(struct folio *folio) return false; } - set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + /* Promote on second access */ + if (folio_lru_refs(folio) > 1) + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + else + folio_mark_accessed(folio); return true; } #else @@ -1944,6 +1946,44 @@ static int current_may_throttle(void) return !(current->flags & PF_LOCAL_THROTTLE); } +static void handle_reclaim_writeback(unsigned long nr_taken, + struct pglist_data *pgdat, + struct scan_control *sc, + struct reclaim_stat *stat) +{ + /* + * If dirty folios are scanned that are not queued for IO, it + * implies that flushers are not doing their job. 
This can + * happen when memory pressure pushes dirty folios to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty folios grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat->nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + + sc->nr.dirty += stat->nr_dirty; + sc->nr.congested += stat->nr_congested; + sc->nr.writeback += stat->nr_writeback; + sc->nr.immediate += stat->nr_immediate; + sc->nr.taken += nr_taken; +} + /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages @@ -2007,42 +2047,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - - /* - * If dirty folios are scanned that are not queued for IO, it - * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty folios to the end of - * the LRU before the dirty limits are breached and the dirty - * data has expired. It can also happen when the proportion of - * dirty folios grows not through writes but through memory - * pressure reclaiming all the clean cache. And in some cases, - * the flushers simply cannot keep up with the allocation - * rate. 
Nudge the flusher threads in case they are asleep. - */ - if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * For cgroupv1 dirty throttling is achieved by waking up - * the kernel flusher here and later waiting on folios - * which are in writeback to finish (see shrink_folio_list()). - * - * Flusher may not be able to issue writeback quickly - * enough for cgroupv1 writeback throttling to work - * on a large system. - */ - if (!writeback_throttling_sane(sc)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); - } - - sc->nr.dirty += stat.nr_dirty; - sc->nr.congested += stat.nr_congested; - sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; - sc->nr.writeback += stat.nr_writeback; - sc->nr.immediate += stat.nr_immediate; - sc->nr.taken += nr_taken; - if (file) - sc->nr.file_taken += nr_taken; - + handle_reclaim_writeback(nr_taken, pgdat, sc, &stat); trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; @@ -3220,7 +3225,7 @@ static int folio_update_gen(struct folio *folio, int gen) } /* protect pages accessed multiple times through file descriptors */ -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio) { int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; @@ -3239,9 +3244,6 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; - /* for folio_end_writeback() */ - if (reclaiming) - new_flags |= BIT(PG_reclaim); } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -3855,7 +3857,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); - new_gen = folio_inc_gen(lruvec, folio, false); + new_gen = folio_inc_gen(lruvec, folio); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); /* don't count the workingset being lazily promoted */ @@ -3878,10 +3880,9 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) +static void try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; - bool success = false; bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); @@ -3907,11 +3908,10 @@ next: /* * If min_seq[type] of both anonymous and file is not increased, - * we can directly return false to avoid unnecessary checking - * overhead later. + * return here to avoid unnecessary checking overhead later. */ if (!seq_inc_flag) - return success; + return; /* see the comment on lru_gen_folio */ if (swappiness && swappiness <= MAX_SWAPPINESS) { @@ -3929,10 +3929,7 @@ next: reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); - success = true; } - - return success; } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) @@ -4084,27 +4081,33 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); } -static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +static unsigned long lruvec_evictable_size(struct lruvec *lruvec, int swappiness) { int gen, type, zone; - unsigned long total = 0; - int swappiness = get_swappiness(lruvec, sc); + unsigned long seq, total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); for_each_evictable_type(type, swappiness) { - unsigned long seq; - for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); - for (zone = 0; 
zone < MAX_NR_ZONES; zone++) total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } + return total; +} + +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long total; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + total = lruvec_evictable_size(lruvec, swappiness); + /* whether the size is big enough to be helpful */ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; } @@ -4577,7 +4580,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int tier_idx) { bool success; - bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -4607,7 +4609,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c /* protected */ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { - gen = folio_inc_gen(lruvec, folio, false); + gen = folio_inc_gen(lruvec, folio); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); /* don't count the workingset being lazily promoted */ @@ -4622,26 +4624,11 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c /* ineligible */ if (zone > sc->reclaim_idx) { - gen = folio_inc_gen(lruvec, folio, false); + gen = folio_inc_gen(lruvec, folio); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } - dirty = folio_test_dirty(folio); - writeback = folio_test_writeback(folio); - if (type == LRU_GEN_FILE && dirty) { - sc->nr.file_taken += delta; - if (!writeback) - sc->nr.unqueued_dirty += delta; - } - - /* waiting for writeback */ - if (writeback || (type == LRU_GEN_FILE && dirty)) { - gen = folio_inc_gen(lruvec, folio, true); - list_move(&folio->lru, &lrugen->folios[gen][type][zone]); - return true; - } - return false; } @@ -4649,12 +4636,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool 
success; - /* swap constrained */ - if (!(sc->gfp_mask & __GFP_IO) && - (folio_test_dirty(folio) || - (folio_test_anon(folio) && !folio_test_swapcache(folio)))) - return false; - /* raced with release_pages() */ if (!folio_try_get(folio)) return false; @@ -4669,9 +4650,6 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); - /* for shrink_folio_list() */ - folio_clear_reclaim(folio); - success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4680,7 +4658,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int type, int tier, - struct list_head *list) + struct list_head *list, int *isolatedp) { int i; int gen; @@ -4689,10 +4667,10 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, int scanned = 0; int isolated = 0; int skipped = 0; - int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); - int remaining = scan_batch; + unsigned long remaining = nr_to_scan; struct lru_gen_folio *lrugen = &lruvec->lrugen; + VM_WARN_ON_ONCE(nr_to_scan > MAX_LRU_BATCH); VM_WARN_ON_ONCE(!list_empty(list)); if (get_nr_gens(lruvec, type) == MIN_NR_GENS) @@ -4745,16 +4723,12 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, mod_lruvec_state(lruvec, item, isolated); mod_lruvec_state(lruvec, PGREFILL, sorted); mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); - if (type == LRU_GEN_FILE) - sc->nr.file_taken += isolated; - /* - * There might not be eligible folios due to reclaim_idx. Check the - * remaining to prevent livelock if it's not making progress. 
- */ - return isolated || !remaining ? scanned : 0; + + *isolatedp = isolated; + return scanned; } static int get_tier_idx(struct lruvec *lruvec, int type) @@ -4798,33 +4772,41 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness) static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness, - int *type_scanned, struct list_head *list) + struct list_head *list, int *isolated, + int *isolate_type, int *isolate_scanned) { int i; + int total_scanned = 0; int type = get_type_to_scan(lruvec, swappiness); for_each_evictable_type(i, swappiness) { int scanned; int tier = get_tier_idx(lruvec, type); - *type_scanned = type; - - scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); - if (scanned) - return scanned; + scanned = scan_folios(nr_to_scan, lruvec, sc, + type, tier, list, isolated); - type = !type; + total_scanned += scanned; + if (*isolated) { + *isolate_type = type; + *isolate_scanned = scanned; + break; + } + /* + * If scanned > 0 and isolated == 0, avoid falling back to the + * other type, as this type remains sufficient. Falling back + * too readily can disrupt the positive_ctrl_err() bias. 
+ */ + if (!scanned) + type = !type; } - return 0; + return total_scanned; } static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, int swappiness) { - int type; - int scanned; - int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); struct folio *folio; @@ -4832,19 +4814,23 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; + int scanned, reclaimed; + int isolated = 0, type, type_scanned; bool skip_retry = false; - struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lruvec_lock_irq(lruvec); - scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); + /* In case folio deletion left empty old gens, flush them */ + try_to_inc_min_seq(lruvec, swappiness); - scanned += try_to_inc_min_seq(lruvec, swappiness); + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, + &list, &isolated, &type, &type_scanned); - if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) - scanned = 0; + /* Scanning may have emptied the oldest gen, flush it */ + if (scanned) + try_to_inc_min_seq(lruvec, swappiness); lruvec_unlock_irq(lruvec); @@ -4852,10 +4838,12 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); - sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; + /* Retry pass is only meant for clean folios without new isolation */ + if (isolated) + handle_reclaim_writeback(isolated, pgdat, sc, &stat); trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - scanned, reclaimed, &stat, sc->priority, + type_scanned, reclaimed, &stat, sc->priority, type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { @@ -4900,6 +4888,7 @@ retry: if (!list_empty(&list)) { skip_retry = true; + isolated = 0; goto retry; } @@ -4907,63 +4896,37 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - int swappiness, unsigned long *nr_to_scan) + struct scan_control *sc, int swappiness) { - int gen, type, zone; - unsigned long size = 0; - struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - *nr_to_scan = 0; /* have to run aging, since eviction is not possible anymore */ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - for_each_evictable_type(type, swappiness) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - } - } + /* try to avoid aging, do gentle reclaim at the default priority */ + if (sc->priority == DEF_PRIORITY) + return false; - *nr_to_scan = size; /* better to run aging even though eviction is still possible */ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } -/* - * For future optimizations: - * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg - * reclaim. 
- */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + struct mem_cgroup *memcg, int swappiness) { - bool success; - unsigned long nr_to_scan; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); - - if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return -1; + unsigned long nr_to_scan, evictable; - success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); + evictable = lruvec_evictable_size(lruvec, swappiness); /* try to scrape all its memory if this memcg was deleted */ - if (nr_to_scan && !mem_cgroup_online(memcg)) - return nr_to_scan; + if (!mem_cgroup_online(memcg)) + return evictable; - nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + nr_to_scan = apply_proportional_protection(memcg, sc, evictable); + nr_to_scan >>= sc->priority; - /* try to get away with not aging at the default priority */ - if (!success || sc->priority == DEF_PRIORITY) - return nr_to_scan >> sc->priority; - - /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; + return nr_to_scan; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -4993,62 +4956,59 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) return true; } +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. 
+ */ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - long nr_to_scan; - unsigned long scanned = 0; + bool need_rotate = false, should_age = false; + long nr_batch, nr_to_scan; int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); - while (true) { + nr_to_scan = get_nr_to_scan(lruvec, sc, memcg, swappiness); + while (nr_to_scan > 0) { int delta; + DEFINE_MAX_SEQ(lruvec); - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); - if (nr_to_scan <= 0) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) { + need_rotate = true; break; + } - delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); - if (!delta) - break; + if (should_run_aging(lruvec, max_seq, sc, swappiness)) { + if (try_to_inc_max_seq(lruvec, max_seq, swappiness, false)) + need_rotate = true; + should_age = true; + } - scanned += delta; - if (scanned >= nr_to_scan) + nr_batch = min(nr_to_scan, MIN_LRU_BATCH); + delta = evict_folios(nr_batch, lruvec, sc, swappiness); + if (!delta) break; if (should_abort_scan(lruvec, sc)) break; - cond_resched(); - } - - /* - * If too many file cache in the coldest generation can't be evicted - * due to being dirty, wake up the flusher. - */ - if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * For cgroupv1 dirty throttling is achieved by waking up - * the kernel flusher here and later waiting on folios - * which are in writeback to finish (see shrink_folio_list()). - * - * Flusher may not be able to issue writeback quickly - * enough for cgroupv1 writeback throttling to work - * on a large system. + * Root reclaim needs rotation when low on cold folio for better + * fairness. Cgroup reclaim gets fairness from the iterator. 
*/ - if (!writeback_throttling_sane(sc)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + if (root_reclaim(sc) && should_age) + break; + + nr_to_scan -= delta; + cond_resched(); } - /* whether this lruvec should be rotated */ - return nr_to_scan < 0; + return need_rotate; } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { - bool success; + bool need_rotate; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5066,20 +5026,20 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } - success = try_to_shrink_lruvec(lruvec, sc); + need_rotate = try_to_shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + vmpressure(sc->gfp_mask, sc->order, memcg, false, + sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); - if (success && mem_cgroup_online(memcg)) + if (need_rotate && mem_cgroup_online(memcg)) return MEMCG_LRU_YOUNG; - if (!success && lruvec_is_sizable(lruvec, sc)) + if (!need_rotate && lruvec_is_sizable(lruvec, sc)) return 0; /* one retry if offlined or too small */ @@ -5631,6 +5591,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { + int nr_batch; DEFINE_MAX_SEQ(lruvec); if (seq + MIN_NR_GENS > max_seq) @@ -5647,8 +5608,8 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, - swappiness)) + nr_batch = min(nr_to_reclaim - sc->nr_reclaimed, MAX_LRU_BATCH); + if (!evict_folios(nr_batch, lruvec, sc, swappiness)) return 
0; cond_resched(); @@ -6175,7 +6136,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) /* Record the group's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, + vmpressure(sc->gfp_mask, sc->order, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); @@ -6220,7 +6181,7 @@ again: /* Record the subtree's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + vmpressure(sc->gfp_mask, sc->order, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) @@ -6359,7 +6320,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) if (current_is_kswapd() || cgroup_reclaim(sc)) return; - /* Throttle if making no progress at high prioities. */ + /* Throttle if making no progress at high priorities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } @@ -7053,7 +7014,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, /* * Fragmentation may mean that the system cannot be rebalanced for - * high-order allocations. If twice the allocation size has been + * high-order allocations. If at least the compaction gap has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. @@ -7121,6 +7082,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) .may_unmap = 1, }; + trace_mm_vmscan_balance_pgdat_begin(pgdat->node_id, order, + highest_zoneidx); set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); @@ -7222,7 +7185,7 @@ restart: /* * There should be no need to raise the scanning priority if - * enough pages are already being scanned that that high + * enough pages are already being scanned that the high * watermark would be met at 100% efficiency. 
*/ if (kswapd_shrink_node(pgdat, &sc)) @@ -7314,6 +7277,9 @@ out: psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); + trace_mm_vmscan_balance_pgdat_end(pgdat->node_id, sc.order, + highest_zoneidx, sc.nr_reclaimed); + /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller diff --git a/mm/workingset.c b/mm/workingset.c index 07e6836d0502..f351798e723a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -319,11 +319,13 @@ static void lru_gen_refault(struct folio *folio, void *shadow) atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - /* see folio_add_lru() where folio_set_active() will be called */ - if (lru_gen_in_fault()) - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - if (workingset) { + /* + * see folio_add_lru(), where folio_set_active() is + * called for workingset folios + */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else diff --git a/mm/zswap.c b/mm/zswap.c index 4b5149173b0e..761cd699e0a3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -991,7 +991,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; - bool folio_was_allocated; struct swap_info_struct *si; int ret = 0; @@ -1001,23 +1000,19 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -EEXIST; mpol = get_task_policy(current); - folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated); + folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol, + NO_INTERLEAVE_INDEX); put_swap_device(si); - if (!folio) - return -ENOMEM; /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. 
We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. + * Swap cache allocation might fail due to OOM, or the entry + * may already be cached due to concurrent swapin or have been + * freed. If already cached, a concurrent swapin made the folio + * hot, so skip it. For the unlikely concurrent shrinker case, + * it will be unlinked and freed when invalidated anyway. */ - if (!folio_was_allocated) { - ret = -EEXIST; - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * folio is locked, and the swapcache is now secured against @@ -1057,7 +1052,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, __swap_writepage(folio, NULL); out: - if (ret && ret != -EEXIST) { + if (ret) { swap_cache_del_folio(folio); folio_unlock(folio); } |
