Diffstat (limited to 'mm')
47 files changed, 2088 insertions, 1258 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0a43bb80df4f..e8bf1e9e6ad9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -570,6 +570,7 @@ config SPLIT_PTE_PTLOCKS depends on !ARM || CPU_CACHE_VIPT depends on !PARISC || PA20 depends on !SPARC32 + depends on !UML config ARCH_ENABLE_SPLIT_PMD_PTLOCK bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 7638d75b27db..91b3e027b753 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -297,6 +297,17 @@ config DEBUG_KMEMLEAK_AUTO_SCAN If unsure, say Y. +config DEBUG_KMEMLEAK_VERBOSE + bool "Default kmemleak to verbose mode" + depends on DEBUG_KMEMLEAK_AUTO_SCAN + help + Say Y here to have kmemleak print unreferenced object details + (backtrace, hex dump, address) to dmesg when new memory leaks are + detected during automatic scanning. This can also be toggled at + runtime via /sys/module/kmemleak/parameters/verbose. + + If unsure, say N. + config PER_VMA_LOCK_STATS bool "Statistics for per-vma locks" depends on PER_VMA_LOCK diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7a18fa6c7272..cecbcf9060a6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -618,12 +618,13 @@ static void cgwb_release_workfn(struct work_struct *work) wb_shutdown(wb); css_put(wb->memcg_css); - css_put(wb->blkcg_css); - mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); + css_put(wb->blkcg_css); + mutex_unlock(&wb->bdi->cgwb_release_mutex); + fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); diff --git a/mm/compaction.c b/mm/compaction.c index 1e8f8eca318c..3648ce22c807 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +static struct lruvec * +compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, + struct compact_control *cc) +{ + struct lruvec *lruvec; + + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } + + return lruvec; +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. 
The lock should be periodically unlocked to avoid @@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; struct lruvec *locked = NULL; struct folio *folio = NULL; @@ -913,7 +931,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!(low_pfn % COMPACT_CLUSTER_MAX)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -964,7 +982,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* for alloc_contig case */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1053,7 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(page_has_movable_ops(page)) && !PageMovableOpsIsolated(page)) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } @@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(folio); + if (locked) + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ - if (lruvec != locked) { + if (lruvec != locked || !locked) { if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, folio); - /* * Try get exclusive access under lock. If marked for * skip, the scan is aborted unless the current context @@ -1226,7 +1243,7 @@ isolate_success_no_list: isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } folio_put(folio); @@ -1242,7 +1259,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); locked = NULL; } putback_movable_pages(&cc->migratepages); @@ -1274,7 +1291,7 @@ isolate_fail: isolate_abort: if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + lruvec_unlock_irqrestore(locked, flags); if (folio) { folio_set_lru(folio); folio_put(folio); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7f04fc3f8c8c..3dbbbfdeff71 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1477,6 +1477,11 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) int i; int err = 0; + for (i = 0; i < nr_ctxs; i++) { + if (!is_power_of_2(ctxs[i]->min_region_sz)) + return -EINVAL; + } + mutex_lock(&damon_lock); if ((exclusive && nr_running_ctxs) || (!exclusive && running_exclusive_ctxs)) { @@ -1573,35 +1578,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx) return pid; } -/* - * damon_call_handle_inactive_ctx() - handle DAMON call request that added to - * an inactive context. - * @ctx: The inactive DAMON context. - * @control: Control variable of the call request. - * - * This function is called in a case that @control is added to @ctx but @ctx is - * not running (inactive). 
See if @ctx handled @control or not, and cleanup - * @control if it was not handled. - * - * Returns 0 if @control was handled by @ctx, negative error code otherwise. - */ -static int damon_call_handle_inactive_ctx( - struct damon_ctx *ctx, struct damon_call_control *control) -{ - struct damon_call_control *c; - - mutex_lock(&ctx->call_controls_lock); - list_for_each_entry(c, &ctx->call_controls, list) { - if (c == control) { - list_del(&control->list); - mutex_unlock(&ctx->call_controls_lock); - return -EINVAL; - } - } - mutex_unlock(&ctx->call_controls_lock); - return 0; -} - /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. @@ -1619,6 +1595,10 @@ static int damon_call_handle_inactive_ctx( * synchronization. The return value of the function will be saved in * &damon_call_control->return_code. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. */ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) @@ -1629,10 +1609,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) INIT_LIST_HEAD(&control->list); mutex_lock(&ctx->call_controls_lock); + if (ctx->call_controls_obsolete) { + mutex_unlock(&ctx->call_controls_lock); + return -ECANCELED; + } list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); - if (!damon_is_running(ctx)) - return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -1660,6 +1642,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) * passed at least one &damos->apply_interval_us, kdamond marks the request as * completed so that damos_walk() can wakeup and return. * + * Note that this function should be called only after damon_start() with the + * @ctx has succeeded. Otherwise, this function could fall into an indefinite + * wait. + * * Return: 0 on success, negative error code otherwise. 
*/ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) @@ -1667,19 +1653,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) init_completion(&control->completion); control->canceled = false; mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control_obsolete) { + mutex_unlock(&ctx->walk_control_lock); + return -ECANCELED; + } if (ctx->walk_control) { mutex_unlock(&ctx->walk_control_lock); return -EBUSY; } ctx->walk_control = control; mutex_unlock(&ctx->walk_control_lock); - if (!damon_is_running(ctx)) { - mutex_lock(&ctx->walk_control_lock); - if (ctx->walk_control == control) - ctx->walk_control = NULL; - mutex_unlock(&ctx->walk_control_lock); - return -EINVAL; - } wait_for_completion(&control->completion); if (control->canceled) return -ECANCELED; @@ -2239,12 +2222,24 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ #ifdef CONFIG_NUMA +static bool invalid_mem_node(int nid) +{ + return nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY); +} + static __kernel_ulong_t damos_get_node_mem_bp( struct damos_quota_goal *goal) { struct sysinfo i; __kernel_ulong_t numerator; + if (invalid_mem_node(goal->nid)) { + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + return 10000; + } + si_meminfo_node(&i, goal->nid); if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) numerator = i.totalram - i.freeram; @@ -2261,6 +2256,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; + if (invalid_mem_node(goal->nid)) { + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + return 0; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + return 10000; + } + memcg = mem_cgroup_get_from_id(goal->memcg_id); if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) @@ -2387,7 +2389,8 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ -static void damos_set_effective_quota(struct damos_quota *quota) +static void damos_set_effective_quota(struct damos_quota *quota, + struct damon_ctx *ctx) { unsigned long throughput; unsigned long esz = ULONG_MAX; @@ -2412,6 +2415,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); + esz = max(ctx->min_region_sz, esz); } if (quota->sz && quota->sz < esz) @@ -2448,11 +2452,12 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* First charge window */ if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; - damos_set_effective_quota(quota); + damos_set_effective_quota(quota, c); } /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + + if (!time_in_range_open(jiffies, quota->charged_from, + quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { if (damos_quota_is_set(quota) && quota->charged_sz >= quota->esz) @@ -2462,7 +2467,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->charged_sz = 0; if (trace_damos_esz_enabled()) cached_esz = quota->esz; - damos_set_effective_quota(quota); + damos_set_effective_quota(quota, c); if (trace_damos_esz_enabled() && quota->esz != cached_esz) damos_trace_esz(c, s, quota); } @@ -2952,6 +2957,12 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + mutex_lock(&ctx->call_controls_lock); + 
ctx->call_controls_obsolete = false; + mutex_unlock(&ctx->call_controls_lock); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = false; + mutex_unlock(&ctx->walk_control_lock); complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); @@ -3062,7 +3073,13 @@ done: damon_destroy_targets(ctx); kfree(ctx->regions_score_histogram); + mutex_lock(&ctx->call_controls_lock); + ctx->call_controls_obsolete = true; + mutex_unlock(&ctx->call_controls_lock); kdamond_call(ctx, true); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control_obsolete = true; + mutex_unlock(&ctx->walk_control_lock); damos_walk_cancel(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 554559d72976..8494040b1ee4 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -161,15 +161,6 @@ module_param(monitor_region_end, ulong, 0600); */ static unsigned long addr_unit __read_mostly = 1; -/* - * PID of the DAMON thread - * - * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. - * Else, -1. - */ -static int kdamond_pid __read_mostly = -1; -module_param(kdamond_pid, int, 0400); - static struct damos_stat damon_lru_sort_hot_stat; DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, lru_sort_tried_hot_regions, lru_sorted_hot_regions, @@ -386,12 +377,8 @@ static int damon_lru_sort_turn(bool on) { int err; - if (!on) { - err = damon_stop(&ctx, 1); - if (!err) - kdamond_pid = -1; - return err; - } + if (!on) + return damon_stop(&ctx, 1); err = damon_lru_sort_apply_parameters(); if (err) @@ -400,9 +387,6 @@ static int damon_lru_sort_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = damon_kdamond_pid(ctx); - if (kdamond_pid < 0) - return kdamond_pid; return damon_call(ctx, &call_control); } @@ -430,42 +414,83 @@ module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); MODULE_PARM_DESC(addr_unit, "Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)"); +static bool damon_lru_sort_enabled(void) +{ + if (!ctx) + return false; + return damon_is_running(ctx); +} + static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { - bool is_enabled = enabled; - bool enable; int err; - err = kstrtobool(val, &enable); + err = kstrtobool(val, &enabled); if (err) return err; - if (is_enabled == enable) + if (damon_lru_sort_enabled() == enabled) return 0; /* Called before init function. The function will handle this. */ if (!damon_initialized()) - goto set_param_out; + return 0; - err = damon_lru_sort_turn(enable); - if (err) - return err; + return damon_lru_sort_turn(enabled); +} -set_param_out: - enabled = enable; - return err; +static int damon_lru_sort_enabled_load(char *buffer, + const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", damon_lru_sort_enabled() ? 'Y' : 'N'); } static const struct kernel_param_ops enabled_param_ops = { .set = damon_lru_sort_enabled_store, - .get = param_get_bool, + .get = damon_lru_sort_enabled_load, }; module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_LRU_SORT (default: disabled)"); +static int damon_lru_sort_kdamond_pid_store(const char *val, + const struct kernel_param *kp) +{ + /* + * kdamond_pid is read-only, but kernel command line could write it. + * Do nothing here. 
+ */ + return 0; +} + +static int damon_lru_sort_kdamond_pid_load(char *buffer, + const struct kernel_param *kp) +{ + int kdamond_pid = -1; + + if (ctx) { + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + kdamond_pid = -1; + } + return sprintf(buffer, "%d\n", kdamond_pid); +} + +static const struct kernel_param_ops kdamond_pid_param_ops = { + .set = damon_lru_sort_kdamond_pid_store, + .get = damon_lru_sort_kdamond_pid_load, +}; + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400); + static int __init damon_lru_sort_init(void) { int err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 86da14778658..fe7fce26cf6c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -144,15 +144,6 @@ static unsigned long addr_unit __read_mostly = 1; static bool skip_anon __read_mostly; module_param(skip_anon, bool, 0600); -/* - * PID of the DAMON thread - * - * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. - * Else, -1. - */ -static int kdamond_pid __read_mostly = -1; -module_param(kdamond_pid, int, 0400); - static struct damos_stat damon_reclaim_stat; DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, reclaim_tried_regions, reclaimed_regions, quota_exceeds); @@ -288,12 +279,8 @@ static int damon_reclaim_turn(bool on) { int err; - if (!on) { - err = damon_stop(&ctx, 1); - if (!err) - kdamond_pid = -1; - return err; - } + if (!on) + return damon_stop(&ctx, 1); err = damon_reclaim_apply_parameters(); if (err) @@ -302,9 +289,6 @@ static int damon_reclaim_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = damon_kdamond_pid(ctx); - if (kdamond_pid < 0) - return kdamond_pid; return damon_call(ctx, &call_control); } @@ -332,42 +316,83 @@ module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); MODULE_PARM_DESC(addr_unit, "Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)"); +static bool damon_reclaim_enabled(void) +{ + if (!ctx) + return false; + return damon_is_running(ctx); +} + static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { - bool is_enabled = enabled; - bool enable; int err; - err = kstrtobool(val, &enable); + err = kstrtobool(val, &enabled); if (err) return err; - if (is_enabled == enable) + if (damon_reclaim_enabled() == enabled) return 0; /* Called before init function. The function will handle this. */ if (!damon_initialized()) - goto set_param_out; + return 0; - err = damon_reclaim_turn(enable); - if (err) - return err; + return damon_reclaim_turn(enabled); +} -set_param_out: - enabled = enable; - return err; +static int damon_reclaim_enabled_load(char *buffer, + const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", damon_reclaim_enabled() ? 'Y' : 'N'); } static const struct kernel_param_ops enabled_param_ops = { .set = damon_reclaim_enabled_store, - .get = param_get_bool, + .get = damon_reclaim_enabled_load, }; module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); +static int damon_reclaim_kdamond_pid_store(const char *val, + const struct kernel_param *kp) +{ + /* + * kdamond_pid is read-only, but kernel command line could write it. + * Do nothing here. 
+ */ + return 0; +} + +static int damon_reclaim_kdamond_pid_load(char *buffer, + const struct kernel_param *kp) +{ + int kdamond_pid = -1; + + if (ctx) { + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + kdamond_pid = -1; + } + return sprintf(buffer, "%d\n", kdamond_pid); +} + +static const struct kernel_param_ops kdamond_pid_param_ops = { + .set = damon_reclaim_kdamond_pid_store, + .get = damon_reclaim_kdamond_pid_load, +}; + +/* + * PID of the DAMON thread + * + * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400); + static int __init damon_reclaim_init(void) { int err; diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 60351a719460..3951b762cbdd 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -19,14 +19,17 @@ static int damon_stat_enabled_store( const char *val, const struct kernel_param *kp); +static int damon_stat_enabled_load(char *buffer, + const struct kernel_param *kp); + static const struct kernel_param_ops enabled_param_ops = { .set = damon_stat_enabled_store, - .get = param_get_bool, + .get = damon_stat_enabled_load, }; static bool enabled __read_mostly = IS_ENABLED( CONFIG_DAMON_STAT_ENABLED_DEFAULT); -module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +module_param_cb(enabled, &enabled_param_ops, NULL, 0600); MODULE_PARM_DESC(enabled, "Enable of disable DAMON_STAT"); static unsigned long estimated_memory_bandwidth __read_mostly; @@ -255,8 +258,11 @@ static int damon_stat_start(void) if (!damon_stat_context) return -ENOMEM; err = damon_start(&damon_stat_context, 1, true); - if (err) + if (err) { + damon_destroy_ctx(damon_stat_context); + damon_stat_context = NULL; return err; + } damon_stat_last_refresh_jiffies = jiffies; call_control.data = damon_stat_context; @@ -270,17 +276,23 @@ static void damon_stat_stop(void) damon_stat_context = NULL; } +static bool damon_stat_enabled(void) +{ + if (!damon_stat_context) + return false; + return damon_is_running(damon_stat_context); +} + static int damon_stat_enabled_store( const char *val, const struct kernel_param *kp) { - bool is_enabled = enabled; int err; err = kstrtobool(val, &enabled); if (err) return err; - if (is_enabled == enabled) + if (damon_stat_enabled() == enabled) return 0; if (!damon_initialized()) @@ -290,16 +302,17 @@ static int damon_stat_enabled_store( */ return 0; - if (enabled) { - err = damon_stat_start(); - if (err) - enabled = false; - return err; - } + if (enabled) + return damon_stat_start(); damon_stat_stop(); return 0; } +static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N'); +} + static int __init damon_stat_init(void) { int err = 0; diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 5186966dafb3..245d63808411 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -533,9 +533,14 @@ static ssize_t memcg_path_show(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); + int len; - return sysfs_emit(buf, "%s\n", + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + len = sysfs_emit(buf, "%s\n", filter->memcg_path ? 
filter->memcg_path : ""); + mutex_unlock(&damon_sysfs_lock); + return len; } static ssize_t memcg_path_store(struct kobject *kobj, @@ -550,8 +555,13 @@ static ssize_t memcg_path_store(struct kobject *kobj, return -ENOMEM; strscpy(path, buf, count + 1); + if (!mutex_trylock(&damon_sysfs_lock)) { + kfree(path); + return -EBUSY; + } kfree(filter->memcg_path); filter->memcg_path = path; + mutex_unlock(&damon_sysfs_lock); return count; } @@ -1187,8 +1197,13 @@ static ssize_t path_show(struct kobject *kobj, { struct damos_sysfs_quota_goal *goal = container_of(kobj, struct damos_sysfs_quota_goal, kobj); + int len; - return sysfs_emit(buf, "%s\n", goal->path ? goal->path : ""); + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + len = sysfs_emit(buf, "%s\n", goal->path ? goal->path : ""); + mutex_unlock(&damon_sysfs_lock); + return len; } static ssize_t path_store(struct kobject *kobj, @@ -1203,8 +1218,13 @@ static ssize_t path_store(struct kobject *kobj, return -ENOMEM; strscpy(path, buf, count + 1); + if (!mutex_trylock(&damon_sysfs_lock)) { + kfree(path); + return -EBUSY; + } kfree(goal->path); goal->path = path; + mutex_unlock(&damon_sysfs_lock); return count; } diff --git a/mm/filemap.c b/mm/filemap.c index c568d9058ff8..4e636647100c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -228,7 +228,8 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) page_cache_delete(mapping, folio, shadow); } -void filemap_free_folio(struct address_space *mapping, struct folio *folio) +static void filemap_free_folio(const struct address_space *mapping, + struct folio *folio) { void (*free_folio)(struct folio *); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 42c983821c03..970e077019b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1218,13 +1218,29 @@ retry: static struct deferred_split *folio_split_queue_lock(struct folio *folio) { - return split_queue_lock(folio_nid(folio), folio_memcg(folio)); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock(folio_nid(folio), folio_memcg(folio)); + /* + * The memcg destruction path is acquiring the split queue lock for + * reparenting. Once you have it locked, it's safe to drop the rcu lock. 
+ */ + rcu_read_unlock(); + + return queue; } static struct deferred_split * folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags) { - return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + struct deferred_split *queue; + + rcu_read_lock(); + queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags); + rcu_read_unlock(); + + return queue; } static inline void split_queue_unlock(struct deferred_split *queue) @@ -3994,7 +4010,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1); if (do_lru) - unlock_page_lruvec(lruvec); + lruvec_unlock(lruvec); if (ci) swap_cluster_unlock(ci); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9413ed497be5..f24bf49be047 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4218,6 +4218,9 @@ static __init int hugetlb_add_param(char *s, int (*setup)(char *)) size_t len; char *p; + if (!s) + return -EINVAL; + if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; @@ -4784,6 +4787,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops = { + .can_userfault = hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. @@ -4797,6 +4812,9 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &hugetlb_uffd_ops, +#endif }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f83ae4998990..7693ccefd0c6 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -204,6 +204,7 @@ void __init hugetlb_cma_reserve(void) */ per_node = DIV_ROUND_UP(hugetlb_cma_size, nodes_weight(hugetlb_bootmem_nodes)); + per_node = round_up(per_node, PAGE_SIZE << order); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } diff --git a/mm/internal.h b/mm/internal.h index c693646e5b3f..5a2ddcf68e0b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -557,7 +557,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); -void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); @@ -1322,7 +1321,17 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); +static inline bool deferred_pages_enabled(void) +{ + return static_branch_unlikely(&deferred_pages); +} + bool __init deferred_grow_zone(struct zone *zone, unsigned int order); +#else +static inline bool deferred_pages_enabled(void) +{ + return false; +} #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void init_deferred_page(unsigned long pfn, int nid); diff --git a/mm/kfence/core.c b/mm/kfence/core.c 
index 9eba46212edf..655dc5ce3240 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -736,10 +736,10 @@ static bool __init kfence_init_pool_early(void) * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ - memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + memblock_free((void *)addr, KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; - memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); + memblock_free(kfence_metadata_init, KFENCE_METADATA_SIZE); kfence_metadata_init = NULL; return false; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fa8201e23222..2eff0d6b622b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -241,7 +241,7 @@ static int kmemleak_skip_disable; /* If there are leaks that can be reported */ static bool kmemleak_found_leaks; -static bool kmemleak_verbose; +static bool kmemleak_verbose = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_VERBOSE); module_param_named(verbose, kmemleak_verbose, bool, 0600); static void kmemleak_disable(void); diff --git a/mm/memblock.c b/mm/memblock.c index 2505ce8b319c..a6a1c91e276d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/mutex.h> +#include <linux/string_helpers.h> #ifdef CONFIG_KEXEC_HANDOVER #include <linux/libfdt.h> @@ -384,26 +385,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u */ void __init memblock_discard(void) { - phys_addr_t addr, size; + phys_addr_t size; + void *addr; if (memblock.reserved.regions != memblock_reserved_init_regions) { - addr = __pa(memblock.reserved.regions); + addr = memblock.reserved.regions; size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); if (memblock_reserved_in_slab) - kfree(memblock.reserved.regions); + kfree(addr); else - memblock_free_late(addr, size); + memblock_free(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { - addr = __pa(memblock.memory.regions); + addr = memblock.memory.regions; size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); if (memblock_memory_in_slab) - kfree(memblock.memory.regions); + kfree(addr); else - memblock_free_late(addr, size); + memblock_free(addr, size); } memblock_memory = NULL; @@ -893,13 +895,81 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.memory, base, size); } +static unsigned long __free_reserved_area(phys_addr_t start, phys_addr_t end, + int poison) +{ + unsigned long pages = 0, pfn; + + if (deferred_pages_enabled()) { + WARN(1, "Cannot free reserved memory because of deferred initialization of the memory map"); + return 0; + } + + for_each_valid_pfn(pfn, PFN_UP(start), PFN_DOWN(end)) { + struct page *page = pfn_to_page(pfn); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from the kernel virtual + * address because some architectures use aliases. + * Going via physical address, pfn_to_page() and page_address() + * ensures that we get a _writeable_ alias for the memset(). + */ + direct_map_addr = page_address(page); + /* + * Perform a kasan-unchecked memset() since this memory + * has not been initialized. 
+ */ + direct_map_addr = kasan_reset_tag(direct_map_addr); + if ((unsigned int)poison <= 0xFF) + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); + pages++; + } + return pages; +} + +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) +{ + phys_addr_t start_pa, end_pa; + unsigned long pages; + + /* + * end is the first address past the region and it may be beyond what + * __pa() or __pa_symbol() can handle. + * Use the address included in the range for the conversion and add back + * 1 afterwards. + */ + if (__is_kernel((unsigned long)start)) { + start_pa = __pa_symbol(start); + end_pa = __pa_symbol(end - 1) + 1; + } else { + start_pa = __pa(start); + end_pa = __pa(end - 1) + 1; + } + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + if (start_pa < end_pa) + memblock_remove_range(&memblock.reserved, + start_pa, end_pa - start_pa); + } + + pages = __free_reserved_area(start_pa, end_pa, poison); + if (pages && s) + pr_info("Freeing %s memory: %ldK\n", s, K(pages)); + + return pages; +} + /** * memblock_free - free boot memory allocation * @ptr: starting address of the boot memory allocation * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. - * The freeing memory will not be released to the buddy allocator. + * If called after the buddy allocator is available, the memory is released to + * the buddy allocator. */ void __init_memblock memblock_free(void *ptr, size_t size) { @@ -913,17 +983,24 @@ void __init_memblock memblock_free(void *ptr, size_t size) * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_phys_alloc_xx() API. - * The freeing memory will not be released to the buddy allocator. + * If called after the buddy allocator is available, the memory is released to + * the buddy allocator. */ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; + int ret; memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); - return memblock_remove_range(&memblock.reserved, base, size); + ret = memblock_remove_range(&memblock.reserved, base, size); + + if (slab_is_available()) + __free_reserved_area(base, base + size, -1); + + return ret; } int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, @@ -973,7 +1050,7 @@ __init void memmap_init_kho_scratch_pages(void) /* * Initialize struct pages for free scratch memory. * The struct pages for reserved scratch memory will be set up in - * reserve_bootmem_region() + * memmap_init_reserved_pages() */ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { @@ -1766,32 +1843,6 @@ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, return addr; } -/** - * memblock_free_late - free pages directly to buddy allocator - * @base: phys starting address of the boot memory block - * @size: size of the boot memory block in bytes - * - * This is only useful when the memblock allocator has already been torn - * down, but we are still initializing the system. Pages are released directly - * to the buddy allocator. 
- */ -void __init memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - phys_addr_t cursor, end; - - end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", - __func__, &base, &end, (void *)_RET_IP_); - kmemleak_free_part_phys(base, size); - cursor = PFN_UP(base); - end = PFN_DOWN(base + size); - - for (; cursor < end; cursor++) { - memblock_free_pages(cursor, 0); - totalram_pages_inc(); - } -} - /* * Remaining API functions */ @@ -2255,6 +2306,31 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return end_pfn - start_pfn; } +/* + * Initialised pages do not have PageReserved set. This function is called + * for each reserved range and marks the pages PageReserved. + * When deferred initialization of struct pages is enabled it also ensures + * that struct pages are properly initialised. + */ +static void __init memmap_init_reserved_range(phys_addr_t start, + phys_addr_t end, int nid) +{ + unsigned long pfn; + + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); + + init_deferred_page(pfn, nid); + + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); + } +} + static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; @@ -2274,7 +2350,7 @@ repeat: end = start + region->size; if (memblock_is_nomap(region)) - reserve_bootmem_region(start, end, nid); + memmap_init_reserved_range(start, end, nid); memblock_set_node(start, region->size, &memblock.reserved, nid); } @@ -2299,7 +2375,7 @@ repeat: if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); - reserve_bootmem_region(start, end, nid); + memmap_init_reserved_range(start, end, nid); } } } @@ -2449,7 +2525,7 @@ int reserve_mem_release_by_name(const char *name) return 0; start = phys_to_virt(map->start); - end = start + map->size - 1; + end = start + map->size; snprintf(buf, sizeof(buf), "reserve_mem:%s", name); free_reserved_area(start, end, 0, buf); map->size = 0; @@ -2525,7 +2601,7 @@ static int __init prepare_kho_fdt(void) if (err) goto err_unpreserve_fdt; - err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt)); if (err) goto err_unpreserve_fdt; @@ -2570,7 +2646,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void) if (fdt) return fdt; - err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL); if (err) { if (err != -ENOENT) pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", @@ -2657,23 +2733,25 @@ static int __init reserve_mem(char *p) int len; if (!p) - return -EINVAL; + goto err_param; /* Check if there's room for more reserved memory */ - if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) + if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) { + pr_err("reserve_mem: no more room for reserved memory\n"); return -EBUSY; + } oldp = p; size = memparse(p, &p); if (!size || p == oldp) - return -EINVAL; + goto err_param; if (*p != ':') - return -EINVAL; + goto err_param; align = memparse(p+1, &p); if (*p != ':') - return -EINVAL; + goto err_param; /* * memblock_phys_alloc() doesn't like a zero size align, @@ -2687,7 +2765,7 @@ static int __init reserve_mem(char *p) /* name needs to have length but not too big */ if (!len || len >= RESERVE_MEM_NAME_SIZE) - return -EINVAL; + goto err_param; /* Make sure that name has text */ for (p = name; *p; p++) { @@ -2695,11 +2773,13 @@ static int __init reserve_mem(char 
*p) break; } if (!*p) - return -EINVAL; + goto err_param; /* Make sure the name is not already used */ - if (reserve_mem_find_by_name(name, &start, &tmp)) + if (reserve_mem_find_by_name(name, &start, &tmp)) { + pr_err("reserve_mem: name \"%s\" was already used\n", name); return -EBUSY; + } /* Pick previous allocations up from KHO if available */ if (reserve_mem_kho_revive(name, size, align)) @@ -2707,16 +2787,22 @@ static int __init reserve_mem(char *p) /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); - if (!start) + if (!start) { + pr_err("reserve_mem: memblock allocation failed\n"); return -ENOMEM; + } reserved_mem_add(start, size, name); return 1; +err_param: + pr_err("reserve_mem: empty or malformed parameter\n"); + return -EINVAL; } __setup("reserve_mem=", reserve_mem); -#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static const char * const flagname[] = { [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG", [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", @@ -2763,10 +2849,8 @@ static int memblock_debug_show(struct seq_file *m, void *private) } DEFINE_SHOW_ATTRIBUTE(memblock_debug); -static int __init memblock_init_debugfs(void) +static inline void memblock_debugfs_expose_arrays(struct dentry *root) { - struct dentry *root = debugfs_create_dir("memblock", NULL); - debugfs_create_file("memory", 0444, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", 0444, root, @@ -2775,7 +2859,48 @@ static int __init memblock_init_debugfs(void) debugfs_create_file("physmem", 0444, root, &physmem, &memblock_debug_fops); #endif +} + +#else + +static inline void memblock_debugfs_expose_arrays(struct dentry *root) { } + +#endif /* CONFIG_ARCH_KEEP_MEMBLOCK */ + +static int memblock_reserve_mem_show(struct seq_file *m, void *private) +{ + struct reserve_mem_table *map; + char txtsz[16]; + + guard(mutex)(&reserve_mem_lock); + for (int i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + + memset(txtsz, 0, sizeof(txtsz)); + string_get_size(map->size, 1, STRING_UNITS_2, txtsz, sizeof(txtsz)); + seq_printf(m, "%s\t\t(%s)\n", map->name, txtsz); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(memblock_reserve_mem); + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root; + + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !reserved_mem_count) + return 0; + + root = debugfs_create_dir("memblock", NULL); + + if (reserved_mem_count) + debugfs_create_file("reserve_mem_param", 0444, root, NULL, + &memblock_reserve_mem_fops); + memblock_debugfs_expose_arrays(root); return 0; } __initcall(memblock_init_debugfs); diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 437cd25784fe..433bba9dfe71 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) void memcg1_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + struct obj_cgroup *objcg; unsigned int nr_entries; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -624,12 +625,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) if (!do_memsw_account()) return; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* * In case the memcg owning these pages has 
been offlined and doesn't * have an ID allocated to it anymore, charge the closest online @@ -644,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) folio_unqueue_deferred_split(folio); folio->memcg_data = 0; - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { @@ -665,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) preempt_enable_nested(); memcg1_check_events(memcg, folio_nid(folio)); - css_put(&memcg->css); + rcu_read_unlock(); + obj_cgroup_put(objcg); } /* @@ -1884,6 +1887,22 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) + reparent_memcg_state_local(memcg, parent, memcg1_stats[i]); +} + +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + int i; + + for (i = 0; i < NR_LRU_LISTS; i++) + reparent_memcg_lruvec_state_local(memcg, parent, i); +} + void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { unsigned long memory, memsw; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 1b969294ea6a..f92f81108d5e 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_memory, int nid); void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); +void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); +void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent); + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx); void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages); static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 051b82ebf371..c03d4787d466 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -206,26 +206,100 @@ static struct obj_cgroup *obj_cgroup_alloc(void) return objcg; } -static void memcg_reparent_objcgs(struct mem_cgroup *memcg, - struct mem_cgroup *parent) +static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg, + struct mem_cgroup *parent, + int nid) { struct obj_cgroup *objcg, *iter; + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid]; - objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - - spin_lock_irq(&objcg_lock); - + objcg = rcu_replace_pointer(pn->objcg, NULL, true); /* 1) Ready to reparent active objcg. */ - list_add(&objcg->list, &memcg->objcg_list); + list_add(&objcg->list, &pn->objcg_list); /* 2) Reparent active objcg and already reparented objcgs to parent. 
*/ - list_for_each_entry(iter, &memcg->objcg_list, list) + list_for_each_entry(iter, &pn->objcg_list, list) WRITE_ONCE(iter->memcg, parent); /* 3) Move already reparented objcgs to the parent's list */ - list_splice(&memcg->objcg_list, &parent->objcg_list); + list_splice(&pn->objcg_list, &parent_pn->objcg_list); + + return objcg; +} +#ifdef CONFIG_MEMCG_V1 +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force); + +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + /* + * Reparent stats exposed non-hierarchically. Flush @memcg's stats first + * to read its stats accurately , and conservatively flush @parent's + * stats after reparenting to avoid hiding a potentially large stat + * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()). + */ + __mem_cgroup_flush_stats(memcg, true); + + /* The following counts are all non-hierarchical and need to be reparented. */ + reparent_memcg1_state_local(memcg, parent); + reparent_memcg1_lruvec_state_local(memcg, parent); + + __mem_cgroup_flush_stats(parent, true); +} +#else +static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ +} +#endif + +static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_lock_irq(&objcg_lock); + spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1); + spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2); +} + +static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock); + spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock); spin_unlock_irq(&objcg_lock); +} + +static void memcg_reparent_objcgs(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + int nid; + + for_each_node(nid) { +retry: + if (lru_gen_enabled()) + max_lru_gen_memcg(parent, nid); + + reparent_locks(memcg, parent, nid); + + if (lru_gen_enabled()) { + if (!recheck_lru_gen_max_memcg(parent, nid)) { + reparent_unlocks(memcg, parent, nid); + cond_resched(); + goto retry; + } + lru_gen_reparent_memcg(memcg, parent, nid); + } else { + lru_reparent_memcg(memcg, parent, nid); + } - percpu_ref_kill(&objcg->refcnt); + objcg = __memcg_reparent_objcgs(memcg, parent, nid); + + reparent_unlocks(memcg, parent, nid); + + percpu_ref_kill(&objcg->refcnt); + } + + reparent_state_local(memcg, parent); } /* @@ -241,7 +315,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); EXPORT_SYMBOL(memcg_bpf_enabled_key); /** - * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated @@ -251,14 +325,16 @@ EXPORT_SYMBOL(memcg_bpf_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. 
*/ -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) - memcg = root_mem_cgroup; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - return &memcg->css; + memcg = get_mem_cgroup_from_folio(folio); + + return memcg ? &memcg->css : &root_mem_cgroup->css; } /** @@ -449,6 +525,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +#ifdef CONFIG_MEMCG_V1 +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val); + +void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + int nid; + + for_each_node(nid) { + struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + unsigned long value = lruvec_page_state_local(child_lruvec, idx); + struct mem_cgroup_per_node *child_pn, *parent_pn; + + child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec); + parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec); + + __mod_memcg_lruvec_state(child_pn, idx, -value); + __mod_memcg_lruvec_state(parent_pn, idx, value); + } +} +#endif + /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { #ifdef CONFIG_MEMCG_V1 @@ -508,7 +608,7 @@ static inline int memcg_events_index(enum vm_event_item idx) struct memcg_vmstats_percpu { /* Stats updates since the last flush */ - unsigned int stats_updates; + unsigned long stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ struct memcg_vmstats_percpu __percpu *parent_pcpu; @@ -539,7 +639,7 @@ struct memcg_vmstats { unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ - atomic_t stats_updates; + atomic_long_t stats_updates; }; /* @@ -565,16 +665,16 @@ static u64 flush_last_time; static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { - return atomic_read(&vmstats->stats_updates) > + return atomic_long_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, long val, int cpu) { struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - unsigned int stats_updates; + unsigned long stats_updates; if (!val) return; @@ -597,7 +697,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, continue; stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); - atomic_add(stats_updates, &statc->vmstats->stats_updates); + atomic_long_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -605,7 +705,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); - trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates), + trace_memcg_flush_stats(memcg, atomic_long_read(&memcg->vmstats->stats_updates), force, needs_flush); if (!force && !needs_flush) @@ -684,31 +784,70 @@ static int memcg_page_state_unit(int item); * Normalize the value passed into memcg_rstat_updated() to be in pages. 
Round * up non-zero sub-page updates to 1 page as zero page updates are ignored. */ -static int memcg_state_val_in_pages(int idx, int val) +static long memcg_state_val_in_pages(int idx, long val) { int unit = memcg_page_state_unit(idx); + long res; if (!val || unit == PAGE_SIZE) return val; - else - return max(val * unit / PAGE_SIZE, 1UL); + + /* Get the absolute value of (val * unit / PAGE_SIZE). */ + res = mult_frac(abs(val), unit, PAGE_SIZE); + /* Round up zero values. */ + res = res ? : 1; + + return val < 0 ? -res : res; } -/** - * mod_memcg_state - update cgroup memory statistics - * @memcg: the memory cgroup - * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item - * @val: delta to add to the counter, can be negative +#ifdef CONFIG_MEMCG_V1 +/* + * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with + * reparenting of non-hierarchical state_locals. */ -void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, - int val) +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg, + bool *rcu_locked) { - int i = memcg_stats_index(idx); - int cpu; + /* Rebinding can cause this value to be changed at runtime */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + *rcu_locked = false; + return memcg; + } - if (mem_cgroup_disabled()) + rcu_read_lock(); + *rcu_locked = true; + + while (memcg_is_dying(memcg)) + memcg = parent_mem_cgroup(memcg); + + return memcg; +} + +static inline void get_non_dying_memcg_end(bool rcu_locked) +{ + if (!rcu_locked) return; + rcu_read_unlock(); +} +#else +static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg, + bool *rcu_locked) +{ + return memcg; +} + +static inline void get_non_dying_memcg_end(bool rcu_locked) +{ +} +#endif + +static void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, long val) +{ + int i = memcg_stats_index(idx); + int cpu; + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -717,11 +856,31 @@ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val, cpu); + trace_mod_memcg_state(memcg, idx, val); put_cpu(); } +/** + * mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, + int val) +{ + bool rcu_locked = false; + + if (mem_cgroup_disabled()) + return; + + memcg = get_non_dying_memcg_start(memcg, &rcu_locked); + __mod_memcg_state(memcg, idx, val); + get_non_dying_memcg_end(rcu_locked); +} + #ifdef CONFIG_MEMCG_V1 /* idx can be of type enum memcg_stat_item or node_stat_item. 
*/ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) @@ -739,23 +898,27 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) #endif return x; } + +void reparent_memcg_state_local(struct mem_cgroup *memcg, + struct mem_cgroup *parent, int idx) +{ + unsigned long value = memcg_page_state_local(memcg, idx); + + __mod_memcg_state(memcg, idx, -value); + __mod_memcg_state(parent, idx, value); +} #endif -static void mod_memcg_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, - int val) +static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn, + enum node_stat_item idx, long val) { - struct mem_cgroup_per_node *pn; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = pn->memcg; int i = memcg_stats_index(idx); int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - memcg = pn->memcg; - cpu = get_cpu(); /* Update memcg */ @@ -771,6 +934,24 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec, put_cpu(); } +static void mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, + int val) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + bool rcu_locked = false; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = get_non_dying_memcg_start(pn->memcg, &rcu_locked); + pn = memcg->nodeinfo[pgdat->node_id]; + + __mod_memcg_lruvec_state(pn, idx, val); + + get_non_dying_memcg_end(rcu_locked); +} + /** * mod_lruvec_state - update lruvec memory statistics * @lruvec: the lruvec @@ -991,17 +1172,23 @@ again: /** * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. * @folio: folio from which memcg should be extracted. + * + * See folio_memcg() for folio->objcg/memcg binding rules. */ struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return NULL; + if (!folio_memcg_charged(folio)) + return root_mem_cgroup; + rcu_read_lock(); - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; + do { + memcg = folio_memcg(folio); + } while (unlikely(!css_tryget(&memcg->css))); rcu_read_unlock(); return memcg; } @@ -1198,23 +1385,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, } } -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return; - - memcg = folio_memcg(folio); - - if (!memcg) - VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); - else - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); -} -#endif - /** * folio_lruvec_lock - Lock the lruvec for a folio. * @folio: Pointer to the folio. @@ -1224,14 +1394,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) * - folio_test_lru false * - folio frozen (refcount of 0) * - * Return: The lruvec this folio is on with its lock held. + * Return: The lruvec this folio is on with its lock held and rcu read lock held. 
*/ struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1246,14 +1422,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irq(&lruvec->lru_lock); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + goto retry; + } return lruvec; } @@ -1269,15 +1451,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts - * disabled. + * disabled and rcu read lock held. */ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags) { - struct lruvec *lruvec = folio_lruvec(folio); + struct lruvec *lruvec; + rcu_read_lock(); +retry: + lruvec = folio_lruvec(folio); spin_lock_irqsave(&lruvec->lru_lock, *flags); - lruvec_memcg_debug(lruvec, folio); + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); + goto retry; + } return lruvec; } @@ -1293,7 +1481,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * to or just after a page is removed from an lru list. 
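The lock/recheck/retry shape used by the folio_lruvec_lock*() variants above is a general pattern; a small pthread-based sketch of the same idea, with invented object/owner types standing in for folio and lruvec (and with owners assumed never to be freed, which RCU guarantees in the kernel):

#include <pthread.h>
#include <stdatomic.h>

struct owner {
	pthread_mutex_t lock;
};

struct object {
	_Atomic(struct owner *) owner;	/* may be re-bound concurrently */
};

/*
 * Lock the object's current owner. The owner pointer can change until the
 * owner's lock is held, so re-read it after locking and retry if it moved,
 * mirroring the lruvec_memcg() != folio_memcg() recheck above.
 */
struct owner *object_lock_owner(struct object *obj)
{
	struct owner *o;

retry:
	o = atomic_load(&obj->owner);
	pthread_mutex_lock(&o->lock);
	if (atomic_load(&obj->owner) != o) {
		pthread_mutex_unlock(&o->lock);
		goto retry;
	}
	return o;	/* caller drops the lock when done */
}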
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages) + int zid, long nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; @@ -1310,7 +1498,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, size = *lru_size; if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", + "%s(%p, %d, %ld): lru_size %ld\n", __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; @@ -2581,17 +2769,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) +static void commit_charge(struct folio *folio, struct obj_cgroup *objcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* - * Any of the following ensures page's memcg stability: + * Any of the following ensures folio's objcg stability: * * - the page lock * - LRU isolation * - exclusive reference */ - folio->memcg_data = (unsigned long)memcg; + folio->memcg_data = (unsigned long)objcg; } #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC @@ -2693,14 +2881,26 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p) static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg = NULL; + int nid = numa_node_id(); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg); - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { - objcg = rcu_dereference(memcg->objcg); if (likely(objcg && obj_cgroup_tryget(objcg))) - break; - objcg = NULL; + return objcg; } + + return NULL; +} + +static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + + rcu_read_lock(); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + return objcg; } @@ -2759,6 +2959,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) { struct mem_cgroup *memcg; struct obj_cgroup *objcg; + int nid = numa_node_id(); if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi()) return NULL; @@ -2775,53 +2976,39 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) * Objcg reference is kept by the task, so it's safe * to use the objcg by the current task. */ - return objcg; + return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } memcg = this_cpu_read(int_active_memcg); if (unlikely(memcg)) goto from_memcg; - return NULL; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); from_memcg: - objcg = NULL; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { /* * Memcg pointer is protected by scope (see set_active_memcg()) * and is pinning the corresponding objcg, so objcg can't go * away and can be used within the scope without any additional * protection. 
*/ - objcg = rcu_dereference_check(memcg->objcg, 1); + objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1); if (likely(objcg)) - break; + return objcg; } - return objcg; + return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1); } struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; - if (!memcg_kmem_online()) - return NULL; - - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); + objcg = folio_objcg(folio); + if (objcg) obj_cgroup_get(objcg); - } else { - struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = __folio_memcg(folio); - if (memcg) - objcg = __get_obj_cgroup_from_memcg(memcg); - else - objcg = NULL; - rcu_read_unlock(); - } return objcg; } @@ -2922,7 +3109,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) int ret = 0; objcg = current_obj_cgroup(); - if (objcg) { + if (objcg && !obj_cgroup_is_root(objcg)) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { obj_cgroup_get(objcg); @@ -3251,7 +3438,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * obj_cgroup_get() is used to get a permanent reference. */ objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; /* @@ -3383,33 +3570,20 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, return; new_refs = (1 << (old_order - new_order)) - 1; - css_get_many(&__folio_memcg(folio)->css, new_refs); + obj_cgroup_get_many(folio_objcg(folio), new_refs); } -static int memcg_online_kmem(struct mem_cgroup *memcg) +static void memcg_online_kmem(struct mem_cgroup *memcg) { - struct obj_cgroup *objcg; - if (mem_cgroup_kmem_disabled()) - return 0; + return; if (unlikely(mem_cgroup_is_root(memcg))) - return 0; - - objcg = obj_cgroup_alloc(); - if (!objcg) - return -ENOMEM; - - objcg->memcg = memcg; - rcu_assign_pointer(memcg->objcg, objcg); - obj_cgroup_get(objcg); - memcg->orig_objcg = objcg; + return; static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; - - return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) @@ -3423,16 +3597,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) return; parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - memcg_reparent_list_lrus(memcg, parent); - - /* - * Objcg's reparenting must be after list_lru's, make sure list_lru - * helpers won't use parent's list_lru until child is drained. 
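The parent walk used by __get_obj_cgroup_from_memcg() and current_obj_cgroup() above reduces to "take the nearest ancestor's node-local reference that is still live"; a stand-alone sketch with invented group/objref types (tryget is modelled as a simple flag):

#include <stddef.h>
#include <stdbool.h>

#define MAX_NODES 4

struct objref {
	bool live;
};

struct group {
	struct group *parent;			/* NULL at the root */
	struct objref *per_node[MAX_NODES];	/* may be NULL or dead */
};

/*
 * Walk from @g towards the root and return the first per-node reference
 * that is still usable; the caller falls back to the root's reference if
 * nothing along the chain is live.
 */
struct objref *group_get_node_ref(struct group *g, int nid)
{
	for (; g; g = g->parent) {
		struct objref *ref = g->per_node[nid];

		if (ref && ref->live)
			return ref;
	}
	return NULL;
}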
- */ - memcg_reparent_objcgs(memcg, parent); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -3705,8 +3870,6 @@ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, un break; } memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; } return memcg; } @@ -3771,6 +3934,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn->lruvec_stats_percpu) goto fail; + INIT_LIST_HEAD(&pn->objcg_list); + lruvec_init(&pn->lruvec); pn->memcg = memcg; @@ -3785,10 +3950,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + continue; - for_each_node(node) - free_mem_cgroup_per_node_info(memcg->nodeinfo[node]); + obj_cgroup_put(pn->orig_objcg); + free_mem_cgroup_per_node_info(pn); + } memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); @@ -3859,7 +4028,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) #endif memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->objcg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) @@ -3935,9 +4103,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct obj_cgroup *objcg; + int nid; - if (memcg_online_kmem(memcg)) - goto remove_id; + memcg_online_kmem(memcg); /* * A memcg must be visible for expand_shrinker_info() @@ -3947,6 +4116,20 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; + for_each_node(nid) { + objcg = obj_cgroup_alloc(); + if (!objcg) + goto free_objcg; + + if (unlikely(mem_cgroup_is_root(memcg))) + objcg->is_root = true; + + objcg->memcg = memcg; + rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg); + obj_cgroup_get(objcg); + memcg->nodeinfo[nid]->orig_objcg = objcg; + } + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); @@ -3969,9 +4152,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; +free_objcg: + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + + objcg = rcu_replace_pointer(pn->objcg, NULL, true); + if (objcg) + percpu_ref_kill(&objcg->refcnt); + + if (pn->orig_objcg) { + obj_cgroup_put(pn->orig_objcg); + /* + * Reset pn->orig_objcg to NULL to prevent + * obj_cgroup_put() from being called again in + * __mem_cgroup_free(). + */ + pn->orig_objcg = NULL; + } + } + free_shrinker_info(memcg); offline_kmem: memcg_offline_kmem(memcg); -remove_id: mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3989,6 +4190,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_deferred_split_queue(memcg); + /* + * The reparenting of objcg must be after the reparenting of the + * list_lru and deferred_split_queue above, which ensures that they will + * not mistakenly get the parent list_lru and deferred_split_queue. 
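The per-node objcg setup and the free_objcg error path in mem_cgroup_css_online() above follow the usual allocate-all-or-unwind shape; a compact userspace sketch of that error handling, with invented types and a fixed node count:

#include <stdlib.h>
#include <errno.h>

#define MAX_NODES 4

struct per_node {
	int data;
};

static struct per_node *node_obj[MAX_NODES];

/* Allocate one object per node; on failure, free what was installed so far. */
int setup_per_node(void)
{
	int nid;

	for (nid = 0; nid < MAX_NODES; nid++) {
		node_obj[nid] = calloc(1, sizeof(*node_obj[nid]));
		if (!node_obj[nid])
			goto unwind;
	}
	return 0;

unwind:
	while (nid--) {
		free(node_obj[nid]);
		node_obj[nid] = NULL;	/* keep later teardown from double-freeing */
	}
	return -ENOMEM;
}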
+ */ + memcg_reparent_objcgs(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); lru_gen_offline_memcg(memcg); @@ -4221,8 +4428,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ - if (atomic_read(&memcg->vmstats->stats_updates)) - atomic_set(&memcg->vmstats->stats_updates, 0); + if (atomic_long_read(&memcg->vmstats->stats_updates)) + atomic_long_set(&memcg->vmstats->stats_updates, 0); } static void mem_cgroup_fork(struct task_struct *task) @@ -4799,16 +5006,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - int ret; - - ret = try_charge(memcg, gfp, folio_nr_pages(folio)); - if (ret) - goto out; + int ret = 0; + struct obj_cgroup *objcg; - css_get(&memcg->css); - commit_charge(folio, memcg); + objcg = get_obj_cgroup_from_memcg(memcg); + /* Do not account at the root objcg level. */ + if (!obj_cgroup_is_root(objcg)) + ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio)); + if (ret) { + obj_cgroup_put(objcg); + return ret; + } + commit_charge(folio, objcg); memcg1_commit_charge(folio, memcg); -out: + return ret; } @@ -4894,7 +5105,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, } struct uncharge_gather { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; @@ -4908,58 +5119,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(ug->objcg); if (ug->nr_memory) { - memcg_uncharge(ug->memcg, ug->nr_memory); + memcg_uncharge(memcg, ug->nr_memory); if (ug->nr_kmem) { - mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); - memcg1_account_kmem(ug->memcg, -ug->nr_kmem); + mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem); + memcg1_account_kmem(memcg, -ug->nr_kmem); } - memcg1_oom_recover(ug->memcg); + memcg1_oom_recover(memcg); } - memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); + memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid); + rcu_read_unlock(); /* drop reference from uncharge_folio */ - css_put(&ug->memcg->css); + obj_cgroup_put(ug->objcg); } static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) { long nr_pages; - struct mem_cgroup *memcg; struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Nobody should be changing or seriously looking at - * folio memcg or objcg at this point, we have fully - * exclusive access to the folio. + * folio objcg at this point, we have fully exclusive + * access to the folio. */ - if (folio_memcg_kmem(folio)) { - objcg = __folio_objcg(folio); - /* - * This get matches the put at the end of the function and - * kmem pages do not hold memcg references anymore. 
- */ - memcg = get_mem_cgroup_from_objcg(objcg); - } else { - memcg = __folio_memcg(folio); - } - - if (!memcg) + objcg = folio_objcg(folio); + if (!objcg) return; - if (ug->memcg != memcg) { - if (ug->memcg) { + if (ug->objcg != objcg) { + if (ug->objcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = memcg; + ug->objcg = objcg; ug->nid = folio_nid(folio); - /* pairs with css_put in uncharge_batch */ - css_get(&memcg->css); + /* pairs with obj_cgroup_put in uncharge_batch */ + obj_cgroup_get(objcg); } nr_pages = folio_nr_pages(folio); @@ -4967,20 +5172,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) if (folio_memcg_kmem(folio)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - - folio->memcg_data = 0; - obj_cgroup_put(objcg); } else { /* LRU pages aren't accounted at the root level */ - if (!mem_cgroup_is_root(memcg)) + if (!obj_cgroup_is_root(objcg)) ug->nr_memory += nr_pages; ug->pgpgout++; WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); - folio->memcg_data = 0; } - css_put(&memcg->css); + folio->memcg_data = 0; + obj_cgroup_put(objcg); } void __mem_cgroup_uncharge(struct folio *folio) @@ -5004,7 +5206,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) uncharge_gather_clear(&ug); for (i = 0; i < folios->nr; i++) uncharge_folio(folios->folios[i], &ug); - if (ug.memcg) + if (ug.objcg) uncharge_batch(&ug); } @@ -5021,6 +5223,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; + struct obj_cgroup *objcg; long nr_pages = folio_nr_pages(new); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); @@ -5035,21 +5238,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) if (folio_memcg_charged(new)) return; - memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); - if (!memcg) + objcg = folio_objcg(old); + VM_WARN_ON_ONCE_FOLIO(!objcg, old); + if (!objcg) return; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); /* Force-charge the new page. The old one will be freed soon */ - if (!mem_cgroup_is_root(memcg)) { + if (!obj_cgroup_is_root(objcg)) { page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); } - css_get(&memcg->css); - commit_charge(new, memcg); + obj_cgroup_get(objcg); + commit_charge(new, objcg); memcg1_commit_charge(new, memcg); + rcu_read_unlock(); } /** @@ -5065,7 +5271,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) */ void mem_cgroup_migrate(struct folio *old, struct folio *new) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -5076,18 +5282,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) if (mem_cgroup_disabled()) return; - memcg = folio_memcg(old); + objcg = folio_objcg(old); /* - * Note that it is normal to see !memcg for a hugetlb folio. + * Note that it is normal to see !objcg for a hugetlb folio. * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. 
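uncharge_folio() and uncharge_batch() above form a gather-and-flush-on-key-change loop; a minimal runnable sketch of that batching shape, with plain integers standing in for the objcg key and the page counts:

#include <stdio.h>

struct gather {
	int key;		/* 0 means "no batch open" */
	unsigned long count;
};

static void flush(const struct gather *g)
{
	printf("flush key=%d count=%lu\n", g->key, g->count);
}

/* Batch consecutive items that share a key, flushing whenever it changes. */
void process(const int *keys, const unsigned long *counts, int n)
{
	struct gather g = { 0 };
	int i;

	for (i = 0; i < n; i++) {
		if (g.key != keys[i]) {
			if (g.key)
				flush(&g);
			g.key = keys[i];
			g.count = 0;
		}
		g.count += counts[i];
	}
	if (g.key)
		flush(&g);
}

int main(void)
{
	int keys[] = { 1, 1, 2, 2, 2, 1 };
	unsigned long counts[] = { 4, 1, 8, 8, 8, 2 };

	process(keys, counts, 6);	/* flushes (1,5), (2,24), (1,2) */
	return 0;
}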
*/ - VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); - if (!memcg) + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old); + if (!objcg) return; - /* Transfer the charge and the css ref */ - commit_charge(new, memcg); + /* Transfer the charge and the objcg ref */ + commit_charge(new, objcg); /* Warning should never happen, so don't worry about refcount non-0 */ WARN_ON_ONCE(folio_unqueue_deferred_split(old)); @@ -5270,22 +5476,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; if (do_memsw_account()) return 0; - memcg = folio_memcg(folio); - - VM_WARN_ON_ONCE_FOLIO(!memcg, folio); - if (!memcg) + objcg = folio_objcg(folio); + VM_WARN_ON_ONCE_FOLIO(!objcg, folio); + if (!objcg) return 0; + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + rcu_read_unlock(); return 0; } memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); + /* memcg is pined by memcg ID. */ + rcu_read_unlock(); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { @@ -5343,27 +5554,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; + bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; - if (do_memsw_account()) - return false; + if (do_memsw_account() || !folio_memcg_charged(folio)) + return ret; + rcu_read_lock(); memcg = folio_memcg(folio); - if (!memcg) - return false; - for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || - usage * 2 >= READ_ONCE(memcg->swap.max)) - return true; + usage * 2 >= READ_ONCE(memcg->swap.max)) { + ret = true; + break; + } } + rcu_read_unlock(); - return false; + return ret; } static int __init setup_swap_account(char *s) @@ -5559,6 +5772,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); /* PF_MEMALLOC context, charging must succeed */ @@ -5588,6 +5804,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + if (obj_cgroup_is_root(objcg)) + return; + obj_cgroup_uncharge(objcg, size); rcu_read_lock(); diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index bc7f4f045edf..59de210bee5f 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -50,6 +50,11 @@ * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property * is maintained. * + * Seals + * File seals set on the memfd are preserved and re-applied on restore. + * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may + * be present; preservation fails with ``-EOPNOTSUPP`` otherwise. + * * Non-Preserved Properties * ======================== * @@ -61,10 +66,6 @@ * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set * again after restore via ``fcntl()``. - * - * Seals - * File seals are not preserved. The file is unsealed on restore and if - * needed, must be sealed again via ``fcntl()``. 
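The seal handling described above boils down to a mask check; a hedged sketch of such a guard, where the SEAL_* values and the LUO_KNOWN_SEALS mask are illustrative stand-ins for the F_SEAL_* bits and MEMFD_LUO_ALL_SEALS rather than the actual implementation:

#include <errno.h>

/* Stand-ins for the F_SEAL_* bits from <linux/fcntl.h>. */
#define SEAL_SEAL	0x0001
#define SEAL_SHRINK	0x0002
#define SEAL_GROW	0x0004
#define SEAL_WRITE	0x0008

/* Illustrative "seals this version knows how to preserve" mask. */
#define LUO_KNOWN_SEALS	(SEAL_SEAL | SEAL_SHRINK | SEAL_GROW | SEAL_WRITE)

/* Refuse to preserve a file carrying seals a later kernel might not restore. */
int check_seals(unsigned int seals)
{
	if (seals & ~LUO_KNOWN_SEALS)
		return -EOPNOTSUPP;
	return 0;
}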
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -105,7 +106,6 @@ static int memfd_luo_preserve_folios(struct file *file, if (!size) { *nr_foliosp = 0; *out_folios_ser = NULL; - memset(kho_vmalloc, 0, sizeof(*kho_vmalloc)); return 0; } @@ -260,7 +260,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) struct inode *inode = file_inode(args->file); struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; - u64 nr_folios; + u64 nr_folios, inode_size; int err = 0, seals; inode_lock(inode); @@ -286,7 +286,18 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) } ser->pos = args->file->f_pos; - ser->size = i_size_read(inode); + inode_size = i_size_read(inode); + + /* + * memfd_pin_folios() caps at UINT_MAX folios; refuse larger + * files to avoid silently preserving only a prefix. + */ + if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) { + err = -EFBIG; + goto err_free_ser; + } + + ser->size = inode_size; ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, @@ -410,6 +421,7 @@ static int memfd_luo_retrieve_folios(struct file *file, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; struct folio *folio; + long npages, nr_added_pages = 0; int err = -EIO; long i; @@ -427,6 +439,7 @@ static int memfd_luo_retrieve_folios(struct file *file, if (!folio) { pr_err("Unable to restore folio at physical address: %llx\n", phys); + err = -EIO; goto put_folios; } index = pfolio->index; @@ -456,21 +469,26 @@ static int memfd_luo_retrieve_folios(struct file *file, if (flags & MEMFD_LUO_FOLIO_DIRTY) folio_mark_dirty(folio); - err = shmem_inode_acct_blocks(inode, 1); + npages = folio_nr_pages(folio); + err = shmem_inode_acct_blocks(inode, npages); if (err) { - pr_err("shmem: failed to account folio index %ld: %d\n", - i, err); - goto unlock_folio; + pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n", + i, npages, err); + goto remove_from_cache; } - shmem_recalc_inode(inode, 1, 0); + nr_added_pages += npages; folio_add_lru(folio); folio_unlock(folio); folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return 0; +remove_from_cache: + filemap_remove_folio(folio); unlock_folio: folio_unlock(folio); folio_put(folio); @@ -481,12 +499,19 @@ put_folios: */ for (long j = i + 1; j < nr_folios; j++) { const struct memfd_luo_folio_ser *pfolio = &folios_ser[j]; + phys_addr_t phys; + + if (!pfolio->pfn) + continue; - folio = kho_restore_folio(pfolio->pfn); + phys = PFN_PHYS(pfolio->pfn); + folio = kho_restore_folio(phys); if (folio) folio_put(folio); } + shmem_recalc_inode(inode, nr_added_pages, 0); + return err; } @@ -525,7 +550,7 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) } vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); - file->f_inode->i_size = ser->size; + i_size_write(file_inode(file), ser->size); if (ser->nr_folios) { folios_ser = kho_restore_vmalloc(&ser->folios); @@ -560,6 +585,11 @@ static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler, return shmem_file(file) && !inode->i_nlink; } +static unsigned long memfd_luo_get_id(struct file *file) +{ + return (unsigned long)file_inode(file); +} + static const struct liveupdate_file_ops memfd_luo_file_ops = { .freeze = memfd_luo_freeze, .finish = memfd_luo_finish, @@ -567,6 +597,7 @@ static const struct liveupdate_file_ops memfd_luo_file_ops = { .preserve = memfd_luo_preserve, .unpreserve = memfd_luo_unpreserve, .can_preserve = memfd_luo_can_preserve, + .get_id = 
memfd_luo_get_id, .owner = THIS_MODULE, }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2e136b738889..4e4421b22b59 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3706,18 +3706,19 @@ static ssize_t weighted_interleave_auto_store(struct kobject *kobj, new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); - if (!input) { - old_wi_state = rcu_dereference_protected(wi_state, - lockdep_is_held(&wi_state_lock)); - if (!old_wi_state) - goto update_wi_state; - if (input == old_wi_state->mode_auto) { - mutex_unlock(&wi_state_lock); - return count; - } + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); - memcpy(new_wi_state->iw_table, old_wi_state->iw_table, - nr_node_ids * sizeof(u8)); + if (old_wi_state && input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return count; + } + + if (!input) { + if (old_wi_state) + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); goto update_wi_state; } @@ -3787,9 +3788,11 @@ static void wi_state_free(void) } } -static struct kobj_attribute wi_auto_attr = - __ATTR(auto, 0664, weighted_interleave_auto_show, - weighted_interleave_auto_store); +static struct kobj_attribute wi_auto_attr = { + .attr = { .name = "auto", .mode = 0664 }, + .show = weighted_interleave_auto_show, + .store = weighted_interleave_auto_store, +}; static void wi_cleanup(void) { sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); diff --git a/mm/memremap.c b/mm/memremap.c index ac7be07e3361..053842d45cb1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -454,7 +454,7 @@ void free_zone_device_folio(struct folio *folio) if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free)) break; pgmap->ops->folio_free(folio); - percpu_ref_put_many(&folio->pgmap->ref, nr); + percpu_ref_put_many(&pgmap->ref, nr); break; case MEMORY_DEVICE_GENERIC: diff --git a/mm/migrate.c b/mm/migrate.c index 76142a02192b..8a64291ab5b4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -672,6 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; + rcu_read_lock(); memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); @@ -699,6 +700,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } + rcu_read_unlock(); } local_irq_enable(); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 2912eba575d5..fbfe5715f635 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -175,12 +175,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, return migrate_vma_collect_skip(start, end, walk); } - if (softleaf_is_migration(entry)) { - softleaf_entry_wait_on_locked(entry, ptl); - spin_unlock(ptl); - return -EAGAIN; - } - if (softleaf_is_device_private_write(entry)) write = MIGRATE_PFN_WRITE; } else { diff --git a/mm/mlock.c b/mm/mlock.c index fdbd1434a35f..8c227fefa2df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) } if (lruvec) - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folios_put(fbatch); } diff --git a/mm/mm_init.c b/mm/mm_init.c index 79f93f2a90cf..f9f8e1af921c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -783,31 +783,6 @@ void __meminit init_deferred_page(unsigned long pfn, 
int nid) __init_deferred_page(pfn, nid); } -/* - * Initialised pages do not have PageReserved set. This function is - * called for each range allocated by the bootmem allocator and - * marks the pages PageReserved. The remaining valid pages are later - * sent to the buddy page allocator. - */ -void __meminit reserve_bootmem_region(phys_addr_t start, - phys_addr_t end, int nid) -{ - unsigned long pfn; - - for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { - struct page *page = pfn_to_page(pfn); - - __init_deferred_page(pfn, nid); - - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } -} - /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) diff --git a/mm/mprotect.c b/mm/mprotect.c index 110d47a36d4b..9cbf932b028c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep, } /* Set nr_ptes number of ptes, starting from idx */ -static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, - int idx, bool set_write, struct mmu_gather *tlb) +static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, + int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb) { /* * Advance the position in the batch by idx; note that if idx > 0, @@ -143,7 +143,7 @@ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long add * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce * that the ptes point to consecutive pages of the same anon large folio. */ -static int page_anon_exclusive_sub_batch(int start_idx, int max_len, +static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len, struct page *first_page, bool expected_anon_exclusive) { int idx; @@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len, * pte of the batch. Therefore, we must individually check all pages and * retrieve sub-batches. 
*/ -static void commit_anon_folio_batch(struct vm_area_struct *vma, +static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma, struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -188,7 +188,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma, } } -static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, +static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb) { @@ -211,6 +211,111 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma, commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb); } +static long change_softleaf_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags) +{ + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + softleaf_t entry = softleaf_from_pte(oldpte); + pte_t newpte; + + if (softleaf_is_migration_write(entry)) { + const struct folio *folio = softleaf_to_folio(entry); + + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (folio_test_anon(folio)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + } else if (softleaf_is_device_private_write(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_nonpresent_pte() for explanation. + */ + entry = make_readable_device_private_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (softleaf_is_marker(entry)) { + /* + * Ignore error swap entries unconditionally, + * because any access should sigbus/sigsegv + * anyway. + */ + if (softleaf_is_poison_marker(entry) || + softleaf_is_guard_marker(entry)) + return 0; + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. + */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + return 1; + } + return 0; + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + return 1; + } + return 0; +} + +static __always_inline void change_present_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + int nr_ptes, unsigned long end, pgprot_t newprot, + struct folio *folio, struct page *page, unsigned long cp_flags) +{ + const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + const bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + pte_t ptent, oldpte; + + oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes); + ptent = pte_modify(oldpte, newprot); + + if (uffd_wp) + ptent = pte_mkuffd_wp(ptent); + else if (uffd_wp_resolve) + ptent = pte_clear_uffd_wp(ptent); + + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. 
+ * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. + */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent)) + set_write_prot_commit_flush_ptes(vma, folio, page, + addr, ptep, oldpte, ptent, nr_ptes, tlb); + else + prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent, + nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); +} + static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -221,7 +326,6 @@ static long change_pte_range(struct mmu_gather *tlb, bool is_private_single_threaded; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; - bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; int nr_ptes; tlb_change_page_size(tlb, PAGE_SIZE); @@ -242,7 +346,6 @@ static long change_pte_range(struct mmu_gather *tlb, int max_nr_ptes = (end - addr) >> PAGE_SHIFT; struct folio *folio = NULL; struct page *page; - pte_t ptent; /* Already in the desired state. */ if (prot_numa && pte_protnone(oldpte)) @@ -268,34 +371,20 @@ static long change_pte_range(struct mmu_gather *tlb, nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags); - oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes); - ptent = pte_modify(oldpte, newprot); - - if (uffd_wp) - ptent = pte_mkuffd_wp(ptent); - else if (uffd_wp_resolve) - ptent = pte_clear_uffd_wp(ptent); - /* - * In some writable, shared mappings, we might want - * to catch actual write access -- see - * vma_wants_writenotify(). - * - * In all writable, private mappings, we have to - * properly handle COW. - * - * In both cases, we can sometimes still change PTEs - * writable and avoid the write-fault handler, for - * example, if a PTE is already dirty and no other - * COW or special handling is required. + * Optimize for the small-folio common case by + * special-casing it here. Compiler constant propagation + * plus copious amounts of __always_inline does wonders. */ - if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && - !pte_write(ptent)) - set_write_prot_commit_flush_ptes(vma, folio, page, - addr, pte, oldpte, ptent, nr_ptes, tlb); - else - prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent, - nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb); + if (likely(nr_ptes == 1)) { + change_present_ptes(tlb, vma, addr, pte, 1, + end, newprot, folio, page, cp_flags); + } else { + change_present_ptes(tlb, vma, addr, pte, + nr_ptes, end, newprot, folio, page, + cp_flags); + } + pages += nr_ptes; } else if (pte_none(oldpte)) { /* @@ -317,66 +406,7 @@ static long change_pte_range(struct mmu_gather *tlb, pages++; } } else { - softleaf_t entry = softleaf_from_pte(oldpte); - pte_t newpte; - - if (softleaf_is_migration_write(entry)) { - const struct folio *folio = softleaf_to_folio(entry); - - /* - * A protection check is difficult so - * just be safe and disable write - */ - if (folio_test_anon(folio)) - entry = make_readable_exclusive_migration_entry( - swp_offset(entry)); - else - entry = make_readable_migration_entry(swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - } else if (softleaf_is_device_private_write(entry)) { - /* - * We do not preserve soft-dirtiness. See - * copy_nonpresent_pte() for explanation. 
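The nr_ptes == 1 special case above leans on inlining plus constant propagation; a small stand-alone illustration of why that works, where process_batch() is an invented helper rather than the mprotect code:

/*
 * Forcing inlining lets the literal "1" below reach the loop bound, so the
 * compiler can drop the loop and the nr > 1 bookkeeping entirely for the
 * common single-entry call site.
 */
static __attribute__((always_inline)) inline long process_batch(const long *p, int nr)
{
	long sum = 0;
	int i;

	for (i = 0; i < nr; i++)
		sum += p[i];
	return sum;
}

long process_one(const long *p)
{
	/* Specialized copy: after constant propagation this is just "return *p;". */
	return process_batch(p, 1);
}

long process_many(const long *p, int nr)
{
	/* Generic copy keeps the loop. */
	return process_batch(p, nr);
}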
- */ - entry = make_readable_device_private_entry( - swp_offset(entry)); - newpte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (softleaf_is_marker(entry)) { - /* - * Ignore error swap entries unconditionally, - * because any access should sigbus/sigsegv - * anyway. - */ - if (softleaf_is_poison_marker(entry) || - softleaf_is_guard_marker(entry)) - continue; - /* - * If this is uffd-wp pte marker and we'd like - * to unprotect it, drop it; the next page - * fault will trigger without uffd trapping. - */ - if (uffd_wp_resolve) { - pte_clear(vma->vm_mm, addr, pte); - pages++; - } - continue; - } else { - newpte = oldpte; - } - - if (uffd_wp) - newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - - if (!pte_same(oldpte, newpte)) { - set_pte_at(vma->vm_mm, addr, pte, newpte); - pages++; - } + pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags); } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); lazy_mmu_mode_disable(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 88cd53d4ba09..833f743f309f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1835,7 +1835,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) + if (!writeback_in_progress(wb) && + (nr_dirty > gdtc->bg_thresh || + (strictlimit && gdtc->wb_dirty > gdtc->wb_bg_thresh))) wb_start_background_writeback(wb); /* @@ -1862,15 +1864,9 @@ free_running: * Unconditionally start background writeback if it's not * already in progress. We need to do this because the global * dirty threshold check above (nr_dirty > gdtc->bg_thresh) - * doesn't account for these cases: - * - * a) strictlimit BDIs: throttling is calculated using per-wb - * thresholds. The per-wb threshold can be exceeded even when - * nr_dirty < gdtc->bg_thresh - * - * b) memcg-based throttling: memcg uses its own dirty count and - * thresholds and can trigger throttling even when global - * nr_dirty < gdtc->bg_thresh + * doesn't account for the memcg-based throttling case. 
memcg + * uses its own dirty count and thresholds and can trigger + * throttling even when global nr_dirty < gdtc->bg_thresh * * Writeback needs to be started else the writer stalls in the * throttle loop waiting for dirty pages to be written back diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 111b54df8a3c..227d58dc3de6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -297,11 +297,6 @@ int page_group_by_mobility_disabled __read_mostly; */ DEFINE_STATIC_KEY_TRUE(deferred_pages); -static inline bool deferred_pages_enabled(void) -{ - return static_branch_unlikely(&deferred_pages); -} - /* * deferred_grow_zone() is __init, but it is called from * get_page_from_freelist() during early boot until deferred_pages permanently @@ -314,11 +309,6 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) return deferred_grow_zone(zone, order); } #else -static inline bool deferred_pages_enabled(void) -{ - return false; -} - static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) { return false; @@ -1252,10 +1242,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task, union pgtag_ref_handle handle; union codetag_ref ref; - if (get_page_tag_ref(page, &ref, &handle)) { + if (likely(get_page_tag_ref(page, &ref, &handle))) { alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); update_page_tag_ref(handle, &ref); put_page_tag_ref(handle); + } else { + /* + * page_ext is not available yet, record the pfn so we can + * clear the tag ref later when page_ext is initialized. + */ + alloc_tag_add_early_pfn(page_to_pfn(page)); + if (task->alloc_tag) + alloc_tag_set_inaccurate(task->alloc_tag); } } @@ -6211,42 +6209,6 @@ void adjust_managed_page_count(struct page *page, long count) } EXPORT_SYMBOL(adjust_managed_page_count); -unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) -{ - void *pos; - unsigned long pages = 0; - - start = (void *)PAGE_ALIGN((unsigned long)start); - end = (void *)((unsigned long)end & PAGE_MASK); - for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { - struct page *page = virt_to_page(pos); - void *direct_map_addr; - - /* - * 'direct_map_addr' might be different from 'pos' - * because some architectures' virt_to_page() - * work with aliases. Getting the direct map - * address ensures that we get a _writeable_ - * alias for the memset(). - */ - direct_map_addr = page_address(page); - /* - * Perform a kasan-unchecked memset() since this memory - * has not been initialized. 
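The early-boot fallback in __pgalloc_tag_add() above records pfns so the stale references can be fixed up once page_ext is initialized; a simplified sketch of that defer-and-replay idea, with an invented fixed-size queue and helper names:

#include <stdbool.h>
#include <stddef.h>

#define MAX_EARLY 128

static unsigned long early_pfns[MAX_EARLY];
static size_t nr_early;
static bool store_ready;

/* Tag a pfn now if the metadata store exists, else queue it for later fixup. */
void tag_pfn(unsigned long pfn)
{
	if (store_ready) {
		/* ... update the real per-page metadata here ... */
		return;
	}
	if (nr_early < MAX_EARLY)
		early_pfns[nr_early++] = pfn;
}

/* Called once the store is initialized: replay everything recorded early. */
void store_init_done(void)
{
	size_t i;

	store_ready = true;
	for (i = 0; i < nr_early; i++)
		tag_pfn(early_pfns[i]);
	nr_early = 0;
}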
- */ - direct_map_addr = kasan_reset_tag(direct_map_addr); - if ((unsigned int)poison <= 0xFF) - memset(direct_map_addr, poison, PAGE_SIZE); - - free_reserved_page(page); - } - - if (pages && s) - pr_info("Freeing %s memory: %ldK\n", s, K(pages)); - - return pages; -} - void free_reserved_page(struct page *page) { clear_page_tag_ref(page); @@ -7775,6 +7737,11 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + + /* On UP, spin_trylock() always succeeds even when it is locked */ + if (!IS_ENABLED(CONFIG_SMP) && in_nmi()) + return NULL; + if (!pcp_allowed_order(order)) return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index 330abc5ab7b4..70cea9e24d2f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -276,10 +276,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); goto out_unlock; } + + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { + rcu_read_unlock(); folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; } + rcu_read_unlock(); __swap_writepage(folio, swap_plug); return 0; @@ -307,11 +311,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) struct cgroup_subsys_state *css; struct mem_cgroup *memcg; - memcg = folio_memcg(folio); - if (!memcg) + if (!folio_memcg_charged(folio)) return; rcu_read_lock(); + memcg = folio_memcg(folio); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); @@ -493,7 +497,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) folio_mark_uptodate(folio); folio_unlock(folio); } - count_vm_events(PSWPIN, sio->pages); + count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT); } else { for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2708c2b3ac1f..53a8997ec043 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -151,9 +151,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pte_user_accessible_page(pte, addr)) { + if (pte_user_accessible_page(mm, addr, pte)) page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); - } } EXPORT_SYMBOL(__page_table_check_pte_clear); @@ -163,9 +162,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pmd_user_accessible_page(pmd, addr)) { + if (pmd_user_accessible_page(mm, addr, pmd)) page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); - } } EXPORT_SYMBOL(__page_table_check_pmd_clear); @@ -175,9 +173,8 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - if (pud_user_accessible_page(pud, addr)) { + if (pud_user_accessible_page(mm, addr, pud)) page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); - } } EXPORT_SYMBOL(__page_table_check_pud_clear); @@ -211,7 +208,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); - if (pte_user_accessible_page(pte, addr)) + if (pte_user_accessible_page(mm, addr, pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); @@ -241,7 +238,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, 
unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); - if (pmd_user_accessible_page(pmd, addr)) + if (pmd_user_accessible_page(mm, addr, pmd)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } EXPORT_SYMBOL(__page_table_check_pmds_set); @@ -257,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, for (i = 0; i < nr; i++) __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); - if (pud_user_accessible_page(pud, addr)) + if (pud_user_accessible_page(mm, addr, pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } EXPORT_SYMBOL(__page_table_check_puds_set); diff --git a/mm/percpu.c b/mm/percpu.c index a2107bdebf0b..b0676b8054ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, return true; objcg = current_obj_cgroup(); - if (!objcg) + if (!objcg || obj_cgroup_is_root(objcg)) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) diff --git a/mm/shmem.c b/mm/shmem.c index 19bf77925fa1..3b5dc21b323c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3177,119 +3177,99 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) -{ - struct inode *inode = file_inode(dst_vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); +static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - void *page_kaddr; struct folio *folio; - int ret; - pgoff_t max_off; - if (shmem_inode_acct_blocks(inode, 1)) { - /* - * We may have got a page, returned -ENOENT triggering a retry, - * and now we find ourselves with -ENOMEM. Release the page, to - * avoid a BUG_ON in our caller. - */ - if (unlikely(*foliop)) { - folio_put(*foliop); - *foliop = NULL; - } - return -ENOMEM; - } - - if (!*foliop) { - ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, 0, info, pgoff); - if (!folio) - goto out_unacct_blocks; + if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) + return NULL; - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { - page_kaddr = kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret = copy_from_user(page_kaddr, - (const void __user *)src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(page_kaddr); - - /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - *foliop = folio; - ret = -ENOENT; - /* don't free the page */ - goto out_unacct_blocks; - } + folio = shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + return NULL; - flush_dcache_folio(folio); - } else { /* ZEROPAGE */ - clear_user_highpage(&folio->page, dst_addr); - } - } else { - folio = *foliop; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *foliop = NULL; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; } - VM_BUG_ON(folio_test_locked(folio)); - VM_BUG_ON(folio_test_swapbacked(folio)); + return folio; +} + +static int shmem_mfill_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + struct address_space *mapping = inode->i_mapping; + pgoff_t pgoff = linear_page_index(vma, addr); + gfp_t gfp = mapping_gfp_mask(mapping); + int err; + __folio_set_locked(folio); __folio_set_swapbacked(folio); - __folio_mark_uptodate(folio); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(pgoff >= max_off)) - goto out_release; - ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); - if (ret) - goto out_release; + err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (err) + goto err_unlock; - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); - if (ret) - goto out_delete_from_cache; + if (shmem_inode_acct_blocks(inode, 1)) { + err = -ENOMEM; + goto err_delete_from_cache; + } + folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); - folio_unlock(folio); + return 0; -out_delete_from_cache: + +err_delete_from_cache: filemap_remove_folio(folio); -out_release: +err_unlock: folio_unlock(folio); - folio_put(folio); -out_unacct_blocks: - shmem_inode_unacct_blocks(inode, 1); - return ret; + return err; } + +static void shmem_mfill_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); + folio_unlock(folio); +} + +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) +{ + struct folio *folio; + int err; + + err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (err) + return ERR_PTR(err); + + return folio; +} + +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops = { + .can_userfault = shmem_can_userfault, + .get_folio_noalloc = shmem_get_folio_noalloc, + .alloc_folio = shmem_mfill_folio_alloc, + .filemap_add = shmem_mfill_filemap_add, + .filemap_remove = shmem_mfill_filemap_remove, +}; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS @@ -5325,6 +5305,9 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { @@ -5334,6 +5317,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif +#ifdef 
CONFIG_USERFAULTFD + .uffd_ops = &shmem_uffd_ops, +#endif }; int shmem_init_fs_context(struct fs_context *fc) diff --git a/mm/shrinker.c b/mm/shrinker.c index c23086bccf4d..76b3f750cf65 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -288,14 +288,10 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; - struct mem_cgroup *parent; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { diff --git a/mm/slub.c b/mm/slub.c index 92362eeb13e5..0baa906f39ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5339,6 +5339,10 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + /* On UP, spin_trylock() always succeeds even when it is locked */ + if (!IS_ENABLED(CONFIG_SMP) && in_nmi()) + return NULL; + retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; @@ -6645,16 +6649,6 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, if (!kasan_check_byte(p)) return NULL; - /* - * If reallocation is not necessary (e. g. the new size is less - * than the current allocated size), the current allocation will be - * preserved unless __GFP_THISNODE is set. In the latter case a new - * allocation on the requested node will be attempted. - */ - if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && - nid != page_to_nid(virt_to_page(p))) - goto alloc_new; - if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -6673,6 +6667,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, } } + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + /* If the old object doesn't fit, allocate a bigger one */ if (new_size > ks) goto alloc_new; @@ -6707,7 +6711,7 @@ alloc_new: if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); - memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + memcpy(ret, kasan_reset_tag(p), min(new_size, (size_t)(orig_size ?: ks))); kasan_enable_current(); } @@ -6941,7 +6945,7 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig if (p) { /* We already know that `p` is not a vmalloc address. 
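The min() clamp added to the __do_krealloc() copy above (and the matching one in kvrealloc_node_align_noprof()) matters whenever the fresh allocation can be smaller than the old object; a userspace sketch of the same guard, where realloc_copy() is an invented helper rather than a kernel API:

#include <stdlib.h>
#include <string.h>

static inline size_t min_size(size_t a, size_t b)
{
	return a < b ? a : b;
}

/*
 * Always allocate a new buffer and copy, like krealloc() does when
 * __GFP_THISNODE forces a move: copy only what fits in the *new* buffer,
 * otherwise a shrinking reallocation would write past its end.
 */
void *realloc_copy(const void *old, size_t old_size, size_t new_size)
{
	void *n = malloc(new_size);

	if (!n)
		return NULL;
	if (old)
		memcpy(n, old, min_size(new_size, old_size));
	return n;
}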
*/ kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); + memcpy(n, kasan_reset_tag(p), min(size, ksize(p))); kasan_enable_current(); kfree(p); diff --git a/mm/sparse.c b/mm/sparse.c index 007fd52c621e..effdac6b0ab1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -403,7 +403,6 @@ failed: ms = __nr_to_section(pnum); if (!preinited_vmemmap_section(ms)) ms->section_mem_map = 0; - ms->section_mem_map = 0; } } diff --git a/mm/swap.c b/mm/swap.c index 78b4aa811fc6..5cc44f0de987 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio) __page_cache_release(folio, &lruvec, &flags); if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) @@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); folios_put(fbatch); } @@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio) void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) __releases(lruvec->lru_lock) + __releases(rcu) { unsigned long cost; @@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; if (!cost) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); return; } @@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, spin_unlock_irq(&lruvec->lru_lock); lruvec = parent_lruvec(lruvec); - if (!lruvec) + if (!lruvec) { + rcu_read_unlock(); break; + } spin_lock_irq(&lruvec->lru_lock); } } @@ -349,7 +353,7 @@ void folio_activate(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); folio_set_lru(folio); } #endif @@ -412,18 +416,20 @@ static void lru_gen_inc_refs(struct folio *folio) static bool lru_gen_clear_refs(struct folio *folio) { - struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); + unsigned long seq; if (gen < 0) return true; set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); - lrugen = &folio_lruvec(folio)->lrugen; + rcu_read_lock(); + seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]); + rcu_read_unlock(); /* whether can do without shuffling under the LRU lock */ - return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); + return gen == lru_gen_from_seq(seq); } #else /* !CONFIG_LRU_GEN */ @@ -963,7 +969,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (folio_is_zone_device(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } if (folio_ref_sub_and_test(folio, nr_refs)) @@ -977,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); @@ -991,7 +997,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) j++; } if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec_unlock_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; @@ -1084,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch 
*fbatch) fbatch->nr = j; } +#ifdef CONFIG_MEMCG +static void lruvec_reparent_lru(struct lruvec *child_lruvec, + struct lruvec *parent_lruvec, + enum lru_list lru, int nid) +{ + int zid; + struct zone *zone; + + if (lru != LRU_UNEVICTABLE) + list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } +} + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + enum lru_list lru; + struct lruvec *child_lruvec, *parent_lruvec; + + child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid)); + parent_lruvec->anon_cost += child_lruvec->anon_cost; + parent_lruvec->file_cost += child_lruvec->file_cost; + + for_each_lru(lru) + lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid); +} +#endif + static const struct ctl_table swap_sysctl_table[] = { { .procname = "page-cluster", diff --git a/mm/truncate.c b/mm/truncate.c index 2931d66c16d0..12cc89f89afc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -622,6 +622,7 @@ static int folio_launder(struct address_space *mapping, struct folio *folio) int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp) { + void (*free_folio)(struct folio *); int ret; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -648,9 +649,12 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); + free_folio = mapping->a_ops->free_folio; spin_unlock(&mapping->host->i_lock); - filemap_free_folio(mapping, folio); + if (free_folio) + free_folio(folio); + folio_put_refs(folio, folio_nr_pages(folio)); return 1; failed: xa_unlock_irq(&mapping->i_pages); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 89879c3ba344..180bad42fc79 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,12 +14,61 @@ #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> -#include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" +struct mfill_state { + struct userfaultfd_ctx *ctx; + unsigned long src_start; + unsigned long dst_start; + unsigned long len; + uffd_flags_t flags; + + struct vm_area_struct *vma; + unsigned long src_addr; + unsigned long dst_addr; + pmd_t *pmd; +}; + +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static struct folio *anon_alloc_folio(struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr); + + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +static const struct vm_uffd_ops anon_uffd_ops = { + .can_userfault = anon_can_userfault, + .alloc_folio = anon_alloc_folio, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? 
vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { @@ -143,6 +192,128 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma) } #endif +static void mfill_put_vma(struct mfill_state *state) +{ + if (!state->vma) + return; + + up_read(&state->ctx->map_changing_lock); + uffd_mfill_unlock(state->vma); + state->vma = NULL; +} + +static int mfill_get_vma(struct mfill_state *state) +{ + struct userfaultfd_ctx *ctx = state->ctx; + uffd_flags_t flags = state->flags; + struct vm_area_struct *dst_vma; + const struct vm_uffd_ops *ops; + int err; + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len); + if (IS_ERR(dst_vma)) + return PTR_ERR(dst_vma); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + state->vma = dst_vma; + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err = -EINVAL; + + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + if (is_vm_hugetlb_page(dst_vma)) + return 0; + + ops = vma_uffd_ops(dst_vma); + if (!ops) + goto out_unlock; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && + !ops->get_folio_noalloc) + goto out_unlock; + + return 0; + +out_unlock: + mfill_put_vma(state); + return err; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); +} + +static int mfill_establish_pmd(struct mfill_state *state) +{ + struct mm_struct *dst_mm = state->ctx->mm; + pmd_t *dst_pmd, dst_pmdval; + + dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr); + if (unlikely(!dst_pmd)) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) + return -ENOMEM; + + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval))) + return -EEXIST; + if (unlikely(pmd_bad(dst_pmdval))) + return -EFAULT; + + state->pmd = dst_pmd; + return 0; +} + /* Check if dst_addr is outside of file's size. Must be called with ptl held. 
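mfill_state and vma_uffd_ops() above hang the per-VMA-type behaviour off a small ops table: anonymous VMAs use the built-in anon_uffd_ops, file-backed VMAs advertise their own table via vm_ops->uffd_ops (shmem registers shmem_uffd_ops earlier in this diff), and a missing table means the operation is unsupported. A generic sketch of that dispatch shape, with hypothetical names that are not part of the patch:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

struct region;

/* Hypothetical ops table; the kernel version is struct vm_uffd_ops. */
struct fill_ops {
	bool (*can_fill)(struct region *r);
	int  (*fill)(struct region *r, unsigned long addr);
};

struct region {
	bool anonymous;			/* no backing file */
	const struct fill_ops *ops;	/* supplied by the backing store, may be NULL */
};

static bool anon_can_fill(struct region *r) { return true; }
static int  anon_fill(struct region *r, unsigned long addr) { return 0; }

static const struct fill_ops anon_ops = {
	.can_fill = anon_can_fill,
	.fill     = anon_fill,
};

static const struct fill_ops *region_fill_ops(struct region *r)
{
	if (r->anonymous)
		return &anon_ops;
	return r->ops;			/* NULL: the backing store opted out */
}

static int region_fill(struct region *r, unsigned long addr)
{
	const struct fill_ops *ops = region_fill_ops(r);

	if (!ops || !ops->can_fill(r))
		return -EOPNOTSUPP;
	return ops->fill(r, addr);
}
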
*/ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -165,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ -int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags) +static int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -212,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, goto out_unlock; if (page_in_cache) { - /* Usually, cache pages are already added to LRU */ - if (newly_allocated) - folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); @@ -229,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + if (page_in_cache) + folio_unlock(folio); + /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -238,58 +409,110 @@ out: return ret; } -static int mfill_atomic_pte_copy(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) { void *kaddr; int ret; + + kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret = copy_from_user(kaddr, (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(kaddr); + + if (ret) + return -EFAULT; + + flush_dcache_folio(folio); + return ret; +} + +static int mfill_copy_folio_retry(struct mfill_state *state, + struct folio *folio) +{ + const struct vm_uffd_ops *orig_ops = vma_uffd_ops(state->vma); + unsigned long src_addr = state->src_addr; + void *kaddr; + int err; + + /* retry copying with mm_lock dropped */ + mfill_put_vma(state); + + kaddr = kmap_local_folio(folio, 0); + err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); + kunmap_local(kaddr); + if (unlikely(err)) + return -EFAULT; + + flush_dcache_folio(folio); + + /* reget VMA and PMD, they could change underneath us */ + err = mfill_get_vma(state); + if (err) + return err; + + /* + * The VMA type may have changed while the lock was dropped + * (e.g. replaced with a hugetlb mapping), making the caller's + * ops pointer stale. 
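mfill_copy_folio_retry() drops the VMA and map_changing locks so copy_from_user() may fault, then reacquires them and checks that the VMA still resolves to the same ops table, bailing with -EAGAIN otherwise. A small pthread sketch of that drop-revalidate-retry shape, with a generation counter standing in for the ops-pointer comparison:

#include <errno.h>
#include <pthread.h>

struct mapping {
	pthread_mutex_t lock;
	unsigned long generation;	/* bumped whenever the mapping changes */
};

/* Runs with m->lock held; may drop it for blocking work, then revalidates. */
static int fill_with_retry(struct mapping *m, int (*blocking_copy)(void))
{
	unsigned long seen = m->generation;
	int err;

	pthread_mutex_unlock(&m->lock);
	err = blocking_copy();		/* may sleep or fault */
	pthread_mutex_lock(&m->lock);

	if (err)
		return err;
	if (m->generation != seen)
		return -EAGAIN;		/* mapping changed; caller must restart */
	return 0;
}
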
+ */ + if (vma_uffd_ops(state->vma) != orig_ops) + return -EAGAIN; + + err = mfill_establish_pmd(state); + if (err) + return err; + + return 0; +} + +static int __mfill_atomic_pte(struct mfill_state *state, + const struct vm_uffd_ops *ops) +{ + unsigned long dst_addr = state->dst_addr; + unsigned long src_addr = state->src_addr; + uffd_flags_t flags = state->flags; struct folio *folio; + int ret; - if (!*foliop) { - ret = -ENOMEM; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr); - if (!folio) - goto out; + folio = ops->alloc_folio(state->vma, state->dst_addr); + if (!folio) + return -ENOMEM; - kaddr = kmap_local_folio(folio, 0); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + ret = mfill_copy_folio_locked(folio, src_addr); /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. + * Fallback to copy_from_user outside mmap_lock. + * If retry is successful, mfill_copy_folio_locked() returns + * with locks retaken by mfill_get_vma(). + * If there was an error, we must mfill_put_vma() anyway and it + * will take care of unlocking if needed. */ - pagefault_disable(); - ret = copy_from_user(kaddr, (const void __user *) src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(kaddr); - - /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - ret = -ENOENT; - *foliop = folio; - /* don't free the page */ - goto out; + ret = mfill_copy_folio_retry(state, folio); + if (ret) + goto err_folio_put; } - - flush_dcache_folio(folio); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + clear_user_highpage(&folio->page, state->dst_addr); } else { - folio = *foliop; - *foliop = NULL; + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); } /* @@ -299,63 +522,65 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, */ __folio_mark_uptodate(folio); - ret = -ENOMEM; - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_release; + if (ops->filemap_add) { + ret = ops->filemap_add(folio, state->vma, state->dst_addr); + if (ret) + goto err_folio_put; + } - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); + ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, + &folio->page, flags); if (ret) - goto out_release; -out: - return ret; -out_release: + goto err_filemap_remove; + + return 0; + +err_filemap_remove: + if (ops->filemap_remove) + ops->filemap_remove(folio, state->vma); +err_folio_put: folio_put(folio); - goto out; + return ret; } -static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_copy(struct mfill_state *state) { - struct folio *folio; - int ret = -ENOMEM; - - folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); - if (!folio) - return ret; - - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_put; + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); /* - * The memory barrier 
inside __folio_mark_uptodate makes sure that - * zeroing out the folio become visible before mapping the page - * using set_pte_at(). See do_anonymous_page(). + * The normal page fault path for a MAP_PRIVATE mapping in a + * file-backed VMA will invoke the fault, fill the hole in the file and + * COW it right away. The result generates plain anonymous memory. + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll + * generate anonymous memory directly without actually filling the + * hole. For the MAP_PRIVATE case the robustness check only happens in + * the pagetable (to verify it's still none) and not in the page cache. */ - __folio_mark_uptodate(folio); + if (!(state->vma->vm_flags & VM_SHARED)) + ops = &anon_uffd_ops; - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, 0); - if (ret) - goto out_put; + return __mfill_atomic_pte(state, ops); +} - return 0; -out_put: - folio_put(folio); - return ret; +static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); + + return __mfill_atomic_pte(state, ops); } -static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_zeropage(struct mfill_state *state) { + struct vm_area_struct *dst_vma = state->vma; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; - if (mm_forbids_zeropage(dst_vma->vm_mm)) - return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + if (mm_forbids_zeropage(dst_vma->vm_mm) || + (dst_vma->vm_flags & VM_SHARED)) + return mfill_atomic_pte_zeroed_folio(state); _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); @@ -381,28 +606,29 @@ out: } /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ -static int mfill_atomic_pte_continue(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_continue(struct mfill_state *state) { - struct inode *inode = file_inode(dst_vma->vm_file); + struct vm_area_struct *dst_vma = state->vma; + const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); + unsigned long dst_addr = state->dst_addr; pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct inode *inode = file_inode(dst_vma->vm_file); + uffd_flags_t flags = state->flags; + pmd_t *dst_pmd = state->pmd; struct folio *folio; struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find folio */ - if (ret == -ENOENT) - ret = -EFAULT; - if (ret) - goto out; - if (!folio) { - ret = -EFAULT; - goto out; + if (!ops) { + VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); + return -EOPNOTSUPP; } + folio = ops->get_folio_noalloc(inode, pgoff); + /* Our caller expects us to return -EFAULT if we failed to find folio */ + if (IS_ERR_OR_NULL(folio)) + return -EFAULT; + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; @@ -410,30 +636,28 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, flags); + page, flags); if (ret) goto out_release; - folio_unlock(folio); - ret = 0; -out: - return ret; + return 0; + out_release: folio_unlock(folio); folio_put(folio); - goto out; + return ret; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. 
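__mfill_atomic_pte() pairs each step with a matching undo on failure: if ops->filemap_add() succeeded but installing the PTE did not, ops->filemap_remove() runs before the folio reference is dropped. A minimal sketch of that staged goto-unwind idiom, with made-up step names:

#include <stdlib.h>

struct resource { int registered; };

static int do_register(struct resource *r) { r->registered = 1; return 0; }
static void do_unregister(struct resource *r) { r->registered = 0; }
static int do_install(struct resource *r) { return r ? 0 : -1; }

static int setup(void)
{
	struct resource *r = calloc(1, sizeof(*r));
	int err;

	if (!r)
		return -1;

	err = do_register(r);
	if (err)
		goto err_free;

	err = do_install(r);
	if (err)
		goto err_unregister;

	return 0;

err_unregister:
	do_unregister(r);	/* undo only the steps that completed */
err_free:
	free(r);
	return err;
}
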
*/ -static int mfill_atomic_pte_poison(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_poison(struct mfill_state *state) { - int ret; + struct vm_area_struct *dst_vma = state->vma; struct mm_struct *dst_mm = dst_vma->vm_mm; + unsigned long dst_addr = state->dst_addr; + pmd_t *dst_pmd = state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; + int ret; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); ret = -EAGAIN; @@ -462,27 +686,6 @@ out: return ret; } -static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - pgd = pgd_offset(mm, address); - p4d = p4d_alloc(mm, pgd, address); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, address); - if (!pud) - return NULL; - /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. - */ - return pmd_alloc(mm, pud, address); -} - #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is @@ -657,48 +860,21 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ -static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { - ssize_t err; - - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { - return mfill_atomic_pte_continue(dst_pmd, dst_vma, - dst_addr, flags); - } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { - return mfill_atomic_pte_poison(dst_pmd, dst_vma, - dst_addr, flags); - } - - /* - * The normal page fault path for a shmem will invoke the - * fault, fill the hole in the file and COW it right away. The - * result generates plain anonymous memory. So when we are - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll - * generate anonymous memory directly without actually filling - * the hole. For the MAP_PRIVATE case the robustness check - * only happens in the pagetable (to verify it's still none) - * and not in the radix tree. 
- */ - if (!(dst_vma->vm_flags & VM_SHARED)) { - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err = mfill_atomic_pte_copy(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - else - err = mfill_atomic_pte_zeropage(dst_pmd, - dst_vma, dst_addr); - } else { - err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - } - - return err; + uffd_flags_t flags = state->flags; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + return mfill_atomic_pte_continue(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) + return mfill_atomic_pte_poison(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + return mfill_atomic_pte_copy(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) + return mfill_atomic_pte_zeropage(state); + + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); + return -EOPNOTSUPP; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, @@ -707,13 +883,17 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long len, uffd_flags_t flags) { - struct mm_struct *dst_mm = ctx->mm; - struct vm_area_struct *dst_vma; + struct mfill_state state = (struct mfill_state){ + .ctx = ctx, + .dst_start = dst_start, + .src_start = src_start, + .flags = flags, + .len = len, + .src_addr = src_start, + .dst_addr = dst_start, + }; + long copied = 0; ssize_t err; - pmd_t *dst_pmd; - unsigned long src_addr, dst_addr; - long copied; - struct folio *folio; /* * Sanitize the command parameters: @@ -725,125 +905,35 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(src_start + len <= src_start); VM_WARN_ON_ONCE(dst_start + len <= dst_start); - src_addr = src_start; - dst_addr = dst_start; - copied = 0; - folio = NULL; -retry: - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); - if (IS_ERR(dst_vma)) { - err = PTR_ERR(dst_vma); + err = mfill_get_vma(&state); + if (err) goto out; - } - - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - down_read(&ctx->map_changing_lock); - err = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) - goto out_unlock; - - err = -EINVAL; - /* - * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but - * it will overwrite vm_ops, so vma_is_anonymous must return false. - */ - if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && - dst_vma->vm_flags & VM_SHARED)) - goto out_unlock; - - /* - * validate 'mode' now that we know the dst_vma: don't allow - * a wrprotect copy if the userfaultfd didn't register as WP. 
- */ - if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) - goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ - if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + if (is_vm_hugetlb_page(state.vma)) + return mfill_atomic_hugetlb(ctx, state.vma, dst_start, src_start, len, flags); - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) - goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) - goto out_unlock; - - while (src_addr < src_start + len) { - pmd_t dst_pmdval; - - VM_WARN_ON_ONCE(dst_addr >= dst_start + len); + while (state.src_addr < src_start + len) { + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len); - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); - if (unlikely(!dst_pmd)) { - err = -ENOMEM; + err = mfill_establish_pmd(&state); + if (err) break; - } - dst_pmdval = pmdp_get_lockless(dst_pmd); - if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd))) { - err = -ENOMEM; - break; - } - dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is THP don't override it and just be strict. - * (This includes the case where the PMD used to be THP and - * changed back to none after __pte_alloc().) - */ - if (unlikely(!pmd_present(dst_pmdval) || - pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } - if (unlikely(pmd_bad(dst_pmdval))) { - err = -EFAULT; - break; - } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. */ - err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, - src_addr, flags, &folio); + err = mfill_atomic_pte(&state); cond_resched(); - if (unlikely(err == -ENOENT)) { - void *kaddr; - - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); - VM_WARN_ON_ONCE(!folio); - - kaddr = kmap_local_folio(folio, 0); - err = copy_from_user(kaddr, - (const void __user *) src_addr, - PAGE_SIZE); - kunmap_local(kaddr); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - flush_dcache_folio(folio); - goto retry; - } else - VM_WARN_ON_ONCE(folio); - if (!err) { - dst_addr += PAGE_SIZE; - src_addr += PAGE_SIZE; + state.dst_addr += PAGE_SIZE; + state.src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) @@ -853,12 +943,8 @@ retry: break; } -out_unlock: - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); + mfill_put_vma(&state); out: - if (folio) - folio_put(folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); @@ -1938,6 +2024,38 @@ out: return moved ? moved : err; } +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async) +{ + const struct vm_uffd_ops *ops = vma_uffd_ops(vma); + + if (vma->vm_flags & VM_DROPPABLE) + return false; + + vm_flags &= __VM_UFFD_FLAGS; + + /* + * If WP is the only mode enabled and context is wp async, allow any + * memory type. 
+ */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + + /* For any other mode reject VMAs that don't implement vm_uffd_ops */ + if (!ops) + return false; + + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then only anonymous memory is supported + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) + return false; + + return ops->can_userfault(vma, vm_flags); +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t vm_flags) { diff --git a/mm/util.c b/mm/util.c index f063fd4de1e8..3cc949a0b7ed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1232,7 +1232,7 @@ int __compat_vma_mmap(struct vm_area_desc *desc, /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ - return mmap_action_complete(vma, &desc->action); + return mmap_action_complete(vma, &desc->action, /*is_compat=*/true); } EXPORT_SYMBOL(__compat_vma_mmap); @@ -1281,16 +1281,6 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) } EXPORT_SYMBOL(compat_vma_mmap); -int __vma_check_mmap_hook(struct vm_area_struct *vma) -{ - /* vm_ops->mapped is not valid if mmap() is specified. */ - if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL(__vma_check_mmap_hook); - static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { @@ -1399,7 +1389,8 @@ static int call_vma_mapped(struct vm_area_struct *vma) } static int mmap_action_finish(struct vm_area_struct *vma, - struct mmap_action *action, int err) + struct mmap_action *action, int err, + bool is_compat) { size_t len; @@ -1410,8 +1401,12 @@ static int mmap_action_finish(struct vm_area_struct *vma, /* do_munmap() might take rmap lock, so release if held. */ maybe_rmap_unlock_action(vma, action); - if (!err) - return 0; + /* + * If this is invoked from the compatibility layer, post-mmap() hook + * logic will handle cleanup for us. + */ + if (!err || is_compat) + return err; /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1461,13 +1456,15 @@ EXPORT_SYMBOL(mmap_action_prepare); * mmap_action_complete - Execute VMA descriptor action. * @vma: The VMA to perform the action upon. * @action: The action to perform. + * @is_compat: Is this being invoked from the compatibility layer? * * Similar to mmap_action_prepare(). * - * Return: 0 on success, or error, at which point the VMA will be unmapped. + * Return: 0 on success, or error, at which point the VMA will be unmapped if + * !@is_compat. 
*/ int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, bool is_compat) { int err = 0; @@ -1488,7 +1485,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #else @@ -1510,7 +1507,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) EXPORT_SYMBOL(mmap_action_prepare); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, + bool is_compat) { int err = 0; @@ -1527,7 +1525,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #endif @@ -2780,7 +2780,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = mmap_action_complete(vma, &desc.action); + error = mmap_action_complete(vma, &desc.action, + /*is_compat=*/false); if (error) return error; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b31b208f6ecb..c31a8615a832 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4361,7 +4361,7 @@ need_realloc: return NULL; if (p) { - memcpy(n, p, old_size); + memcpy(n, p, min(size, old_size)); vfree(p); } @@ -5416,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct vmap_node *vn; + guard(mutex)(&vmap_purge_lock); for_each_vmap_node(vn) decay_va_pool_node(vn, true); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4bf091b1c8af..bd1b1aa12581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif -/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to - * and including the specified highidx - * @zone: The current zone in the iterator - * @pgdat: The pgdat which node_zones are being iterated - * @idx: The index variable - * @highidx: The index of the highest zone to return - * - * This macro iterates through all managed zones up to and including the specified highidx. - * The zone iterator enters an invalid state after macro call and must be reinitialized - * before it can be used again. - */ -#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ - for ((idx) = 0, (zone) = (pgdat)->node_zones; \ - (idx) <= (highidx); \ - (idx)++, (zone)++) \ - if (!managed_zone(zone)) \ - continue; \ - else - static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -409,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; @@ -1831,7 +1811,7 @@ bool folio_isolate_lru(struct folio *folio) folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); ret = true; } @@ -1885,24 +1865,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. 
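move_folios_to_lru() now starts with no lruvec lock held and relocks per folio via folio_lruvec_relock_irq(), switching locks only when consecutive folios belong to different lruvecs; this is needed once a folio can be reparented to another memcg while it sits on the private list. A generic sketch of the relock-on-bucket-change loop, using pthread mutexes in place of the lru locks and omitting the under-lock recheck the kernel helper performs:

#include <pthread.h>

struct bucket {
	pthread_mutex_t lock;
	long count;
};

struct item {
	struct bucket *home;	/* bucket the item currently belongs to */
	struct item *next;
};

static void drain_items(struct item *head)
{
	struct bucket *locked = NULL;

	for (struct item *it = head; it; it = it->next) {
		if (it->home != locked) {
			if (locked)
				pthread_mutex_unlock(&locked->lock);
			locked = it->home;
			pthread_mutex_lock(&locked->lock);
		}
		locked->count++;	/* "add to the list" under the right lock */
	}

	if (locked)
		pthread_mutex_unlock(&locked->lock);
}
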
* - * Returns the number of pages moved to the given lruvec. + * Returns the number of pages moved to the appropriate lruvec. + * + * Note: The caller must not hold any lruvec lock. */ -static unsigned int move_folios_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct list_head *list) { int nr_pages, nr_moved = 0; + struct lruvec *lruvec = NULL; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); + lruvec = folio_lruvec_relock_irq(folio, lruvec); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); folio_putback_lru(folio); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; continue; } @@ -1924,20 +1907,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); + lruvec = NULL; } continue; } - /* - * All pages were isolated from the same lruvec (and isolation - * inhibits memcg migration). - */ - VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; @@ -1945,11 +1923,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } + if (lruvec) + lruvec_unlock_irq(lruvec); + if (free_folios.nr) { - spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); - spin_lock_irq(&lruvec->lru_lock); } return nr_moved; @@ -1998,7 +1977,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); @@ -2008,7 +1987,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, item, nr_scanned); mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (nr_taken == 0) return 0; @@ -2016,16 +1995,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, lruvec_memcg(lruvec)); - spin_lock_irq(&lruvec->lru_lock); - move_folios_to_lru(lruvec, &folio_list); + move_folios_to_lru(&folio_list); mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), stat.nr_demoted); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); mod_lruvec_state(lruvec, item, nr_reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2104,7 +2083,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2113,7 +2092,7 @@ static void shrink_active_list(unsigned long nr_to_scan, mod_lruvec_state(lruvec, PGREFILL, nr_scanned); 
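With the statistics now updated after the lru lock is released, these hunks switch from the double-underscore updaters (__mod_node_page_state(), __count_vm_events(), which expect a non-preemptible or IRQ-off context) to the plain variants that are safe from any context. A userspace analogue of the same rule, where the cheap non-atomic update is only legal while holding the lock:

#include <pthread.h>
#include <stdatomic.h>

struct stats {
	pthread_mutex_t lock;
	long locked_counter;		/* only touched with lock held */
	_Atomic long lockless_counter;	/* may be updated from any context */
};

static void update_under_lock(struct stats *s, long delta)
{
	pthread_mutex_lock(&s->lock);
	s->locked_counter += delta;	/* plain add: serialized by the lock */
	pthread_mutex_unlock(&s->lock);
}

static void update_without_lock(struct stats *s, long delta)
{
	atomic_fetch_add(&s->lockless_counter, delta);	/* must be atomic */
}
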
- spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); while (!list_empty(&l_hold)) { struct folio *folio; @@ -2162,16 +2141,14 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move folios back to the lru list. */ - spin_lock_irq(&lruvec->lru_lock); - - nr_activate = move_folios_to_lru(lruvec, &l_active); - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); + nr_activate = move_folios_to_lru(&l_active); + nr_deactivate = move_folios_to_lru(&l_inactive); - __count_vm_events(PGDEACTIVATE, nr_deactivate); + count_vm_events(PGDEACTIVATE, nr_deactivate); count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - + lruvec_lock_irq(lruvec); lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); @@ -2886,8 +2863,9 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) return NULL; clear_bit(key, &mm->lru_gen.bitmap); + mmgrab(mm); - return mmget_not_zero(mm) ? mm : NULL; + return mm; } void lru_gen_add_mm(struct mm_struct *mm) @@ -3087,7 +3065,7 @@ done: reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) - mmput_async(*iter); + mmdrop(*iter); *iter = mm; @@ -3442,8 +3420,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, if (folio_nid(folio) != pgdat->node_id) return NULL; + rcu_read_lock(); if (folio_memcg(folio) != memcg) - return NULL; + folio = NULL; + rcu_read_unlock(); return folio; } @@ -3803,9 +3783,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) } if (walk->batched) { - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); reset_batch_size(walk); - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -3965,7 +3945,7 @@ restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -3980,7 +3960,7 @@ restart: if (inc_min_seq(lruvec, type, swappiness)) continue; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); goto restart; } @@ -4015,7 +3995,7 @@ restart: /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); return success; } @@ -4213,12 +4193,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - DEFINE_MAX_SEQ(lruvec); - int gen = lru_gen_from_seq(max_seq); + struct lruvec *lruvec; + struct lru_gen_mm_state *mm_state; + unsigned long max_seq; + int gen; lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4253,6 +4233,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) } } + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + max_seq = READ_ONCE((lruvec)->lrugen.max_seq); + gen = lru_gen_from_seq(max_seq); + mm_state = 
get_mm_state(lruvec); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4302,6 +4288,8 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); + mem_cgroup_put(memcg); + return true; } @@ -4437,6 +4425,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + return false; + } + + return true; +} + +static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg, + struct lruvec *lruvec) +{ + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + int swappiness = mem_cgroup_swappiness(memcg); + DEFINE_MAX_SEQ(lruvec); + bool success = false; + + /* + * We are not iterating the mm_list here, updating mm_state->seq is just + * to make mm walkers work properly. + */ + if (mm_state) { + spin_lock(&mm_list->lock); + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + if (max_seq > mm_state->seq) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + success = true; + } + spin_unlock(&mm_list->lock); + } else { + success = true; + } + + if (success) + inc_max_seq(lruvec, max_seq, swappiness); +} + +/* + * We need to ensure that the folios of child memcg can be reparented to the + * same gen of the parent memcg, so the gens of the parent memcg needed be + * incremented to the MAX_NR_GENS before reparenting. + */ +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + while (get_nr_gens(lruvec, type) < MAX_NR_GENS) { + try_to_inc_max_seq_nowalk(memcg, lruvec); + cond_resched(); + } + } +} + +/* + * Compared to traditional LRU, MGLRU faces the following challenges: + * + * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the + * number of generations of the parent and child memcg may be different, + * so we cannot simply transfer MGLRU folios in the child memcg to the + * parent memcg as we did for traditional LRU folios. + * 2. The generation information is stored in folio->flags, but we cannot + * traverse these folios while holding the lru lock, otherwise it may + * cause softlockup. + * 3. In walk_update_folio(), the gen of folio and corresponding lru size + * may be updated, but the folio is not immediately moved to the + * corresponding lru list. Therefore, there may be folios of different + * generations on an LRU list. + * 4. In lru_gen_del_folio(), the generation to which the folio belongs is + * found based on the generation information in folio->flags, and the + * corresponding LRU size will be updated. Therefore, we need to update + * the lru size correctly during reparenting, otherwise the lru size may + * be updated incorrectly in lru_gen_del_folio(). + * + * Finally, we choose a compromise method, which is to splice the lru list in + * the child memcg to the lru list of the same generation in the parent memcg + * during reparenting. + * + * The same generation has different meanings in the parent and child memcg, + * so this compromise method will cause the LRU inversion problem. But as the + * system runs, this problem will be fixed automatically. 
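lruvec_reparent_lru() and __lru_gen_reparent_memcg() move a child's folios to its parent by splicing whole lists and transferring the matching page counts, rather than walking folios one by one. A toy sketch of that bulk-move bookkeeping with a singly-linked list:

#include <stddef.h>

struct node { struct node *next; };

struct bucket {
	struct node *head;
	struct node *tail;
	long nr;
};

/* Move everything from @child into @parent and carry the count across. */
static void reparent_bucket(struct bucket *child, struct bucket *parent)
{
	if (!child->head)
		return;

	if (parent->tail)
		parent->tail->next = child->head;	/* splice at the tail: child entries assumed colder */
	else
		parent->head = child->head;
	parent->tail = child->tail;

	parent->nr += child->nr;
	child->head = child->tail = NULL;
	child->nr = 0;
}
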
+ */ +static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec, + int zone, int type) +{ + struct lru_gen_folio *child_lrugen, *parent_lrugen; + enum lru_list lru = type * LRU_INACTIVE_FILE; + int i; + + child_lrugen = &child_lruvec->lrugen; + parent_lrugen = &parent_lruvec->lrugen; + + for (i = 0; i < get_nr_gens(child_lruvec, type); i++) { + int gen = lru_gen_from_seq(child_lrugen->max_seq - i); + long nr_pages = child_lrugen->nr_pages[gen][type][zone]; + int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0; + int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0; + + /* Assuming that child pages are colder than parent pages */ + list_splice_tail_init(&child_lrugen->folios[gen][type][zone], + &parent_lrugen->folios[gen][type][zone]); + + WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0); + WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone], + parent_lrugen->nr_pages[gen][type][zone] + nr_pages); + + if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) { + __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages); + __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages); + } + } +} + +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ + struct lruvec *child_lruvec, *parent_lruvec; + int type, zid; + struct zone *zone; + enum lru_list lru; + + child_lruvec = get_lruvec(memcg, nid); + parent_lruvec = get_lruvec(parent, nid); + + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) + for (type = 0; type < ANON_AND_FILE; type++) + __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type); + + for_each_lru(lru) { + for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) { + unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid); + + mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size); + } + } +} + #endif /* CONFIG_MEMCG */ /****************************************************************************** @@ -4630,7 +4760,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; /* * To leave a margin for fluctuations, use a larger gain factor (2:3). 
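One detail worth noting in the "struct ctrl_pos sp, pv = {};" hunks: the empty initializer applies to pv only, so pv starts zeroed while sp remains indeterminate until read_ctrl_pos() fills it. A tiny illustration of that C declaration rule:

#include <stdio.h>

struct pair {
	long a;
	long b;
};

int main(void)
{
	struct pair x, y = {0};	/* initializer applies to y only; x is indeterminate */

	x.a = 1;		/* x must be fully written before it is read */
	x.b = 2;
	printf("%ld %ld / %ld %ld\n", x.a, x.b, y.a, y.b);
	return 0;
}
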
@@ -4649,7 +4779,7 @@ static int get_tier_idx(struct lruvec *lruvec, int type) static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - struct ctrl_pos sp, pv; + struct ctrl_pos sp, pv = {}; if (swappiness <= MIN_SWAPPINESS + 1) return LRU_GEN_FILE; @@ -4707,7 +4837,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); @@ -4716,7 +4846,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); if (list_empty(&list)) return scanned; @@ -4749,14 +4879,14 @@ retry: set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } - spin_lock_irq(&lruvec->lru_lock); - - move_folios_to_lru(lruvec, &list); + move_folios_to_lru(&list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; + lruvec_lock_irq(lruvec); reset_batch_size(walk); + lruvec_unlock_irq(lruvec); } mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), @@ -4766,8 +4896,6 @@ retry: mod_lruvec_state(lruvec, item, reclaimed); mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - list_splice_init(&clean, &list); if (!list_empty(&list)) { @@ -4843,10 +4971,6 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) int i; enum zone_watermarks mark; - /* don't abort memcg reclaim to ensure fairness */ - if (!root_reclaim(sc)) - return false; - if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; @@ -4900,9 +5024,24 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * If too many file cache in the coldest generation can't be evicted * due to being dirty, wake up the flusher. */ - if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5196,7 +5335,7 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); @@ -5204,12 +5343,12 @@ static void lru_gen_change_state(bool enabled) lruvec->lrugen.enabled = enabled; while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); cond_resched(); - spin_lock_irq(&lruvec->lru_lock); + lruvec_lock_irq(lruvec); } - spin_unlock_irq(&lruvec->lru_lock); + lruvec_unlock_irq(lruvec); } cond_resched(); @@ -7898,7 +8037,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch) if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - unlock_page_lruvec_irq(lruvec); + lruvec_unlock_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } diff --git a/mm/vmstat.c b/mm/vmstat.c index c360c1b29ac9..f534972f517d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2141,7 +2141,7 @@ static void vmstat_shepherd(struct work_struct *w) if (cpu_is_isolated(cpu)) continue; - if (!delayed_work_pending(dw) && need_update(cpu)) + if (!work_busy(&dw->work) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } diff --git a/mm/workingset.c b/mm/workingset.c index 37a94979900f..07e6836d0502 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -244,12 +244,15 @@ static void *lru_gen_eviction(struct folio *folio) int refs = folio_lru_refs(folio); bool workingset = folio_test_workingset(folio); int tier = lru_tier_from_refs(refs, workingset); - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; struct pglist_data *pgdat = folio_pgdat(folio); + unsigned short memcg_id; BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); + rcu_read_lock(); + memcg = folio_memcg(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); @@ -257,8 +260,10 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + memcg_id = mem_cgroup_private_id(memcg); + rcu_read_unlock(); - return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type); + return pack_shadow(memcg_id, pgdat, token, workingset, type); } /* @@ -541,7 +546,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); - struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; @@ -564,14 +568,12 @@ void workingset_refault(struct folio *folio, void *shadow) * locked to guarantee folio_memcg() stability throughout. */ nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - + memcg = get_mem_cgroup_from_folio(folio); + lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio)); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset, true)) - return; + goto out; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); @@ -587,6 +589,8 @@ void workingset_refault(struct folio *folio, void *shadow) lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } +out: + mem_cgroup_put(memcg); } /** @@ -599,8 +603,11 @@ void workingset_activation(struct folio *folio) * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. 
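A theme running through the workingset.c, vmscan.c and zswap.c hunks in this diff is that a folio's memcg binding can change underneath the reader once reparenting is possible, so folio_memcg()/folio_lruvec() lookups are wrapped in rcu_read_lock(), and code that needs the memcg for longer takes a reference via get_mem_cgroup_from_folio() and drops it with mem_cgroup_put(). A userspace analogue of the two patterns, with an rwlock standing in for the RCU read section and an explicit refcount standing in for the css reference:

#include <pthread.h>
#include <stdatomic.h>

struct group {
	_Atomic long refcount;
};

struct folio_like {
	pthread_rwlock_t binding_lock;	/* stands in for the RCU read section */
	struct group *grp;		/* may be switched to the parent at any time */
};

/* Short use: look and act entirely inside the read-side section. */
static long peek_group(struct folio_like *f, long (*read_stat)(struct group *))
{
	long v;

	pthread_rwlock_rdlock(&f->binding_lock);
	v = read_stat(f->grp);
	pthread_rwlock_unlock(&f->binding_lock);
	return v;
}

/* Longer use: pin the group with a reference before leaving the section. */
static struct group *get_group(struct folio_like *f)
{
	struct group *g;

	pthread_rwlock_rdlock(&f->binding_lock);
	g = f->grp;
	atomic_fetch_add(&g->refcount, 1);
	pthread_rwlock_unlock(&f->binding_lock);
	return g;
}

static void put_group(struct group *g)
{
	atomic_fetch_sub(&g->refcount, 1);	/* real code would free on zero */
}
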
*/ - if (mem_cgroup_disabled() || folio_memcg_charged(folio)) + if (mem_cgroup_disabled() || folio_memcg_charged(folio)) { + rcu_read_lock(); workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); + rcu_read_unlock(); + } } /* @@ -684,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) - pages += lruvec_page_state_local(lruvec, - NR_LRU_BASE + i); + pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); + pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( diff --git a/mm/zswap.c b/mm/zswap.c index 0823cadd02b6..4b5149173b0e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -242,6 +242,34 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp) **********************************/ static void __zswap_pool_empty(struct percpu_ref *ref); +static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx) +{ + if (!acomp_ctx) + return; + + /* + * If there was an error in allocating @acomp_ctx->req, it + * would be set to NULL. + */ + if (acomp_ctx->req) + acomp_request_free(acomp_ctx->req); + + acomp_ctx->req = NULL; + + /* + * We have to handle both cases here: an error pointer return from + * crypto_alloc_acomp_node(); and a) NULL initialization by zswap, or + * b) NULL assignment done in a previous call to acomp_ctx_free(). + */ + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + + acomp_ctx->acomp = NULL; + + kfree(acomp_ctx->buffer); + acomp_ctx->buffer = NULL; +} + static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; @@ -263,19 +291,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor) strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + /* Many things rely on the zero-initialization. */ + pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx, + GFP_KERNEL | __GFP_ZERO); if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } - for_each_possible_cpu(cpu) - mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); - + /* + * This is serialized against CPU hotplug operations. Hence, cores + * cannot be offlined until this finishes. + */ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + /* + * cpuhp_state_add_instance() will not cleanup on failure since + * we don't register a hotunplug callback. 
+ */ if (ret) - goto error; + goto cpuhp_add_fail; /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool @@ -292,6 +328,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor) ref_fail: cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + +cpuhp_add_fail: + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -322,9 +362,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { + int cpu; + zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + + for_each_possible_cpu(cpu) + acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu)); + free_percpu(pool->acomp_ctx); zs_destroy_pool(pool->zs_pool); @@ -664,8 +710,10 @@ void zswap_folio_swapin(struct folio *folio) struct lruvec *lruvec; if (folio) { + rcu_read_lock(); lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); + rcu_read_unlock(); } } @@ -736,39 +784,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp = NULL; - struct acomp_req *req = NULL; - u8 *buffer = NULL; - int ret; + int ret = -ENOMEM; - buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); - if (!buffer) { - ret = -ENOMEM; - goto fail; + /* + * To handle cases where the CPU goes through online-offline-online + * transitions, we return if the acomp_ctx has already been initialized. + */ + if (acomp_ctx->acomp) { + WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp)); + return 0; } - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return ret; + + /* + * In case of an error, crypto_alloc_acomp_node() returns an + * error pointer, never NULL. + */ + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp_ctx->acomp)) { pr_err("could not alloc crypto acomp %s : %pe\n", - pool->tfm_name, acomp); - ret = PTR_ERR(acomp); + pool->tfm_name, acomp_ctx->acomp); + ret = PTR_ERR(acomp_ctx->acomp); goto fail; } - req = acomp_request_alloc(acomp); - if (!req) { + /* acomp_request_alloc() returns NULL in case of an error. */ + acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp); + if (!acomp_ctx->req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); - ret = -ENOMEM; goto fail; } - /* - * Only hold the mutex after completing allocations, otherwise we may - * recurse into zswap through reclaim and attempt to hold the mutex - * again resulting in a deadlock. - */ - mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); /* @@ -776,80 +826,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking. 
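zswap_cpu_comp_prepare() becomes idempotent: it returns early when the per-CPU context was already set up by an earlier online event, and all failure paths go through acomp_ctx_free(), which NULLs whatever it releases so it is safe to call again, including for CPUs whose setup never completed. A compact userspace sketch of that idiom:

#include <stdlib.h>

struct ctx {
	void *buffer;
	void *engine;
};

static void ctx_free(struct ctx *c)
{
	if (!c)
		return;
	free(c->buffer);
	c->buffer = NULL;	/* NULL out so a repeated call is harmless */
	free(c->engine);
	c->engine = NULL;
}

static int ctx_prepare(struct ctx *c)
{
	if (c->engine)		/* already initialised by an earlier online event */
		return 0;

	c->buffer = malloc(4096);
	if (!c->buffer)
		return -1;

	c->engine = malloc(64);
	if (!c->engine)
		goto fail;

	return 0;
fail:
	ctx_free(c);		/* single cleanup path, safe on partial setup */
	return -1;
}
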
*/ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->buffer = buffer; - acomp_ctx->acomp = acomp; - acomp_ctx->req = req; - mutex_unlock(&acomp_ctx->mutex); + mutex_init(&acomp_ctx->mutex); return 0; fail: - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); + acomp_ctx_free(acomp_ctx); return ret; } -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct acomp_req *req; - struct crypto_acomp *acomp; - u8 *buffer; - - if (IS_ERR_OR_NULL(acomp_ctx)) - return 0; - - mutex_lock(&acomp_ctx->mutex); - req = acomp_ctx->req; - acomp = acomp_ctx->acomp; - buffer = acomp_ctx->buffer; - acomp_ctx->req = NULL; - acomp_ctx->acomp = NULL; - acomp_ctx->buffer = NULL; - mutex_unlock(&acomp_ctx->mutex); - - /* - * Do the actual freeing after releasing the mutex to avoid subtle - * locking dependencies causing deadlocks. - */ - if (!IS_ERR_OR_NULL(req)) - acomp_request_free(req); - if (!IS_ERR_OR_NULL(acomp)) - crypto_free_acomp(acomp); - kfree(buffer); - - return 0; -} - -static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) -{ - struct crypto_acomp_ctx *acomp_ctx; - - for (;;) { - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - if (likely(acomp_ctx->req)) - return acomp_ctx; - /* - * It is possible that we were migrated to a different CPU after - * getting the per-CPU ctx but before the mutex was acquired. If - * the old CPU got offlined, zswap_cpu_comp_dead() could have - * already freed ctx->req (among other things) and set it to - * NULL. Just try again on the new CPU that we ended up on. - */ - mutex_unlock(&acomp_ctx->mutex); - } -} - -static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) -{ - mutex_unlock(&acomp_ctx->mutex); -} - static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -862,7 +849,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, u8 *dst; bool mapped = false; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); @@ -893,11 +882,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, * to the active LRU list in the case. */ if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + rcu_read_lock(); if (!mem_cgroup_zswap_writeback_enabled( folio_memcg(page_folio(page)))) { + rcu_read_unlock(); comp_ret = comp_ret ? comp_ret : -EINVAL; goto unlock; } + rcu_read_unlock(); comp_ret = 0; dlen = PAGE_SIZE; dst = kmap_local_page(page); @@ -925,7 +917,7 @@ unlock: else if (alloc_ret) zswap_reject_alloc_fail++; - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); return comp_ret == 0 && alloc_ret == 0; } @@ -937,7 +929,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; int ret = 0, dlen; - acomp_ctx = acomp_ctx_get_cpu_lock(pool); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length); /* zswap entries of length PAGE_SIZE are not compressed. 
*/ @@ -962,7 +955,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) } zs_obj_read_sg_end(pool->zs_pool, entry->handle); - acomp_ctx_put_unlock(acomp_ctx); + mutex_unlock(&acomp_ctx->mutex); if (!ret && dlen == PAGE_SIZE) return true; @@ -1782,7 +1775,7 @@ static int zswap_setup(void) ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, - zswap_cpu_comp_dead); + NULL); if (ret) goto hp_fail; |
