Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                |   1
-rw-r--r--  mm/Kconfig.debug          |  11
-rw-r--r--  mm/backing-dev.c          |   5
-rw-r--r--  mm/compaction.c           |  43
-rw-r--r--  mm/damon/core.c           | 101
-rw-r--r--  mm/damon/lru_sort.c       |  85
-rw-r--r--  mm/damon/reclaim.c        |  85
-rw-r--r--  mm/damon/stat.c           |  35
-rw-r--r--  mm/damon/sysfs-schemes.c  |  24
-rw-r--r--  mm/filemap.c              |   3
-rw-r--r--  mm/huge_memory.c          |  22
-rw-r--r--  mm/hugetlb.c              |  18
-rw-r--r--  mm/hugetlb_cma.c          |   1
-rw-r--r--  mm/internal.h             |  11
-rw-r--r--  mm/kfence/core.c          |   4
-rw-r--r--  mm/kmemleak.c             |   2
-rw-r--r--  mm/memblock.c             | 235
-rw-r--r--  mm/memcontrol-v1.c        |  31
-rw-r--r--  mm/memcontrol-v1.h        |   7
-rw-r--r--  mm/memcontrol.c           | 641
-rw-r--r--  mm/memfd_luo.c            |  59
-rw-r--r--  mm/mempolicy.c            |  31
-rw-r--r--  mm/memremap.c             |   2
-rw-r--r--  mm/migrate.c              |   2
-rw-r--r--  mm/migrate_device.c       |   6
-rw-r--r--  mm/mlock.c                |   2
-rw-r--r--  mm/mm_init.c              |  25
-rw-r--r--  mm/mprotect.c             | 218
-rw-r--r--  mm/page-writeback.c       |  16
-rw-r--r--  mm/page_alloc.c           |  61
-rw-r--r--  mm/page_io.c              |  10
-rw-r--r--  mm/page_table_check.c     |  15
-rw-r--r--  mm/percpu.c               |   2
-rw-r--r--  mm/shmem.c                | 176
-rw-r--r--  mm/shrinker.c             |   6
-rw-r--r--  mm/slub.c                 |  28
-rw-r--r--  mm/sparse.c               |   1
-rw-r--r--  mm/swap.c                 |  59
-rw-r--r--  mm/truncate.c             |   6
-rw-r--r--  mm/userfaultfd.c          | 692
-rw-r--r--  mm/util.c                 |  36
-rw-r--r--  mm/vma.c                  |   3
-rw-r--r--  mm/vmalloc.c              |   3
-rw-r--r--  mm/vmscan.c               | 303
-rw-r--r--  mm/vmstat.c               |   2
-rw-r--r--  mm/workingset.c           |  30
-rw-r--r--  mm/zswap.c                | 187
47 files changed, 2088 insertions, 1258 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0a43bb80df4f..e8bf1e9e6ad9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -570,6 +570,7 @@ config SPLIT_PTE_PTLOCKS
depends on !ARM || CPU_CACHE_VIPT
depends on !PARISC || PA20
depends on !SPARC32
+ depends on !UML
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 7638d75b27db..91b3e027b753 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -297,6 +297,17 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
If unsure, say Y.
+config DEBUG_KMEMLEAK_VERBOSE
+ bool "Default kmemleak to verbose mode"
+ depends on DEBUG_KMEMLEAK_AUTO_SCAN
+ help
+ Say Y here to have kmemleak print unreferenced object details
+ (backtrace, hex dump, address) to dmesg when new memory leaks are
+ detected during automatic scanning. This can also be toggled at
+ runtime via /sys/module/kmemleak/parameters/verbose.
+
+ If unsure, say N.
+
config PER_VMA_LOCK_STATS
bool "Statistics for per-vma locks"
depends on PER_VMA_LOCK
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7a18fa6c7272..cecbcf9060a6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -618,12 +618,13 @@ static void cgwb_release_workfn(struct work_struct *work)
wb_shutdown(wb);
css_put(wb->memcg_css);
- css_put(wb->blkcg_css);
- mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
blkcg_unpin_online(wb->blkcg_css);
+ css_put(wb->blkcg_css);
+ mutex_unlock(&wb->bdi->cgwb_release_mutex);
+
fprop_local_destroy_percpu(&wb->memcg_completions);
spin_lock_irq(&cgwb_lock);
diff --git a/mm/compaction.c b/mm/compaction.c
index 1e8f8eca318c..3648ce22c807 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
+static struct lruvec *
+compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags,
+ struct compact_control *cc)
+{
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
+ compact_lock_irqsave(&lruvec->lru_lock, flags, cc);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
+
+ return lruvec;
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. The lock should be periodically unlocked to avoid
@@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;
struct lruvec *locked = NULL;
struct folio *folio = NULL;
@@ -913,7 +931,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -964,7 +982,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/* for alloc_contig case */
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -1053,7 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(page_has_movable_ops(page)) &&
!PageMovableOpsIsolated(page)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!folio_test_clear_lru(folio))
goto isolate_fail_put;
- lruvec = folio_lruvec(folio);
+ if (locked)
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
- if (lruvec != locked) {
+ if (lruvec != locked || !locked) {
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
- compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, folio);
-
/*
* Try get exclusive access under lock. If marked for
* skip, the scan is aborted unless the current context
@@ -1226,7 +1243,7 @@ isolate_success_no_list:
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
folio_put(folio);
@@ -1242,7 +1259,7 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
putback_movable_pages(&cc->migratepages);
@@ -1274,7 +1291,7 @@ isolate_fail:
isolate_abort:
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
if (folio) {
folio_set_lru(folio);
folio_put(folio);
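
The compact_folio_lruvec_lock_irqsave() helper added above is the core locking idiom this series relies on once folios can be reparented to an ancestor memcg: look up the folio's lruvec, take its lru_lock, then verify the lruvec still belongs to the folio's memcg and retry if the folio was reparented in between. A simplified sketch of that idiom follows (not the exact kernel helper; the compaction-specific contended-lock handling is omitted, and the paired lruvec_unlock_*() helpers introduced by these hunks presumably drop the RCU read lock as well):

	/* Simplified sketch of the lock-and-recheck idiom used above. */
	static struct lruvec *lock_folio_lruvec_sketch(struct folio *folio,
						       unsigned long *flags)
	{
		struct lruvec *lruvec;

		/* Hold RCU so the (possibly dying) memcg and its lruvec stay valid. */
		rcu_read_lock();
		for (;;) {
			lruvec = folio_lruvec(folio);	/* may already be stale */
			spin_lock_irqsave(&lruvec->lru_lock, *flags);
			/* Was the folio reparented between lookup and lock? */
			if (lruvec_memcg(lruvec) == folio_memcg(folio))
				break;			/* binding is stable under lru_lock */
			spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
		}
		/* Returned with lru_lock held and RCU still held. */
		return lruvec;
	}
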
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7f04fc3f8c8c..3dbbbfdeff71 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1477,6 +1477,11 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive)
int i;
int err = 0;
+ for (i = 0; i < nr_ctxs; i++) {
+ if (!is_power_of_2(ctxs[i]->min_region_sz))
+ return -EINVAL;
+ }
+
mutex_lock(&damon_lock);
if ((exclusive && nr_running_ctxs) ||
(!exclusive && running_exclusive_ctxs)) {
@@ -1573,35 +1578,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx)
return pid;
}
-/*
- * damon_call_handle_inactive_ctx() - handle DAMON call request that added to
- * an inactive context.
- * @ctx: The inactive DAMON context.
- * @control: Control variable of the call request.
- *
- * This function is called in a case that @control is added to @ctx but @ctx is
- * not running (inactive). See if @ctx handled @control or not, and cleanup
- * @control if it was not handled.
- *
- * Returns 0 if @control was handled by @ctx, negative error code otherwise.
- */
-static int damon_call_handle_inactive_ctx(
- struct damon_ctx *ctx, struct damon_call_control *control)
-{
- struct damon_call_control *c;
-
- mutex_lock(&ctx->call_controls_lock);
- list_for_each_entry(c, &ctx->call_controls, list) {
- if (c == control) {
- list_del(&control->list);
- mutex_unlock(&ctx->call_controls_lock);
- return -EINVAL;
- }
- }
- mutex_unlock(&ctx->call_controls_lock);
- return 0;
-}
-
/**
* damon_call() - Invoke a given function on DAMON worker thread (kdamond).
* @ctx: DAMON context to call the function for.
@@ -1619,6 +1595,10 @@ static int damon_call_handle_inactive_ctx(
* synchronization. The return value of the function will be saved in
* &damon_call_control->return_code.
*
+ * Note that this function should be called only after damon_start() for @ctx
+ * has succeeded.  Otherwise, this function could block indefinitely.
+ *
* Return: 0 on success, negative error code otherwise.
*/
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
@@ -1629,10 +1609,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
INIT_LIST_HEAD(&control->list);
mutex_lock(&ctx->call_controls_lock);
+ if (ctx->call_controls_obsolete) {
+ mutex_unlock(&ctx->call_controls_lock);
+ return -ECANCELED;
+ }
list_add_tail(&control->list, &ctx->call_controls);
mutex_unlock(&ctx->call_controls_lock);
- if (!damon_is_running(ctx))
- return damon_call_handle_inactive_ctx(ctx, control);
if (control->repeat)
return 0;
wait_for_completion(&control->completion);
@@ -1660,6 +1642,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
* passed at least one &damos->apply_interval_us, kdamond marks the request as
* completed so that damos_walk() can wakeup and return.
*
+ * Note that this function should be called only after damon_start() for @ctx
+ * has succeeded.  Otherwise, this function could block indefinitely.
+ *
* Return: 0 on success, negative error code otherwise.
*/
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
@@ -1667,19 +1653,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
init_completion(&control->completion);
control->canceled = false;
mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control_obsolete) {
+ mutex_unlock(&ctx->walk_control_lock);
+ return -ECANCELED;
+ }
if (ctx->walk_control) {
mutex_unlock(&ctx->walk_control_lock);
return -EBUSY;
}
ctx->walk_control = control;
mutex_unlock(&ctx->walk_control_lock);
- if (!damon_is_running(ctx)) {
- mutex_lock(&ctx->walk_control_lock);
- if (ctx->walk_control == control)
- ctx->walk_control = NULL;
- mutex_unlock(&ctx->walk_control_lock);
- return -EINVAL;
- }
wait_for_completion(&control->completion);
if (control->canceled)
return -ECANCELED;
@@ -2239,12 +2222,24 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
#ifdef CONFIG_NUMA
+static bool invalid_mem_node(int nid)
+{
+ return nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY);
+}
+
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
struct sysinfo i;
__kernel_ulong_t numerator;
+ if (invalid_mem_node(goal->nid)) {
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ return 0;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ return 10000;
+ }
+
si_meminfo_node(&i, goal->nid);
if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
numerator = i.totalram - i.freeram;
@@ -2261,6 +2256,13 @@ static unsigned long damos_get_node_memcg_used_bp(
unsigned long used_pages, numerator;
struct sysinfo i;
+ if (invalid_mem_node(goal->nid)) {
+ if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
+ return 0;
+ else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
+ return 10000;
+ }
+
memcg = mem_cgroup_get_from_id(goal->memcg_id);
if (!memcg) {
if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
@@ -2387,7 +2389,8 @@ static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota)
/*
* Called only if quota->ms, or quota->sz are set, or quota->goals is not empty
*/
-static void damos_set_effective_quota(struct damos_quota *quota)
+static void damos_set_effective_quota(struct damos_quota *quota,
+ struct damon_ctx *ctx)
{
unsigned long throughput;
unsigned long esz = ULONG_MAX;
@@ -2412,6 +2415,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
else
throughput = PAGE_SIZE * 1024;
esz = min(throughput * quota->ms, esz);
+ esz = max(ctx->min_region_sz, esz);
}
if (quota->sz && quota->sz < esz)
@@ -2448,11 +2452,12 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* First charge window */
if (!quota->total_charged_sz && !quota->charged_from) {
quota->charged_from = jiffies;
- damos_set_effective_quota(quota);
+ damos_set_effective_quota(quota, c);
}
/* New charge window starts */
- if (time_after_eq(jiffies, quota->charged_from +
+ if (!time_in_range_open(jiffies, quota->charged_from,
+ quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
if (damos_quota_is_set(quota) &&
quota->charged_sz >= quota->esz)
@@ -2462,7 +2467,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->charged_sz = 0;
if (trace_damos_esz_enabled())
cached_esz = quota->esz;
- damos_set_effective_quota(quota);
+ damos_set_effective_quota(quota, c);
if (trace_damos_esz_enabled() && quota->esz != cached_esz)
damos_trace_esz(c, s, quota);
}
@@ -2952,6 +2957,12 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
+ mutex_lock(&ctx->call_controls_lock);
+ ctx->call_controls_obsolete = false;
+ mutex_unlock(&ctx->call_controls_lock);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control_obsolete = false;
+ mutex_unlock(&ctx->walk_control_lock);
complete(&ctx->kdamond_started);
kdamond_init_ctx(ctx);
@@ -3062,7 +3073,13 @@ done:
damon_destroy_targets(ctx);
kfree(ctx->regions_score_histogram);
+ mutex_lock(&ctx->call_controls_lock);
+ ctx->call_controls_obsolete = true;
+ mutex_unlock(&ctx->call_controls_lock);
kdamond_call(ctx, true);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control_obsolete = true;
+ mutex_unlock(&ctx->walk_control_lock);
damos_walk_cancel(ctx);
pr_debug("kdamond (%d) finishes\n", current->pid);
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 554559d72976..8494040b1ee4 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -161,15 +161,6 @@ module_param(monitor_region_end, ulong, 0600);
*/
static unsigned long addr_unit __read_mostly = 1;
-/*
- * PID of the DAMON thread
- *
- * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
- * Else, -1.
- */
-static int kdamond_pid __read_mostly = -1;
-module_param(kdamond_pid, int, 0400);
-
static struct damos_stat damon_lru_sort_hot_stat;
DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat,
lru_sort_tried_hot_regions, lru_sorted_hot_regions,
@@ -386,12 +377,8 @@ static int damon_lru_sort_turn(bool on)
{
int err;
- if (!on) {
- err = damon_stop(&ctx, 1);
- if (!err)
- kdamond_pid = -1;
- return err;
- }
+ if (!on)
+ return damon_stop(&ctx, 1);
err = damon_lru_sort_apply_parameters();
if (err)
@@ -400,9 +387,6 @@ static int damon_lru_sort_turn(bool on)
err = damon_start(&ctx, 1, true);
if (err)
return err;
- kdamond_pid = damon_kdamond_pid(ctx);
- if (kdamond_pid < 0)
- return kdamond_pid;
return damon_call(ctx, &call_control);
}
@@ -430,42 +414,83 @@ module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
MODULE_PARM_DESC(addr_unit,
"Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)");
+static bool damon_lru_sort_enabled(void)
+{
+ if (!ctx)
+ return false;
+ return damon_is_running(ctx);
+}
+
static int damon_lru_sort_enabled_store(const char *val,
const struct kernel_param *kp)
{
- bool is_enabled = enabled;
- bool enable;
int err;
- err = kstrtobool(val, &enable);
+ err = kstrtobool(val, &enabled);
if (err)
return err;
- if (is_enabled == enable)
+ if (damon_lru_sort_enabled() == enabled)
return 0;
/* Called before init function. The function will handle this. */
if (!damon_initialized())
- goto set_param_out;
+ return 0;
- err = damon_lru_sort_turn(enable);
- if (err)
- return err;
+ return damon_lru_sort_turn(enabled);
+}
-set_param_out:
- enabled = enable;
- return err;
+static int damon_lru_sort_enabled_load(char *buffer,
+ const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", damon_lru_sort_enabled() ? 'Y' : 'N');
}
static const struct kernel_param_ops enabled_param_ops = {
.set = damon_lru_sort_enabled_store,
- .get = param_get_bool,
+ .get = damon_lru_sort_enabled_load,
};
module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_LRU_SORT (default: disabled)");
+static int damon_lru_sort_kdamond_pid_store(const char *val,
+ const struct kernel_param *kp)
+{
+ /*
+ * kdamond_pid is read-only, but kernel command line could write it.
+ * Do nothing here.
+ */
+ return 0;
+}
+
+static int damon_lru_sort_kdamond_pid_load(char *buffer,
+ const struct kernel_param *kp)
+{
+ int kdamond_pid = -1;
+
+ if (ctx) {
+ kdamond_pid = damon_kdamond_pid(ctx);
+ if (kdamond_pid < 0)
+ kdamond_pid = -1;
+ }
+ return sprintf(buffer, "%d\n", kdamond_pid);
+}
+
+static const struct kernel_param_ops kdamond_pid_param_ops = {
+ .set = damon_lru_sort_kdamond_pid_store,
+ .get = damon_lru_sort_kdamond_pid_load,
+};
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400);
+
static int __init damon_lru_sort_init(void)
{
int err;
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 86da14778658..fe7fce26cf6c 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -144,15 +144,6 @@ static unsigned long addr_unit __read_mostly = 1;
static bool skip_anon __read_mostly;
module_param(skip_anon, bool, 0600);
-/*
- * PID of the DAMON thread
- *
- * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
- * Else, -1.
- */
-static int kdamond_pid __read_mostly = -1;
-module_param(kdamond_pid, int, 0400);
-
static struct damos_stat damon_reclaim_stat;
DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
reclaim_tried_regions, reclaimed_regions, quota_exceeds);
@@ -288,12 +279,8 @@ static int damon_reclaim_turn(bool on)
{
int err;
- if (!on) {
- err = damon_stop(&ctx, 1);
- if (!err)
- kdamond_pid = -1;
- return err;
- }
+ if (!on)
+ return damon_stop(&ctx, 1);
err = damon_reclaim_apply_parameters();
if (err)
@@ -302,9 +289,6 @@ static int damon_reclaim_turn(bool on)
err = damon_start(&ctx, 1, true);
if (err)
return err;
- kdamond_pid = damon_kdamond_pid(ctx);
- if (kdamond_pid < 0)
- return kdamond_pid;
return damon_call(ctx, &call_control);
}
@@ -332,42 +316,83 @@ module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600);
MODULE_PARM_DESC(addr_unit,
"Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)");
+static bool damon_reclaim_enabled(void)
+{
+ if (!ctx)
+ return false;
+ return damon_is_running(ctx);
+}
+
static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
- bool is_enabled = enabled;
- bool enable;
int err;
- err = kstrtobool(val, &enable);
+ err = kstrtobool(val, &enabled);
if (err)
return err;
- if (is_enabled == enable)
+ if (damon_reclaim_enabled() == enabled)
return 0;
/* Called before init function. The function will handle this. */
if (!damon_initialized())
- goto set_param_out;
+ return 0;
- err = damon_reclaim_turn(enable);
- if (err)
- return err;
+ return damon_reclaim_turn(enabled);
+}
-set_param_out:
- enabled = enable;
- return err;
+static int damon_reclaim_enabled_load(char *buffer,
+ const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", damon_reclaim_enabled() ? 'Y' : 'N');
}
static const struct kernel_param_ops enabled_param_ops = {
.set = damon_reclaim_enabled_store,
- .get = param_get_bool,
+ .get = damon_reclaim_enabled_load,
};
module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
+static int damon_reclaim_kdamond_pid_store(const char *val,
+ const struct kernel_param *kp)
+{
+ /*
+ * kdamond_pid is read-only, but the kernel command line could write it.
+ * Do nothing here.
+ */
+ return 0;
+}
+
+static int damon_reclaim_kdamond_pid_load(char *buffer,
+ const struct kernel_param *kp)
+{
+ int kdamond_pid = -1;
+
+ if (ctx) {
+ kdamond_pid = damon_kdamond_pid(ctx);
+ if (kdamond_pid < 0)
+ kdamond_pid = -1;
+ }
+ return sprintf(buffer, "%d\n", kdamond_pid);
+}
+
+static const struct kernel_param_ops kdamond_pid_param_ops = {
+ .set = damon_reclaim_kdamond_pid_store,
+ .get = damon_reclaim_kdamond_pid_load,
+};
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+module_param_cb(kdamond_pid, &kdamond_pid_param_ops, NULL, 0400);
+
static int __init damon_reclaim_init(void)
{
int err;
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 60351a719460..3951b762cbdd 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -19,14 +19,17 @@
static int damon_stat_enabled_store(
const char *val, const struct kernel_param *kp);
+static int damon_stat_enabled_load(char *buffer,
+ const struct kernel_param *kp);
+
static const struct kernel_param_ops enabled_param_ops = {
.set = damon_stat_enabled_store,
- .get = param_get_bool,
+ .get = damon_stat_enabled_load,
};
static bool enabled __read_mostly = IS_ENABLED(
CONFIG_DAMON_STAT_ENABLED_DEFAULT);
-module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+module_param_cb(enabled, &enabled_param_ops, NULL, 0600);
MODULE_PARM_DESC(enabled, "Enable of disable DAMON_STAT");
static unsigned long estimated_memory_bandwidth __read_mostly;
@@ -255,8 +258,11 @@ static int damon_stat_start(void)
if (!damon_stat_context)
return -ENOMEM;
err = damon_start(&damon_stat_context, 1, true);
- if (err)
+ if (err) {
+ damon_destroy_ctx(damon_stat_context);
+ damon_stat_context = NULL;
return err;
+ }
damon_stat_last_refresh_jiffies = jiffies;
call_control.data = damon_stat_context;
@@ -270,17 +276,23 @@ static void damon_stat_stop(void)
damon_stat_context = NULL;
}
+static bool damon_stat_enabled(void)
+{
+ if (!damon_stat_context)
+ return false;
+ return damon_is_running(damon_stat_context);
+}
+
static int damon_stat_enabled_store(
const char *val, const struct kernel_param *kp)
{
- bool is_enabled = enabled;
int err;
err = kstrtobool(val, &enabled);
if (err)
return err;
- if (is_enabled == enabled)
+ if (damon_stat_enabled() == enabled)
return 0;
if (!damon_initialized())
@@ -290,16 +302,17 @@ static int damon_stat_enabled_store(
*/
return 0;
- if (enabled) {
- err = damon_stat_start();
- if (err)
- enabled = false;
- return err;
- }
+ if (enabled)
+ return damon_stat_start();
damon_stat_stop();
return 0;
}
+static int damon_stat_enabled_load(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", damon_stat_enabled() ? 'Y' : 'N');
+}
+
static int __init damon_stat_init(void)
{
int err = 0;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 5186966dafb3..245d63808411 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -533,9 +533,14 @@ static ssize_t memcg_path_show(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
+ int len;
- return sysfs_emit(buf, "%s\n",
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ len = sysfs_emit(buf, "%s\n",
filter->memcg_path ? filter->memcg_path : "");
+ mutex_unlock(&damon_sysfs_lock);
+ return len;
}
static ssize_t memcg_path_store(struct kobject *kobj,
@@ -550,8 +555,13 @@ static ssize_t memcg_path_store(struct kobject *kobj,
return -ENOMEM;
strscpy(path, buf, count + 1);
+ if (!mutex_trylock(&damon_sysfs_lock)) {
+ kfree(path);
+ return -EBUSY;
+ }
kfree(filter->memcg_path);
filter->memcg_path = path;
+ mutex_unlock(&damon_sysfs_lock);
return count;
}
@@ -1187,8 +1197,13 @@ static ssize_t path_show(struct kobject *kobj,
{
struct damos_sysfs_quota_goal *goal = container_of(kobj,
struct damos_sysfs_quota_goal, kobj);
+ int len;
- return sysfs_emit(buf, "%s\n", goal->path ? goal->path : "");
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ len = sysfs_emit(buf, "%s\n", goal->path ? goal->path : "");
+ mutex_unlock(&damon_sysfs_lock);
+ return len;
}
static ssize_t path_store(struct kobject *kobj,
@@ -1203,8 +1218,13 @@ static ssize_t path_store(struct kobject *kobj,
return -ENOMEM;
strscpy(path, buf, count + 1);
+ if (!mutex_trylock(&damon_sysfs_lock)) {
+ kfree(path);
+ return -EBUSY;
+ }
kfree(goal->path);
goal->path = path;
+ mutex_unlock(&damon_sysfs_lock);
return count;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index c568d9058ff8..4e636647100c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -228,7 +228,8 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
page_cache_delete(mapping, folio, shadow);
}
-void filemap_free_folio(struct address_space *mapping, struct folio *folio)
+static void filemap_free_folio(const struct address_space *mapping,
+ struct folio *folio)
{
void (*free_folio)(struct folio *);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42c983821c03..970e077019b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1218,13 +1218,29 @@ retry:
static struct deferred_split *folio_split_queue_lock(struct folio *folio)
{
- return split_queue_lock(folio_nid(folio), folio_memcg(folio));
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+ queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
+ /*
+ * The memcg destruction path acquires the split queue lock when
+ * reparenting, so once the lock is held it is safe to drop the RCU
+ * read lock.
+ */
+ rcu_read_unlock();
+
+ return queue;
}
static struct deferred_split *
folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
{
- return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+ queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+ rcu_read_unlock();
+
+ return queue;
}
static inline void split_queue_unlock(struct deferred_split *queue)
@@ -3994,7 +4010,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1);
if (do_lru)
- unlock_page_lruvec(lruvec);
+ lruvec_unlock(lruvec);
if (ci)
swap_cluster_unlock(ci);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9413ed497be5..f24bf49be047 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4218,6 +4218,9 @@ static __init int hugetlb_add_param(char *s, int (*setup)(char *))
size_t len;
char *p;
+ if (!s)
+ return -EINVAL;
+
if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
return -EINVAL;
@@ -4784,6 +4787,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
return 0;
}
+#ifdef CONFIG_USERFAULTFD
+static bool hugetlb_can_userfault(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ return true;
+}
+
+static const struct vm_uffd_ops hugetlb_uffd_ops = {
+ .can_userfault = hugetlb_can_userfault,
+};
+#endif
+
/*
* When a new function is introduced to vm_operations_struct and added
* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
@@ -4797,6 +4812,9 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.close = hugetlb_vm_op_close,
.may_split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &hugetlb_uffd_ops,
+#endif
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index f83ae4998990..7693ccefd0c6 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -204,6 +204,7 @@ void __init hugetlb_cma_reserve(void)
*/
per_node = DIV_ROUND_UP(hugetlb_cma_size,
nodes_weight(hugetlb_bootmem_nodes));
+ per_node = round_up(per_node, PAGE_SIZE << order);
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
}
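
The added round_up() makes each node's share a whole multiple of the gigantic page size, so a node's CMA area is never sized to an odd fraction of a page. A rough worked example with hypothetical numbers (not taken from the patch): on x86-64 with 4 KiB base pages, a 1 GiB gigantic page has order 18, so for hugetlb_cma=10G spread over 3 bootmem nodes:

	/*
	 * per_node = DIV_ROUND_UP(10 GiB, 3)              ~= 3.34 GiB
	 * per_node = round_up(per_node, PAGE_SIZE << 18)   = 4 GiB
	 *
	 * i.e. each node reserves a multiple of the 1 GiB gigantic page size
	 * instead of a 3.34 GiB area that could only ever hold three pages.
	 */
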
diff --git a/mm/internal.h b/mm/internal.h
index c693646e5b3f..5a2ddcf68e0b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -557,7 +557,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
-void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
@@ -1322,7 +1321,17 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);
+static inline bool deferred_pages_enabled(void)
+{
+ return static_branch_unlikely(&deferred_pages);
+}
+
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+#else
+static inline bool deferred_pages_enabled(void)
+{
+ return false;
+}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void init_deferred_page(unsigned long pfn, int nid);
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 9eba46212edf..655dc5ce3240 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -736,10 +736,10 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
- memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+ memblock_free((void *)addr, KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
- memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+ memblock_free(kfence_metadata_init, KFENCE_METADATA_SIZE);
kfence_metadata_init = NULL;
return false;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fa8201e23222..2eff0d6b622b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -241,7 +241,7 @@ static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
-static bool kmemleak_verbose;
+static bool kmemleak_verbose = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_VERBOSE);
module_param_named(verbose, kmemleak_verbose, bool, 0600);
static void kmemleak_disable(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 2505ce8b319c..a6a1c91e276d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -17,6 +17,7 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
+#include <linux/string_helpers.h>
#ifdef CONFIG_KEXEC_HANDOVER
#include <linux/libfdt.h>
@@ -384,26 +385,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
*/
void __init memblock_discard(void)
{
- phys_addr_t addr, size;
+ phys_addr_t size;
+ void *addr;
if (memblock.reserved.regions != memblock_reserved_init_regions) {
- addr = __pa(memblock.reserved.regions);
+ addr = memblock.reserved.regions;
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
if (memblock_reserved_in_slab)
- kfree(memblock.reserved.regions);
+ kfree(addr);
else
- memblock_free_late(addr, size);
+ memblock_free(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
- addr = __pa(memblock.memory.regions);
+ addr = memblock.memory.regions;
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
if (memblock_memory_in_slab)
- kfree(memblock.memory.regions);
+ kfree(addr);
else
- memblock_free_late(addr, size);
+ memblock_free(addr, size);
}
memblock_memory = NULL;
@@ -893,13 +895,81 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.memory, base, size);
}
+static unsigned long __free_reserved_area(phys_addr_t start, phys_addr_t end,
+ int poison)
+{
+ unsigned long pages = 0, pfn;
+
+ if (deferred_pages_enabled()) {
+ WARN(1, "Cannot free reserved memory because of deferred initialization of the memory map");
+ return 0;
+ }
+
+ for_each_valid_pfn(pfn, PFN_UP(start), PFN_DOWN(end)) {
+ struct page *page = pfn_to_page(pfn);
+ void *direct_map_addr;
+
+ /*
+ * 'direct_map_addr' might be different from the kernel virtual
+ * address because some architectures use aliases.
+ * Going via physical address, pfn_to_page() and page_address()
+ * ensures that we get a _writeable_ alias for the memset().
+ */
+ direct_map_addr = page_address(page);
+ /*
+ * Perform a kasan-unchecked memset() since this memory
+ * has not been initialized.
+ */
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
+ if ((unsigned int)poison <= 0xFF)
+ memset(direct_map_addr, poison, PAGE_SIZE);
+
+ free_reserved_page(page);
+ pages++;
+ }
+ return pages;
+}
+
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+ phys_addr_t start_pa, end_pa;
+ unsigned long pages;
+
+ /*
+ * end is the first address past the region and it may be beyond what
+ * __pa() or __pa_symbol() can handle.
+ * Use the address included in the range for the conversion and add back
+ * 1 afterwards.
+ */
+ if (__is_kernel((unsigned long)start)) {
+ start_pa = __pa_symbol(start);
+ end_pa = __pa_symbol(end - 1) + 1;
+ } else {
+ start_pa = __pa(start);
+ end_pa = __pa(end - 1) + 1;
+ }
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ if (start_pa < end_pa)
+ memblock_remove_range(&memblock.reserved,
+ start_pa, end_pa - start_pa);
+ }
+
+ pages = __free_reserved_area(start_pa, end_pa, poison);
+ if (pages && s)
+ pr_info("Freeing %s memory: %ldK\n", s, K(pages));
+
+ return pages;
+}
+
/**
* memblock_free - free boot memory allocation
* @ptr: starting address of the boot memory allocation
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
+ * If called after the buddy allocator is available, the memory is released to
+ * the buddy allocator.
*/
void __init_memblock memblock_free(void *ptr, size_t size)
{
@@ -913,17 +983,24 @@ void __init_memblock memblock_free(void *ptr, size_t size)
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
+ * If called after the buddy allocator is available, the memory is released to
+ * the buddy allocator.
*/
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
+ int ret;
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
- return memblock_remove_range(&memblock.reserved, base, size);
+ ret = memblock_remove_range(&memblock.reserved, base, size);
+
+ if (slab_is_available())
+ __free_reserved_area(base, base + size, -1);
+
+ return ret;
}
int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
@@ -973,7 +1050,7 @@ __init void memmap_init_kho_scratch_pages(void)
/*
* Initialize struct pages for free scratch memory.
* The struct pages for reserved scratch memory will be set up in
- * reserve_bootmem_region()
+ * memmap_init_reserved_pages()
*/
__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
@@ -1766,32 +1843,6 @@ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
return addr;
}
-/**
- * memblock_free_late - free pages directly to buddy allocator
- * @base: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * This is only useful when the memblock allocator has already been torn
- * down, but we are still initializing the system. Pages are released directly
- * to the buddy allocator.
- */
-void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
-{
- phys_addr_t cursor, end;
-
- end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n",
- __func__, &base, &end, (void *)_RET_IP_);
- kmemleak_free_part_phys(base, size);
- cursor = PFN_UP(base);
- end = PFN_DOWN(base + size);
-
- for (; cursor < end; cursor++) {
- memblock_free_pages(cursor, 0);
- totalram_pages_inc();
- }
-}
-
/*
* Remaining API functions
*/
@@ -2255,6 +2306,31 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
+/*
+ * Initialised pages do not have PageReserved set. This function is called
+ * for each reserved range and marks the pages PageReserved.
+ * When deferred initialization of struct pages is enabled it also ensures
+ * that struct pages are properly initialised.
+ */
+static void __init memmap_init_reserved_range(phys_addr_t start,
+ phys_addr_t end, int nid)
+{
+ unsigned long pfn;
+
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
+
+ init_deferred_page(pfn, nid);
+
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
+ }
+}
+
static void __init memmap_init_reserved_pages(void)
{
struct memblock_region *region;
@@ -2274,7 +2350,7 @@ repeat:
end = start + region->size;
if (memblock_is_nomap(region))
- reserve_bootmem_region(start, end, nid);
+ memmap_init_reserved_range(start, end, nid);
memblock_set_node(start, region->size, &memblock.reserved, nid);
}
@@ -2299,7 +2375,7 @@ repeat:
if (!numa_valid_node(nid))
nid = early_pfn_to_nid(PFN_DOWN(start));
- reserve_bootmem_region(start, end, nid);
+ memmap_init_reserved_range(start, end, nid);
}
}
}
@@ -2449,7 +2525,7 @@ int reserve_mem_release_by_name(const char *name)
return 0;
start = phys_to_virt(map->start);
- end = start + map->size - 1;
+ end = start + map->size;
snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
free_reserved_area(start, end, 0, buf);
map->size = 0;
@@ -2525,7 +2601,7 @@ static int __init prepare_kho_fdt(void)
if (err)
goto err_unpreserve_fdt;
- err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);
+ err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt));
if (err)
goto err_unpreserve_fdt;
@@ -2570,7 +2646,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void)
if (fdt)
return fdt;
- err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL);
if (err) {
if (err != -ENOENT)
pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
@@ -2657,23 +2733,25 @@ static int __init reserve_mem(char *p)
int len;
if (!p)
- return -EINVAL;
+ goto err_param;
/* Check if there's room for more reserved memory */
- if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
+ if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) {
+ pr_err("reserve_mem: no more room for reserved memory\n");
return -EBUSY;
+ }
oldp = p;
size = memparse(p, &p);
if (!size || p == oldp)
- return -EINVAL;
+ goto err_param;
if (*p != ':')
- return -EINVAL;
+ goto err_param;
align = memparse(p+1, &p);
if (*p != ':')
- return -EINVAL;
+ goto err_param;
/*
* memblock_phys_alloc() doesn't like a zero size align,
@@ -2687,7 +2765,7 @@ static int __init reserve_mem(char *p)
/* name needs to have length but not too big */
if (!len || len >= RESERVE_MEM_NAME_SIZE)
- return -EINVAL;
+ goto err_param;
/* Make sure that name has text */
for (p = name; *p; p++) {
@@ -2695,11 +2773,13 @@ static int __init reserve_mem(char *p)
break;
}
if (!*p)
- return -EINVAL;
+ goto err_param;
/* Make sure the name is not already used */
- if (reserve_mem_find_by_name(name, &start, &tmp))
+ if (reserve_mem_find_by_name(name, &start, &tmp)) {
+ pr_err("reserve_mem: name \"%s\" was already used\n", name);
return -EBUSY;
+ }
/* Pick previous allocations up from KHO if available */
if (reserve_mem_kho_revive(name, size, align))
@@ -2707,16 +2787,22 @@ static int __init reserve_mem(char *p)
/* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
- if (!start)
+ if (!start) {
+ pr_err("reserve_mem: memblock allocation failed\n");
return -ENOMEM;
+ }
reserved_mem_add(start, size, name);
return 1;
+err_param:
+ pr_err("reserve_mem: empty or malformed parameter\n");
+ return -EINVAL;
}
__setup("reserve_mem=", reserve_mem);
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
static const char * const flagname[] = {
[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
[ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
@@ -2763,10 +2849,8 @@ static int memblock_debug_show(struct seq_file *m, void *private)
}
DEFINE_SHOW_ATTRIBUTE(memblock_debug);
-static int __init memblock_init_debugfs(void)
+static inline void memblock_debugfs_expose_arrays(struct dentry *root)
{
- struct dentry *root = debugfs_create_dir("memblock", NULL);
-
debugfs_create_file("memory", 0444, root,
&memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", 0444, root,
@@ -2775,7 +2859,48 @@ static int __init memblock_init_debugfs(void)
debugfs_create_file("physmem", 0444, root, &physmem,
&memblock_debug_fops);
#endif
+}
+
+#else
+
+static inline void memblock_debugfs_expose_arrays(struct dentry *root) { }
+
+#endif /* CONFIG_ARCH_KEEP_MEMBLOCK */
+
+static int memblock_reserve_mem_show(struct seq_file *m, void *private)
+{
+ struct reserve_mem_table *map;
+ char txtsz[16];
+
+ guard(mutex)(&reserve_mem_lock);
+ for (int i = 0; i < reserved_mem_count; i++) {
+ map = &reserved_mem_table[i];
+ if (!map->size)
+ continue;
+
+ memset(txtsz, 0, sizeof(txtsz));
+ string_get_size(map->size, 1, STRING_UNITS_2, txtsz, sizeof(txtsz));
+ seq_printf(m, "%s\t\t(%s)\n", map->name, txtsz);
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(memblock_reserve_mem);
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root;
+
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !reserved_mem_count)
+ return 0;
+
+ root = debugfs_create_dir("memblock", NULL);
+
+ if (reserved_mem_count)
+ debugfs_create_file("reserve_mem_param", 0444, root, NULL,
+ &memblock_reserve_mem_fops);
+ memblock_debugfs_expose_arrays(root);
return 0;
}
__initcall(memblock_init_debugfs);
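
Taken together with the removal of memblock_free_late() earlier in this file, these hunks make memblock_free() and memblock_phys_free() phase-aware: before the page allocator is up they only drop the range from memblock.reserved, and once slab_is_available() returns true the pages are also handed to the buddy allocator (the kfence and memblock_discard() conversions above are in-tree examples of callers that no longer need memblock_free_late()). A hedged usage sketch, assuming buf/size came from an earlier memblock_alloc():

	/* Sketch: releasing a boot-time allocation, early or late in boot. */
	static void release_boot_pool(void *buf, size_t size)
	{
		/*
		 * With this series a single call works in both phases: early it
		 * only unreserves the range, late it also frees the pages to
		 * the buddy allocator (what memblock_free_late() used to do).
		 */
		memblock_free(buf, size);
	}
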
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 437cd25784fe..433bba9dfe71 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_entries;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -624,12 +625,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
if (!do_memsw_account())
return;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
@@ -644,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
@@ -665,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
- css_put(&memcg->css);
+ rcu_read_unlock();
+ obj_cgroup_put(objcg);
}
/*
@@ -1884,6 +1887,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+ reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ reparent_memcg_lruvec_state_local(memcg, parent, i);
+}
+
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 1b969294ea6a..f92f81108d5e 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 051b82ebf371..c03d4787d466 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -206,26 +206,100 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
}
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent,
+ int nid)
{
struct obj_cgroup *objcg, *iter;
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
- objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
-
- spin_lock_irq(&objcg_lock);
-
+ objcg = rcu_replace_pointer(pn->objcg, NULL, true);
/* 1) Ready to reparent active objcg. */
- list_add(&objcg->list, &memcg->objcg_list);
+ list_add(&objcg->list, &pn->objcg_list);
/* 2) Reparent active objcg and already reparented objcgs to parent. */
- list_for_each_entry(iter, &memcg->objcg_list, list)
+ list_for_each_entry(iter, &pn->objcg_list, list)
WRITE_ONCE(iter->memcg, parent);
/* 3) Move already reparented objcgs to the parent's list */
- list_splice(&memcg->objcg_list, &parent->objcg_list);
+ list_splice(&pn->objcg_list, &parent_pn->objcg_list);
+
+ return objcg;
+}
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ /*
+ * Reparent stats exposed non-hierarchically. Flush @memcg's stats first
+ * to read its stats accurately , and conservatively flush @parent's
+ * stats after reparenting to avoid hiding a potentially large stat
+ * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+ */
+ __mem_cgroup_flush_stats(memcg, true);
+
+ /* The following counts are all non-hierarchical and need to be reparented. */
+ reparent_memcg1_state_local(memcg, parent);
+ reparent_memcg1_lruvec_state_local(memcg, parent);
+
+ __mem_cgroup_flush_stats(parent, true);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
+static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ spin_lock_irq(&objcg_lock);
+ spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1);
+ spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2);
+}
+
+static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock);
+ spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock);
spin_unlock_irq(&objcg_lock);
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ int nid;
+
+ for_each_node(nid) {
+retry:
+ if (lru_gen_enabled())
+ max_lru_gen_memcg(parent, nid);
+
+ reparent_locks(memcg, parent, nid);
+
+ if (lru_gen_enabled()) {
+ if (!recheck_lru_gen_max_memcg(parent, nid)) {
+ reparent_unlocks(memcg, parent, nid);
+ cond_resched();
+ goto retry;
+ }
+ lru_gen_reparent_memcg(memcg, parent, nid);
+ } else {
+ lru_reparent_memcg(memcg, parent, nid);
+ }
- percpu_ref_kill(&objcg->refcnt);
+ objcg = __memcg_reparent_objcgs(memcg, parent, nid);
+
+ reparent_unlocks(memcg, parent, nid);
+
+ percpu_ref_kill(&objcg->refcnt);
+ }
+
+ reparent_state_local(memcg, parent);
}
/*
@@ -241,7 +315,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
/**
- * mem_cgroup_css_from_folio - css of the memcg associated with a folio
+ * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio
* @folio: folio of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
@@ -251,14 +325,16 @@ EXPORT_SYMBOL(memcg_bpf_enabled_key);
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
+struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
- if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
- memcg = root_mem_cgroup;
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return &root_mem_cgroup->css;
- return &memcg->css;
+ memcg = get_mem_cgroup_from_folio(folio);
+
+ return memcg ? &memcg->css : &root_mem_cgroup->css;
}
/**
@@ -449,6 +525,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+ enum node_stat_item idx, long val);
+
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+ struct mem_cgroup_per_node *child_pn, *parent_pn;
+
+ child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
+ parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+ __mod_memcg_lruvec_state(child_pn, idx, -value);
+ __mod_memcg_lruvec_state(parent_pn, idx, value);
+ }
+}
+#endif
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -508,7 +608,7 @@ static inline int memcg_events_index(enum vm_event_item idx)
struct memcg_vmstats_percpu {
/* Stats updates since the last flush */
- unsigned int stats_updates;
+ unsigned long stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
struct memcg_vmstats_percpu __percpu *parent_pcpu;
@@ -539,7 +639,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic_t stats_updates;
+ atomic_long_t stats_updates;
};
/*
@@ -565,16 +665,16 @@ static u64 flush_last_time;
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic_read(&vmstats->stats_updates) >
+ return atomic_long_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, long val,
int cpu)
{
struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- unsigned int stats_updates;
+ unsigned long stats_updates;
if (!val)
return;
@@ -597,7 +697,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
continue;
stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
- atomic_add(stats_updates, &statc->vmstats->stats_updates);
+ atomic_long_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -605,7 +705,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_long_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -684,31 +784,70 @@ static int memcg_page_state_unit(int item);
* Normalize the value passed into memcg_rstat_updated() to be in pages. Round
* up non-zero sub-page updates to 1 page as zero page updates are ignored.
*/
-static int memcg_state_val_in_pages(int idx, int val)
+static long memcg_state_val_in_pages(int idx, long val)
{
int unit = memcg_page_state_unit(idx);
+ long res;
if (!val || unit == PAGE_SIZE)
return val;
- else
- return max(val * unit / PAGE_SIZE, 1UL);
+
+ /* Get the absolute value of (val * unit / PAGE_SIZE). */
+ res = mult_frac(abs(val), unit, PAGE_SIZE);
+ /* Round up zero values. */
+ res = res ? : 1;
+
+ return val < 0 ? -res : res;
}
-/**
- * mod_memcg_state - update cgroup memory statistics
- * @memcg: the memory cgroup
- * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
- * @val: delta to add to the counter, can be negative
+#ifdef CONFIG_MEMCG_V1
+/*
+ * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid race with
+ * reparenting of non-hierarchical state_locals.
*/
-void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
- int val)
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg,
+ bool *rcu_locked)
{
- int i = memcg_stats_index(idx);
- int cpu;
+ /* Rebinding can cause this value to be changed at runtime */
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ *rcu_locked = false;
+ return memcg;
+ }
- if (mem_cgroup_disabled())
+ rcu_read_lock();
+ *rcu_locked = true;
+
+ while (memcg_is_dying(memcg))
+ memcg = parent_mem_cgroup(memcg);
+
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(bool rcu_locked)
+{
+ if (!rcu_locked)
return;
+ rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg,
+ bool *rcu_locked)
+{
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(bool rcu_locked)
+{
+}
+#endif
+
+static void __mod_memcg_state(struct mem_cgroup *memcg,
+ enum memcg_stat_item idx, long val)
+{
+ int i = memcg_stats_index(idx);
+ int cpu;
+
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -717,11 +856,31 @@ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
memcg_rstat_updated(memcg, val, cpu);
+
trace_mod_memcg_state(memcg, idx, val);
put_cpu();
}
+/**
+ * mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+ int val)
+{
+ bool rcu_locked = false;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = get_non_dying_memcg_start(memcg, &rcu_locked);
+ __mod_memcg_state(memcg, idx, val);
+ get_non_dying_memcg_end(rcu_locked);
+}
+
#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
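
get_non_dying_memcg_start() above exists because cgroup v1 exposes some counters non-hierarchically: when a memcg is offlined, reparent_state_local() moves those local counts to the parent, and as the comment in the hunk notes, concurrent updaters must avoid racing with that move, presumably so a delta charged afterwards is not left behind on the dying cgroup. A stripped-down sketch of the redirect, using only helpers visible in these hunks:

	/* Stripped-down shape of the v1-only redirect (simplified). */
	rcu_read_lock();
	while (memcg_is_dying(memcg))
		memcg = parent_mem_cgroup(memcg);	/* nearest live ancestor */
	__mod_memcg_state(memcg, idx, val);		/* charge the delta there */
	rcu_read_unlock();
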
@@ -739,23 +898,27 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ unsigned long value = memcg_page_state_local(memcg, idx);
+
+ __mod_memcg_state(memcg, idx, -value);
+ __mod_memcg_state(parent, idx, value);
+}
#endif
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx,
- int val)
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+ enum node_stat_item idx, long val)
{
- struct mem_cgroup_per_node *pn;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = pn->memcg;
int i = memcg_stats_index(idx);
int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- memcg = pn->memcg;
-
cpu = get_cpu();
/* Update memcg */
@@ -771,6 +934,24 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
put_cpu();
}
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx,
+ int val)
+{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ bool rcu_locked = false;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = get_non_dying_memcg_start(pn->memcg, &rcu_locked);
+ pn = memcg->nodeinfo[pgdat->node_id];
+
+ __mod_memcg_lruvec_state(pn, idx, val);
+
+ get_non_dying_memcg_end(rcu_locked);
+}
+
/**
* mod_lruvec_state - update lruvec memory statistics
* @lruvec: the lruvec
@@ -991,17 +1172,23 @@ again:
/**
* get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
* @folio: folio from which memcg should be extracted.
+ *
+ * See folio_memcg() for folio->objcg/memcg binding rules.
*/
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
if (mem_cgroup_disabled())
return NULL;
+ if (!folio_memcg_charged(folio))
+ return root_mem_cgroup;
+
rcu_read_lock();
- if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
+ do {
+ memcg = folio_memcg(folio);
+ } while (unlikely(!css_tryget(&memcg->css)));
rcu_read_unlock();
return memcg;
}
@@ -1198,23 +1385,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
}
}
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return;
-
- memcg = folio_memcg(folio);
-
- if (!memcg)
- VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
- else
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
-}
-#endif
-
/**
* folio_lruvec_lock - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
@@ -1224,14 +1394,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
* - folio_test_lru false
* - folio frozen (refcount of 0)
*
- * Return: The lruvec this folio is on with its lock held.
+ * Return: The lruvec this folio is on, with its lock and the rcu read lock held.
*/
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock(&lruvec->lru_lock);
+ goto retry;
+ }
return lruvec;
}
@@ -1246,14 +1422,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
- * disabled.
+ * disabled, and the rcu read lock held.
*/
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irq(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ goto retry;
+ }
return lruvec;
}
@@ -1269,15 +1451,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
- * disabled.
+ * disabled, and the rcu read lock held.
*/
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
return lruvec;
}
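A minimal caller-side sketch of how these lock helpers are meant to be paired (illustrative only, not part of this patch; the matching lruvec_unlock_*() helpers this series switches callers to are assumed to also drop the RCU read lock taken by the lock side):

static void example_move_to_lru(struct folio *folio)
{
	struct lruvec *lruvec;

	lruvec = folio_lruvec_lock_irq(folio);	/* takes the RCU read lock */
	/* the folio->lruvec binding is stable while lru_lock is held */
	lruvec_add_folio(lruvec, folio);
	lruvec_unlock_irq(lruvec);		/* assumed to drop the RCU read lock */
}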
@@ -1293,7 +1481,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
* to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zid, int nr_pages)
+ int zid, long nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
@@ -1310,7 +1498,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
size = *lru_size;
if (WARN_ONCE(size < 0,
- "%s(%p, %d, %d): lru_size %ld\n",
+ "%s(%p, %d, %ld): lru_size %ld\n",
__func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
@@ -2581,17 +2769,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
-static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
{
VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
/*
- * Any of the following ensures page's memcg stability:
+ * Any of the following ensures folio's objcg stability:
*
* - the page lock
* - LRU isolation
* - exclusive reference
*/
- folio->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)objcg;
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
@@ -2693,14 +2881,26 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg = NULL;
+ int nid = numa_node_id();
+
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
- for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
if (likely(objcg && obj_cgroup_tryget(objcg)))
- break;
- objcg = NULL;
+ return objcg;
}
+
+ return NULL;
+}
+
+static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+
+ rcu_read_lock();
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ rcu_read_unlock();
+
return objcg;
}
@@ -2759,6 +2959,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
{
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ int nid = numa_node_id();
if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
return NULL;
@@ -2775,53 +2976,39 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
* Objcg reference is kept by the task, so it's safe
* to use the objcg by the current task.
*/
- return objcg;
+ return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
memcg = this_cpu_read(int_active_memcg);
if (unlikely(memcg))
goto from_memcg;
- return NULL;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
from_memcg:
- objcg = NULL;
- for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
/*
* Memcg pointer is protected by scope (see set_active_memcg())
* and is pinning the corresponding objcg, so objcg can't go
* away and can be used within the scope without any additional
* protection.
*/
- objcg = rcu_dereference_check(memcg->objcg, 1);
+ objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
if (likely(objcg))
- break;
+ return objcg;
}
- return objcg;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_online())
- return NULL;
-
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
+ objcg = folio_objcg(folio);
+ if (objcg)
obj_cgroup_get(objcg);
- } else {
- struct mem_cgroup *memcg;
- rcu_read_lock();
- memcg = __folio_memcg(folio);
- if (memcg)
- objcg = __get_obj_cgroup_from_memcg(memcg);
- else
- objcg = NULL;
- rcu_read_unlock();
- }
return objcg;
}
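For reference, a hedged sketch of the folio -> objcg -> memcg indirection this hunk relies on, mirroring the css_tryget() retry used by get_mem_cgroup_from_folio() above (helper names as used elsewhere in this patch; the result may be a live ancestor if the original memcg has been reparented):

static struct mem_cgroup *example_folio_memcg_get(struct folio *folio)
{
	struct obj_cgroup *objcg;
	struct mem_cgroup *memcg;

	rcu_read_lock();
	objcg = folio_objcg(folio);	/* folio->memcg_data now stores an objcg */
	do {
		memcg = objcg ? obj_cgroup_memcg(objcg) : root_mem_cgroup;
	} while (!css_tryget(&memcg->css));	/* pin it before leaving RCU */
	rcu_read_unlock();

	return memcg;
}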
@@ -2922,7 +3109,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
int ret = 0;
objcg = current_obj_cgroup();
- if (objcg) {
+ if (objcg && !obj_cgroup_is_root(objcg)) {
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
@@ -3251,7 +3438,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
* obj_cgroup_get() is used to get a permanent reference.
*/
objcg = current_obj_cgroup();
- if (!objcg)
+ if (!objcg || obj_cgroup_is_root(objcg))
return true;
/*
@@ -3383,33 +3570,20 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
return;
new_refs = (1 << (old_order - new_order)) - 1;
- css_get_many(&__folio_memcg(folio)->css, new_refs);
+ obj_cgroup_get_many(folio_objcg(folio), new_refs);
}
-static int memcg_online_kmem(struct mem_cgroup *memcg)
+static void memcg_online_kmem(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg;
-
if (mem_cgroup_kmem_disabled())
- return 0;
+ return;
if (unlikely(mem_cgroup_is_root(memcg)))
- return 0;
-
- objcg = obj_cgroup_alloc();
- if (!objcg)
- return -ENOMEM;
-
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
- obj_cgroup_get(objcg);
- memcg->orig_objcg = objcg;
+ return;
static_branch_enable(&memcg_kmem_online_key);
memcg->kmemcg_id = memcg->id.id;
-
- return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
@@ -3423,16 +3597,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
return;
parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
memcg_reparent_list_lrus(memcg, parent);
-
- /*
- * Objcg's reparenting must be after list_lru's, make sure list_lru
- * helpers won't use parent's list_lru until child is drained.
- */
- memcg_reparent_objcgs(memcg, parent);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -3705,8 +3870,6 @@ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, un
break;
}
memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
}
return memcg;
}
@@ -3771,6 +3934,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn->lruvec_stats_percpu)
goto fail;
+ INIT_LIST_HEAD(&pn->objcg_list);
+
lruvec_init(&pn->lruvec);
pn->memcg = memcg;
@@ -3785,10 +3950,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- obj_cgroup_put(memcg->orig_objcg);
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ if (!pn)
+ continue;
- for_each_node(node)
- free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
+ obj_cgroup_put(pn->orig_objcg);
+ free_mem_cgroup_per_node_info(pn);
+ }
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3859,7 +4028,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -3935,9 +4103,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct obj_cgroup *objcg;
+ int nid;
- if (memcg_online_kmem(memcg))
- goto remove_id;
+ memcg_online_kmem(memcg);
/*
* A memcg must be visible for expand_shrinker_info()
@@ -3947,6 +4116,20 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
+ for_each_node(nid) {
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ goto free_objcg;
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ objcg->is_root = true;
+
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
+ obj_cgroup_get(objcg);
+ memcg->nodeinfo[nid]->orig_objcg = objcg;
+ }
+
if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
FLUSH_TIME);
@@ -3969,9 +4152,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
+free_objcg:
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+ objcg = rcu_replace_pointer(pn->objcg, NULL, true);
+ if (objcg)
+ percpu_ref_kill(&objcg->refcnt);
+
+ if (pn->orig_objcg) {
+ obj_cgroup_put(pn->orig_objcg);
+ /*
+ * Reset pn->orig_objcg to NULL to prevent
+ * obj_cgroup_put() from being called again in
+ * __mem_cgroup_free().
+ */
+ pn->orig_objcg = NULL;
+ }
+ }
+ free_shrinker_info(memcg);
offline_kmem:
memcg_offline_kmem(memcg);
-remove_id:
mem_cgroup_private_id_remove(memcg);
return -ENOMEM;
}
@@ -3989,6 +4190,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_deferred_split_queue(memcg);
+	/*
+	 * Objcg reparenting must happen after the list_lru and
+	 * deferred_split_queue reparenting above, so that their users
+	 * cannot mistakenly pick up the parent's list_lru or
+	 * deferred_split_queue.
+	 */
+ memcg_reparent_objcgs(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);
@@ -4221,8 +4428,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic_read(&memcg->vmstats->stats_updates))
- atomic_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_long_read(&memcg->vmstats->stats_updates))
+ atomic_long_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4799,16 +5006,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
- int ret;
-
- ret = try_charge(memcg, gfp, folio_nr_pages(folio));
- if (ret)
- goto out;
+ int ret = 0;
+ struct obj_cgroup *objcg;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ /* Do not account at the root objcg level. */
+ if (!obj_cgroup_is_root(objcg))
+ ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio));
+ if (ret) {
+ obj_cgroup_put(objcg);
+ return ret;
+ }
+ commit_charge(folio, objcg);
memcg1_commit_charge(folio, memcg);
-out:
+
return ret;
}
@@ -4894,7 +5105,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
}
struct uncharge_gather {
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
@@ -4908,58 +5119,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(ug->objcg);
if (ug->nr_memory) {
- memcg_uncharge(ug->memcg, ug->nr_memory);
+ memcg_uncharge(memcg, ug->nr_memory);
if (ug->nr_kmem) {
- mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
- memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
+ mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
+ memcg1_account_kmem(memcg, -ug->nr_kmem);
}
- memcg1_oom_recover(ug->memcg);
+ memcg1_oom_recover(memcg);
}
- memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ rcu_read_unlock();
/* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
+ obj_cgroup_put(ug->objcg);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
long nr_pages;
- struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * folio memcg or objcg at this point, we have fully
- * exclusive access to the folio.
+	 * the folio's objcg at this point; we have fully exclusive
+	 * access to the folio.
*/
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
- /*
- * This get matches the put at the end of the function and
- * kmem pages do not hold memcg references anymore.
- */
- memcg = get_mem_cgroup_from_objcg(objcg);
- } else {
- memcg = __folio_memcg(folio);
- }
-
- if (!memcg)
+ objcg = folio_objcg(folio);
+ if (!objcg)
return;
- if (ug->memcg != memcg) {
- if (ug->memcg) {
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = memcg;
+ ug->objcg = objcg;
ug->nid = folio_nid(folio);
- /* pairs with css_put in uncharge_batch */
- css_get(&memcg->css);
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(objcg);
}
nr_pages = folio_nr_pages(folio);
@@ -4967,20 +5172,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
-
- folio->memcg_data = 0;
- obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
ug->nr_memory += nr_pages;
ug->pgpgout++;
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
- folio->memcg_data = 0;
}
- css_put(&memcg->css);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
}
void __mem_cgroup_uncharge(struct folio *folio)
@@ -5004,7 +5206,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
uncharge_gather_clear(&ug);
for (i = 0; i < folios->nr; i++)
uncharge_folio(folios->folios[i], &ug);
- if (ug.memcg)
+ if (ug.objcg)
uncharge_batch(&ug);
}
@@ -5021,6 +5223,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(new);
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
@@ -5035,21 +5238,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
if (folio_memcg_charged(new))
return;
- memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
- if (!memcg)
+ objcg = folio_objcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, old);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/* Force-charge the new page. The old one will be freed soon */
- if (!mem_cgroup_is_root(memcg)) {
+ if (!obj_cgroup_is_root(objcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
}
- css_get(&memcg->css);
- commit_charge(new, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(new, objcg);
memcg1_commit_charge(new, memcg);
+ rcu_read_unlock();
}
/**
@@ -5065,7 +5271,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
*/
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
@@ -5076,18 +5282,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
if (mem_cgroup_disabled())
return;
- memcg = folio_memcg(old);
+ objcg = folio_objcg(old);
/*
- * Note that it is normal to see !memcg for a hugetlb folio.
+ * Note that it is normal to see !objcg for a hugetlb folio.
* For e.g, it could have been allocated when memory_hugetlb_accounting
* was not selected.
*/
- VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
- if (!memcg)
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old);
+ if (!objcg)
return;
- /* Transfer the charge and the css ref */
- commit_charge(new, memcg);
+ /* Transfer the charge and the objcg ref */
+ commit_charge(new, objcg);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(old));
@@ -5270,22 +5476,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
if (do_memsw_account())
return 0;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return 0;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ rcu_read_unlock();
return 0;
}
memcg = mem_cgroup_private_id_get_online(memcg, nr_pages);
+	/* memcg is pinned by memcg ID. */
+ rcu_read_unlock();
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
@@ -5343,27 +5554,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
bool mem_cgroup_swap_full(struct folio *folio)
{
struct mem_cgroup *memcg;
+ bool ret = false;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (vm_swap_full())
return true;
- if (do_memsw_account())
- return false;
+ if (do_memsw_account() || !folio_memcg_charged(folio))
+ return ret;
+ rcu_read_lock();
memcg = folio_memcg(folio);
- if (!memcg)
- return false;
-
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
- usage * 2 >= READ_ONCE(memcg->swap.max))
- return true;
+ usage * 2 >= READ_ONCE(memcg->swap.max)) {
+ ret = true;
+ break;
+ }
}
+ rcu_read_unlock();
- return false;
+ return ret;
}
static int __init setup_swap_account(char *s)
@@ -5559,6 +5772,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
+ if (obj_cgroup_is_root(objcg))
+ return;
+
VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
/* PF_MEMALLOC context, charging must succeed */
@@ -5588,6 +5804,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
+ if (obj_cgroup_is_root(objcg))
+ return;
+
obj_cgroup_uncharge(objcg, size);
rcu_read_lock();
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
index bc7f4f045edf..59de210bee5f 100644
--- a/mm/memfd_luo.c
+++ b/mm/memfd_luo.c
@@ -50,6 +50,11 @@
* memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
* is maintained.
*
+ * Seals
+ * File seals set on the memfd are preserved and re-applied on restore.
+ * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may
+ * be present; preservation fails with ``-EOPNOTSUPP`` otherwise.
+ *
* Non-Preserved Properties
* ========================
*
@@ -61,10 +66,6 @@
* A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
* ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
* again after restore via ``fcntl()``.
- *
- * Seals
- * File seals are not preserved. The file is unsealed on restore and if
- * needed, must be sealed again via ``fcntl()``.
*/
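A userspace-side sketch of the resulting contract described above (plain memfd/fcntl calls from <fcntl.h> and <err.h>, not an interface added by this patch): seals reappear on their own after restore, while FD_CLOEXEC must be reapplied.

	int seals = fcntl(fd, F_GET_SEALS);	/* preserved seals are visible again */

	if (seals == -1)
		err(1, "F_GET_SEALS");
	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)	/* CLOEXEC was not preserved */
		err(1, "F_SETFD");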
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -105,7 +106,6 @@ static int memfd_luo_preserve_folios(struct file *file,
if (!size) {
*nr_foliosp = 0;
*out_folios_ser = NULL;
- memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
return 0;
}
@@ -260,7 +260,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
struct inode *inode = file_inode(args->file);
struct memfd_luo_folio_ser *folios_ser;
struct memfd_luo_ser *ser;
- u64 nr_folios;
+ u64 nr_folios, inode_size;
int err = 0, seals;
inode_lock(inode);
@@ -286,7 +286,18 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
}
ser->pos = args->file->f_pos;
- ser->size = i_size_read(inode);
+ inode_size = i_size_read(inode);
+
+ /*
+ * memfd_pin_folios() caps at UINT_MAX folios; refuse larger
+ * files to avoid silently preserving only a prefix.
+ */
+ if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) {
+ err = -EFBIG;
+ goto err_free_ser;
+ }
+
+ ser->size = inode_size;
ser->seals = seals;
err = memfd_luo_preserve_folios(args->file, &ser->folios,
@@ -410,6 +421,7 @@ static int memfd_luo_retrieve_folios(struct file *file,
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
struct folio *folio;
+ long npages, nr_added_pages = 0;
int err = -EIO;
long i;
@@ -427,6 +439,7 @@ static int memfd_luo_retrieve_folios(struct file *file,
if (!folio) {
pr_err("Unable to restore folio at physical address: %llx\n",
phys);
+ err = -EIO;
goto put_folios;
}
index = pfolio->index;
@@ -456,21 +469,26 @@ static int memfd_luo_retrieve_folios(struct file *file,
if (flags & MEMFD_LUO_FOLIO_DIRTY)
folio_mark_dirty(folio);
- err = shmem_inode_acct_blocks(inode, 1);
+ npages = folio_nr_pages(folio);
+ err = shmem_inode_acct_blocks(inode, npages);
if (err) {
- pr_err("shmem: failed to account folio index %ld: %d\n",
- i, err);
- goto unlock_folio;
+			pr_err("shmem: failed to account folio index %ld (%ld pages): %d\n",
+ i, npages, err);
+ goto remove_from_cache;
}
- shmem_recalc_inode(inode, 1, 0);
+ nr_added_pages += npages;
folio_add_lru(folio);
folio_unlock(folio);
folio_put(folio);
}
+ shmem_recalc_inode(inode, nr_added_pages, 0);
+
return 0;
+remove_from_cache:
+ filemap_remove_folio(folio);
unlock_folio:
folio_unlock(folio);
folio_put(folio);
@@ -481,12 +499,19 @@ put_folios:
*/
for (long j = i + 1; j < nr_folios; j++) {
const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+ phys_addr_t phys;
+
+ if (!pfolio->pfn)
+ continue;
- folio = kho_restore_folio(pfolio->pfn);
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
if (folio)
folio_put(folio);
}
+ shmem_recalc_inode(inode, nr_added_pages, 0);
+
return err;
}
@@ -525,7 +550,7 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
}
vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
- file->f_inode->i_size = ser->size;
+ i_size_write(file_inode(file), ser->size);
if (ser->nr_folios) {
folios_ser = kho_restore_vmalloc(&ser->folios);
@@ -560,6 +585,11 @@ static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
return shmem_file(file) && !inode->i_nlink;
}
+static unsigned long memfd_luo_get_id(struct file *file)
+{
+ return (unsigned long)file_inode(file);
+}
+
static const struct liveupdate_file_ops memfd_luo_file_ops = {
.freeze = memfd_luo_freeze,
.finish = memfd_luo_finish,
@@ -567,6 +597,7 @@ static const struct liveupdate_file_ops memfd_luo_file_ops = {
.preserve = memfd_luo_preserve,
.unpreserve = memfd_luo_unpreserve,
.can_preserve = memfd_luo_can_preserve,
+ .get_id = memfd_luo_get_id,
.owner = THIS_MODULE,
};
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2e136b738889..4e4421b22b59 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3706,18 +3706,19 @@ static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
new_wi_state->iw_table[i] = 1;
mutex_lock(&wi_state_lock);
- if (!input) {
- old_wi_state = rcu_dereference_protected(wi_state,
- lockdep_is_held(&wi_state_lock));
- if (!old_wi_state)
- goto update_wi_state;
- if (input == old_wi_state->mode_auto) {
- mutex_unlock(&wi_state_lock);
- return count;
- }
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
- memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
- nr_node_ids * sizeof(u8));
+ if (old_wi_state && input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return count;
+ }
+
+ if (!input) {
+ if (old_wi_state)
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
goto update_wi_state;
}
@@ -3787,9 +3788,11 @@ static void wi_state_free(void)
}
}
-static struct kobj_attribute wi_auto_attr =
- __ATTR(auto, 0664, weighted_interleave_auto_show,
- weighted_interleave_auto_store);
+static struct kobj_attribute wi_auto_attr = {
+ .attr = { .name = "auto", .mode = 0664 },
+ .show = weighted_interleave_auto_show,
+ .store = weighted_interleave_auto_store,
+};
static void wi_cleanup(void) {
sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
diff --git a/mm/memremap.c b/mm/memremap.c
index ac7be07e3361..053842d45cb1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -454,7 +454,7 @@ void free_zone_device_folio(struct folio *folio)
if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
break;
pgmap->ops->folio_free(folio);
- percpu_ref_put_many(&folio->pgmap->ref, nr);
+ percpu_ref_put_many(&pgmap->ref, nr);
break;
case MEMORY_DEVICE_GENERIC:
diff --git a/mm/migrate.c b/mm/migrate.c
index 76142a02192b..8a64291ab5b4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -672,6 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
+ rcu_read_lock();
memcg = folio_memcg(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
@@ -699,6 +700,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
+ rcu_read_unlock();
}
local_irq_enable();
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 2912eba575d5..fbfe5715f635 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -175,12 +175,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
return migrate_vma_collect_skip(start, end, walk);
}
- if (softleaf_is_migration(entry)) {
- softleaf_entry_wait_on_locked(entry, ptl);
- spin_unlock(ptl);
- return -EAGAIN;
- }
-
if (softleaf_is_device_private_write(entry))
write = MIGRATE_PFN_WRITE;
} else {
diff --git a/mm/mlock.c b/mm/mlock.c
index fdbd1434a35f..8c227fefa2df 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
}
if (lruvec)
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
folios_put(fbatch);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 79f93f2a90cf..f9f8e1af921c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -783,31 +783,6 @@ void __meminit init_deferred_page(unsigned long pfn, int nid)
__init_deferred_page(pfn, nid);
}
-/*
- * Initialised pages do not have PageReserved set. This function is
- * called for each range allocated by the bootmem allocator and
- * marks the pages PageReserved. The remaining valid pages are later
- * sent to the buddy page allocator.
- */
-void __meminit reserve_bootmem_region(phys_addr_t start,
- phys_addr_t end, int nid)
-{
- unsigned long pfn;
-
- for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
- struct page *page = pfn_to_page(pfn);
-
- __init_deferred_page(pfn, nid);
-
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
-}
-
/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 110d47a36d4b..9cbf932b028c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
}
/* Set nr_ptes number of ptes, starting from idx */
-static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
- int idx, bool set_write, struct mmu_gather *tlb)
+static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
+ int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
{
/*
* Advance the position in the batch by idx; note that if idx > 0,
@@ -143,7 +143,7 @@ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long add
* !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
* that the ptes point to consecutive pages of the same anon large folio.
*/
-static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len,
struct page *first_page, bool expected_anon_exclusive)
{
int idx;
@@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
* pte of the batch. Therefore, we must individually check all pages and
* retrieve sub-batches.
*/
-static void commit_anon_folio_batch(struct vm_area_struct *vma,
+static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
@@ -188,7 +188,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma,
}
}
-static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
@@ -211,6 +211,111 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
}
+static long change_softleaf_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags)
+{
+ const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ softleaf_t entry = softleaf_from_pte(oldpte);
+ pte_t newpte;
+
+ if (softleaf_is_migration_write(entry)) {
+ const struct folio *folio = softleaf_to_folio(entry);
+
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ if (folio_test_anon(folio))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ } else if (softleaf_is_device_private_write(entry)) {
+ /*
+ * We do not preserve soft-dirtiness. See
+ * copy_nonpresent_pte() for explanation.
+ */
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (softleaf_is_marker(entry)) {
+ /*
+ * Ignore error swap entries unconditionally,
+ * because any access should sigbus/sigsegv
+ * anyway.
+ */
+ if (softleaf_is_poison_marker(entry) ||
+ softleaf_is_guard_marker(entry))
+ return 0;
+ /*
+ * If this is uffd-wp pte marker and we'd like
+ * to unprotect it, drop it; the next page
+ * fault will trigger without uffd trapping.
+ */
+ if (uffd_wp_resolve) {
+ pte_clear(vma->vm_mm, addr, pte);
+ return 1;
+ }
+ return 0;
+ } else {
+ newpte = oldpte;
+ }
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+
+ if (!pte_same(oldpte, newpte)) {
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
+ return 1;
+ }
+ return 0;
+}
+
+static __always_inline void change_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ int nr_ptes, unsigned long end, pgprot_t newprot,
+ struct folio *folio, struct page *page, unsigned long cp_flags)
+{
+ const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ pte_t ptent, oldpte;
+
+ oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
+ ptent = pte_modify(oldpte, newprot);
+
+ if (uffd_wp)
+ ptent = pte_mkuffd_wp(ptent);
+ else if (uffd_wp_resolve)
+ ptent = pte_clear_uffd_wp(ptent);
+
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio, page,
+ addr, ptep, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+}
+
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -221,7 +326,6 @@ static long change_pte_range(struct mmu_gather *tlb,
bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
- bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
@@ -242,7 +346,6 @@ static long change_pte_range(struct mmu_gather *tlb,
int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
struct folio *folio = NULL;
struct page *page;
- pte_t ptent;
/* Already in the desired state. */
if (prot_numa && pte_protnone(oldpte))
@@ -268,34 +371,20 @@ static long change_pte_range(struct mmu_gather *tlb,
nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
- oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
- ptent = pte_modify(oldpte, newprot);
-
- if (uffd_wp)
- ptent = pte_mkuffd_wp(ptent);
- else if (uffd_wp_resolve)
- ptent = pte_clear_uffd_wp(ptent);
-
/*
- * In some writable, shared mappings, we might want
- * to catch actual write access -- see
- * vma_wants_writenotify().
- *
- * In all writable, private mappings, we have to
- * properly handle COW.
- *
- * In both cases, we can sometimes still change PTEs
- * writable and avoid the write-fault handler, for
- * example, if a PTE is already dirty and no other
- * COW or special handling is required.
+ * Optimize for the small-folio common case by
+ * special-casing it here. Compiler constant propagation
+ * plus copious amounts of __always_inline does wonders.
*/
- if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent))
- set_write_prot_commit_flush_ptes(vma, folio, page,
- addr, pte, oldpte, ptent, nr_ptes, tlb);
- else
- prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
- nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ if (likely(nr_ptes == 1)) {
+ change_present_ptes(tlb, vma, addr, pte, 1,
+ end, newprot, folio, page, cp_flags);
+ } else {
+ change_present_ptes(tlb, vma, addr, pte,
+ nr_ptes, end, newprot, folio, page,
+ cp_flags);
+ }
+
pages += nr_ptes;
} else if (pte_none(oldpte)) {
/*
@@ -317,66 +406,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pages++;
}
} else {
- softleaf_t entry = softleaf_from_pte(oldpte);
- pte_t newpte;
-
- if (softleaf_is_migration_write(entry)) {
- const struct folio *folio = softleaf_to_folio(entry);
-
- /*
- * A protection check is difficult so
- * just be safe and disable write
- */
- if (folio_test_anon(folio))
- entry = make_readable_exclusive_migration_entry(
- swp_offset(entry));
- else
- entry = make_readable_migration_entry(swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- } else if (softleaf_is_device_private_write(entry)) {
- /*
- * We do not preserve soft-dirtiness. See
- * copy_nonpresent_pte() for explanation.
- */
- entry = make_readable_device_private_entry(
- swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
- } else if (softleaf_is_marker(entry)) {
- /*
- * Ignore error swap entries unconditionally,
- * because any access should sigbus/sigsegv
- * anyway.
- */
- if (softleaf_is_poison_marker(entry) ||
- softleaf_is_guard_marker(entry))
- continue;
- /*
- * If this is uffd-wp pte marker and we'd like
- * to unprotect it, drop it; the next page
- * fault will trigger without uffd trapping.
- */
- if (uffd_wp_resolve) {
- pte_clear(vma->vm_mm, addr, pte);
- pages++;
- }
- continue;
- } else {
- newpte = oldpte;
- }
-
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
-
- if (!pte_same(oldpte, newpte)) {
- set_pte_at(vma->vm_mm, addr, pte, newpte);
- pages++;
- }
+ pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags);
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
lazy_mmu_mode_disable();
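A minimal sketch of the constant-propagation idea referred to in the comment above (hypothetical helpers, not from this patch): with __always_inline at every level, a call site passing a literal count of 1 lets the compiler discard the batch loop entirely.

static void example_frob_one(pte_t *ptep);	/* hypothetical per-PTE helper */

static __always_inline void example_frob_ptes(pte_t *ptep, int nr_ptes)
{
	int i;

	for (i = 0; i < nr_ptes; i++)
		example_frob_one(ptep + i);
}

/*
 * example_frob_ptes(ptep, 1) folds down to a single example_frob_one()
 * call, while the nr_ptes > 1 call site keeps the loop for large folios.
 */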
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 88cd53d4ba09..833f743f309f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1835,7 +1835,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
balance_domain_limits(mdtc, strictlimit);
}
- if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb))
+ if (!writeback_in_progress(wb) &&
+ (nr_dirty > gdtc->bg_thresh ||
+ (strictlimit && gdtc->wb_dirty > gdtc->wb_bg_thresh)))
wb_start_background_writeback(wb);
/*
@@ -1862,15 +1864,9 @@ free_running:
* Unconditionally start background writeback if it's not
* already in progress. We need to do this because the global
* dirty threshold check above (nr_dirty > gdtc->bg_thresh)
- * doesn't account for these cases:
- *
- * a) strictlimit BDIs: throttling is calculated using per-wb
- * thresholds. The per-wb threshold can be exceeded even when
- * nr_dirty < gdtc->bg_thresh
- *
- * b) memcg-based throttling: memcg uses its own dirty count and
- * thresholds and can trigger throttling even when global
- * nr_dirty < gdtc->bg_thresh
+ * doesn't account for the memcg-based throttling case. memcg
+ * uses its own dirty count and thresholds and can trigger
+ * throttling even when global nr_dirty < gdtc->bg_thresh
*
* Writeback needs to be started else the writer stalls in the
* throttle loop waiting for dirty pages to be written back
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 111b54df8a3c..227d58dc3de6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -297,11 +297,6 @@ int page_group_by_mobility_disabled __read_mostly;
*/
DEFINE_STATIC_KEY_TRUE(deferred_pages);
-static inline bool deferred_pages_enabled(void)
-{
- return static_branch_unlikely(&deferred_pages);
-}
-
/*
* deferred_grow_zone() is __init, but it is called from
* get_page_from_freelist() during early boot until deferred_pages permanently
@@ -314,11 +309,6 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
return deferred_grow_zone(zone, order);
}
#else
-static inline bool deferred_pages_enabled(void)
-{
- return false;
-}
-
static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
{
return false;
@@ -1252,10 +1242,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
union pgtag_ref_handle handle;
union codetag_ref ref;
- if (get_page_tag_ref(page, &ref, &handle)) {
+ if (likely(get_page_tag_ref(page, &ref, &handle))) {
alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
update_page_tag_ref(handle, &ref);
put_page_tag_ref(handle);
+ } else {
+ /*
+		 * page_ext is not available yet; record the pfn so we can
+		 * clear the tag ref later, once page_ext is initialized.
+ */
+ alloc_tag_add_early_pfn(page_to_pfn(page));
+ if (task->alloc_tag)
+ alloc_tag_set_inaccurate(task->alloc_tag);
}
}
@@ -6211,42 +6209,6 @@ void adjust_managed_page_count(struct page *page, long count)
}
EXPORT_SYMBOL(adjust_managed_page_count);
-unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
-{
- void *pos;
- unsigned long pages = 0;
-
- start = (void *)PAGE_ALIGN((unsigned long)start);
- end = (void *)((unsigned long)end & PAGE_MASK);
- for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
- struct page *page = virt_to_page(pos);
- void *direct_map_addr;
-
- /*
- * 'direct_map_addr' might be different from 'pos'
- * because some architectures' virt_to_page()
- * work with aliases. Getting the direct map
- * address ensures that we get a _writeable_
- * alias for the memset().
- */
- direct_map_addr = page_address(page);
- /*
- * Perform a kasan-unchecked memset() since this memory
- * has not been initialized.
- */
- direct_map_addr = kasan_reset_tag(direct_map_addr);
- if ((unsigned int)poison <= 0xFF)
- memset(direct_map_addr, poison, PAGE_SIZE);
-
- free_reserved_page(page);
- }
-
- if (pages && s)
- pr_info("Freeing %s memory: %ldK\n", s, K(pages));
-
- return pages;
-}
-
void free_reserved_page(struct page *page)
{
clear_page_tag_ref(page);
@@ -7775,6 +7737,11 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
return NULL;
+
+ /* On UP, spin_trylock() always succeeds even when it is locked */
+ if (!IS_ENABLED(CONFIG_SMP) && in_nmi())
+ return NULL;
+
if (!pcp_allowed_order(order))
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index 330abc5ab7b4..70cea9e24d2f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -276,10 +276,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
goto out_unlock;
}
+
+ rcu_read_lock();
if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
+ rcu_read_unlock();
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
+ rcu_read_unlock();
__swap_writepage(folio, swap_plug);
return 0;
@@ -307,11 +311,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
- memcg = folio_memcg(folio);
- if (!memcg)
+ if (!folio_memcg_charged(folio))
return;
rcu_read_lock();
+ memcg = folio_memcg(folio);
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
@@ -493,7 +497,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
folio_mark_uptodate(folio);
folio_unlock(folio);
}
- count_vm_events(PSWPIN, sio->pages);
+ count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
} else {
for (p = 0; p < sio->pages; p++) {
struct folio *folio = page_folio(sio->bvec[p].bv_page);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 2708c2b3ac1f..53a8997ec043 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -151,9 +151,8 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
if (&init_mm == mm)
return;
- if (pte_user_accessible_page(pte, addr)) {
+ if (pte_user_accessible_page(mm, addr, pte))
page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
- }
}
EXPORT_SYMBOL(__page_table_check_pte_clear);
@@ -163,9 +162,8 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
if (&init_mm == mm)
return;
- if (pmd_user_accessible_page(pmd, addr)) {
+ if (pmd_user_accessible_page(mm, addr, pmd))
page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
- }
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
@@ -175,9 +173,8 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
if (&init_mm == mm)
return;
- if (pud_user_accessible_page(pud, addr)) {
+ if (pud_user_accessible_page(mm, addr, pud))
page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
- }
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
@@ -211,7 +208,7 @@ void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
for (i = 0; i < nr; i++)
__page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i));
- if (pte_user_accessible_page(pte, addr))
+ if (pte_user_accessible_page(mm, addr, pte))
page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_ptes_set);
@@ -241,7 +238,7 @@ void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr,
for (i = 0; i < nr; i++)
__page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i));
- if (pmd_user_accessible_page(pmd, addr))
+ if (pmd_user_accessible_page(mm, addr, pmd))
page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmds_set);
@@ -257,7 +254,7 @@ void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr,
for (i = 0; i < nr; i++)
__page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i));
- if (pud_user_accessible_page(pud, addr))
+ if (pud_user_accessible_page(mm, addr, pud))
page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_puds_set);
diff --git a/mm/percpu.c b/mm/percpu.c
index a2107bdebf0b..b0676b8054ed 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
return true;
objcg = current_obj_cgroup();
- if (!objcg)
+ if (!objcg || obj_cgroup_is_root(objcg))
return true;
if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 19bf77925fa1..3b5dc21b323c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3177,119 +3177,99 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
#endif /* CONFIG_TMPFS_QUOTA */
#ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
-{
- struct inode *inode = file_inode(dst_vma->vm_file);
- struct shmem_inode_info *info = SHMEM_I(inode);
+static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct inode *inode = file_inode(vma->vm_file);
struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t pgoff = linear_page_index(vma, addr);
gfp_t gfp = mapping_gfp_mask(mapping);
- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- void *page_kaddr;
struct folio *folio;
- int ret;
- pgoff_t max_off;
- if (shmem_inode_acct_blocks(inode, 1)) {
- /*
- * We may have got a page, returned -ENOENT triggering a retry,
- * and now we find ourselves with -ENOMEM. Release the page, to
- * avoid a BUG_ON in our caller.
- */
- if (unlikely(*foliop)) {
- folio_put(*foliop);
- *foliop = NULL;
- }
- return -ENOMEM;
- }
-
- if (!*foliop) {
- ret = -ENOMEM;
- folio = shmem_alloc_folio(gfp, 0, info, pgoff);
- if (!folio)
- goto out_unacct_blocks;
+ if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)))
+ return NULL;
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
- page_kaddr = kmap_local_folio(folio, 0);
- /*
- * The read mmap_lock is held here. Despite the
- * mmap_lock being read recursive a deadlock is still
- * possible if a writer has taken a lock. For example:
- *
- * process A thread 1 takes read lock on own mmap_lock
- * process A thread 2 calls mmap, blocks taking write lock
- * process B thread 1 takes page fault, read lock on own mmap lock
- * process B thread 2 calls mmap, blocks taking write lock
- * process A thread 1 blocks taking read lock on process B
- * process B thread 1 blocks taking read lock on process A
- *
- * Disable page faults to prevent potential deadlock
- * and retry the copy outside the mmap_lock.
- */
- pagefault_disable();
- ret = copy_from_user(page_kaddr,
- (const void __user *)src_addr,
- PAGE_SIZE);
- pagefault_enable();
- kunmap_local(page_kaddr);
-
- /* fallback to copy_from_user outside mmap_lock */
- if (unlikely(ret)) {
- *foliop = folio;
- ret = -ENOENT;
- /* don't free the page */
- goto out_unacct_blocks;
- }
+ folio = shmem_alloc_folio(gfp, 0, info, pgoff);
+ if (!folio)
+ return NULL;
- flush_dcache_folio(folio);
- } else { /* ZEROPAGE */
- clear_user_highpage(&folio->page, dst_addr);
- }
- } else {
- folio = *foliop;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- *foliop = NULL;
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
+ folio_put(folio);
+ return NULL;
}
- VM_BUG_ON(folio_test_locked(folio));
- VM_BUG_ON(folio_test_swapbacked(folio));
+ return folio;
+}
+
+static int shmem_mfill_filemap_add(struct folio *folio,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t pgoff = linear_page_index(vma, addr);
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ int err;
+
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
- __folio_mark_uptodate(folio);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(pgoff >= max_off))
- goto out_release;
- ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
- if (ret)
- goto out_release;
- ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
- if (ret)
- goto out_release;
+ err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
+ if (err)
+ goto err_unlock;
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, flags);
- if (ret)
- goto out_delete_from_cache;
+ if (shmem_inode_acct_blocks(inode, 1)) {
+ err = -ENOMEM;
+ goto err_delete_from_cache;
+ }
+ folio_add_lru(folio);
shmem_recalc_inode(inode, 1, 0);
- folio_unlock(folio);
+
return 0;
-out_delete_from_cache:
+
+err_delete_from_cache:
filemap_remove_folio(folio);
-out_release:
+err_unlock:
folio_unlock(folio);
- folio_put(folio);
-out_unacct_blocks:
- shmem_inode_unacct_blocks(inode, 1);
- return ret;
+ return err;
}
+
+static void shmem_mfill_filemap_remove(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ filemap_remove_folio(folio);
+ shmem_recalc_inode(inode, 0, 0);
+ folio_unlock(folio);
+}
+
+static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff)
+{
+ struct folio *folio;
+ int err;
+
+ err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+ if (err)
+ return ERR_PTR(err);
+
+ return folio;
+}
+
+static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
+{
+ return true;
+}
+
+static const struct vm_uffd_ops shmem_uffd_ops = {
+ .can_userfault = shmem_can_userfault,
+ .get_folio_noalloc = shmem_get_folio_noalloc,
+ .alloc_folio = shmem_mfill_folio_alloc,
+ .filemap_add = shmem_mfill_filemap_add,
+ .filemap_remove = shmem_mfill_filemap_remove,
+};
#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS
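A hedged sketch of how a generic userfaultfd fill path might drive the new ops table (the caller shape and example_install_pte() are assumptions; only the op names and signatures come from this patch):

static int example_mfill_via_ops(struct vm_area_struct *vma, unsigned long addr)
{
	const struct vm_uffd_ops *ops = vma->vm_ops->uffd_ops;
	struct folio *folio;
	int err;

	if (!ops || !ops->can_userfault(vma, vma->vm_flags))
		return -EINVAL;

	folio = ops->alloc_folio(vma, addr);	/* allocated and memcg-charged */
	if (!folio)
		return -ENOMEM;

	err = ops->filemap_add(folio, vma, addr);	/* locks folio, adds it to the LRU */
	if (err) {
		folio_put(folio);
		return err;
	}

	err = example_install_pte(vma, addr, folio);	/* hypothetical PTE install step */
	if (err)
		ops->filemap_remove(folio, vma);	/* back out the page-cache insertion */
	return err;
}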
@@ -5325,6 +5305,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &shmem_uffd_ops,
+#endif
};
static const struct vm_operations_struct shmem_anon_vm_ops = {
@@ -5334,6 +5317,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &shmem_uffd_ops,
+#endif
};
int shmem_init_fs_context(struct fs_context *fc)
diff --git a/mm/shrinker.c b/mm/shrinker.c
index c23086bccf4d..76b3f750cf65 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -288,14 +288,10 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
int nid, index, offset;
long nr;
- struct mem_cgroup *parent;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct shrinker_info *child_info, *parent_info;
struct shrinker_info_unit *child_unit, *parent_unit;
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
/* Prevent from concurrent shrinker_info expand */
mutex_lock(&shrinker_mutex);
for_each_node(nid) {
diff --git a/mm/slub.c b/mm/slub.c
index 92362eeb13e5..0baa906f39ab 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5339,6 +5339,10 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
return NULL;
+ /* On UP, spin_trylock() always succeeds even when it is locked */
+ if (!IS_ENABLED(CONFIG_SMP) && in_nmi())
+ return NULL;
+
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
@@ -6645,16 +6649,6 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags,
if (!kasan_check_byte(p))
return NULL;
- /*
- * If reallocation is not necessary (e. g. the new size is less
- * than the current allocated size), the current allocation will be
- * preserved unless __GFP_THISNODE is set. In the latter case a new
- * allocation on the requested node will be attempted.
- */
- if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
- nid != page_to_nid(virt_to_page(p)))
- goto alloc_new;
-
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
@@ -6673,6 +6667,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags,
}
}
+ /*
+ * If reallocation is not necessary (e. g. the new size is less
+ * than the current allocated size), the current allocation will be
+ * preserved unless __GFP_THISNODE is set. In the latter case a new
+ * allocation on the requested node will be attempted.
+ */
+ if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
+ nid != page_to_nid(virt_to_page(p)))
+ goto alloc_new;
+
/* If the old object doesn't fit, allocate a bigger one */
if (new_size > ks)
goto alloc_new;
@@ -6707,7 +6711,7 @@ alloc_new:
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
- memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
+ memcpy(ret, kasan_reset_tag(p), min(new_size, (size_t)(orig_size ?: ks)));
kasan_enable_current();
}
@@ -6941,7 +6945,7 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig
if (p) {
/* We already know that `p` is not a vmalloc address. */
kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
+ memcpy(n, kasan_reset_tag(p), min(size, ksize(p)));
kasan_enable_current();
kfree(p);
diff --git a/mm/sparse.c b/mm/sparse.c
index 007fd52c621e..effdac6b0ab1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -403,7 +403,6 @@ failed:
ms = __nr_to_section(pnum);
if (!preinited_vmemmap_section(ms))
ms->section_mem_map = 0;
- ms->section_mem_map = 0;
}
}
diff --git a/mm/swap.c b/mm/swap.c
index 78b4aa811fc6..5cc44f0de987 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio)
__page_cache_release(folio, &lruvec, &flags);
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
}
void __folio_put(struct folio *folio)
@@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
folios_put(fbatch);
}
@@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio)
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
unsigned int nr_io, unsigned int nr_rotated)
__releases(lruvec->lru_lock)
+ __releases(rcu)
{
unsigned long cost;
@@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
if (!cost) {
spin_unlock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
return;
}
@@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
spin_unlock_irq(&lruvec->lru_lock);
lruvec = parent_lruvec(lruvec);
- if (!lruvec)
+ if (!lruvec) {
+ rcu_read_unlock();
break;
+ }
spin_lock_irq(&lruvec->lru_lock);
}
}
@@ -349,7 +353,7 @@ void folio_activate(struct folio *folio)
lruvec = folio_lruvec_lock_irq(folio);
lru_activate(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
folio_set_lru(folio);
}
#endif
@@ -412,18 +416,20 @@ static void lru_gen_inc_refs(struct folio *folio)
static bool lru_gen_clear_refs(struct folio *folio)
{
- struct lru_gen_folio *lrugen;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
+ unsigned long seq;
if (gen < 0)
return true;
set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
- lrugen = &folio_lruvec(folio)->lrugen;
+ rcu_read_lock();
+ seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]);
+ rcu_read_unlock();
/* whether can do without shuffling under the LRU lock */
- return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type]));
+ return gen == lru_gen_from_seq(seq);
}
#else /* !CONFIG_LRU_GEN */
@@ -963,7 +969,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
if (folio_is_zone_device(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
if (folio_ref_sub_and_test(folio, nr_refs))
@@ -977,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
/* hugetlb has its own memcg */
if (folio_test_hugetlb(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
free_huge_folio(folio);
@@ -991,7 +997,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
j++;
}
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
if (!j) {
folio_batch_reinit(folios);
return;
@@ -1084,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+#ifdef CONFIG_MEMCG
+static void lruvec_reparent_lru(struct lruvec *child_lruvec,
+ struct lruvec *parent_lruvec,
+ enum lru_list lru, int nid)
+{
+ int zid;
+ struct zone *zone;
+
+ if (lru != LRU_UNEVICTABLE)
+ list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
+
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+ unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+ mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+ }
+}
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ enum lru_list lru;
+ struct lruvec *child_lruvec, *parent_lruvec;
+
+ child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ parent_lruvec->anon_cost += child_lruvec->anon_cost;
+ parent_lruvec->file_cost += child_lruvec->file_cost;
+
+ for_each_lru(lru)
+ lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
+}
+#endif
+
static const struct ctl_table swap_sysctl_table[] = {
{
.procname = "page-cluster",
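
lru_reparent_memcg() above splices each child LRU list onto the parent's and transfers the per-zone size accounting. A small userspace model of that splice-plus-counter transfer, assuming one flat list per "lruvec" and ignoring zones and locking entirely:

#include <stdio.h>

struct node { struct node *prev, *next; };

struct lruvec_model {
	struct node lru;	/* circular list head */
	long nr_pages;
};

/* Splice the child's list onto the parent's tail and move the count over,
 * loosely mirroring lruvec_reparent_lru() above. */
static void reparent(struct lruvec_model *child, struct lruvec_model *parent)
{
	if (child->lru.next != &child->lru) {	/* child list not empty */
		struct node *first = child->lru.next;
		struct node *last = child->lru.prev;

		first->prev = parent->lru.prev;
		parent->lru.prev->next = first;
		last->next = &parent->lru;
		parent->lru.prev = last;
		child->lru.next = child->lru.prev = &child->lru;
	}
	parent->nr_pages += child->nr_pages;
	child->nr_pages = 0;
}

int main(void)
{
	struct lruvec_model child = { { &child.lru, &child.lru }, 0 };
	struct lruvec_model parent = { { &parent.lru, &parent.lru }, 5 };
	struct node folio;

	/* put one "folio" on the child list */
	folio.prev = folio.next = &child.lru;
	child.lru.next = child.lru.prev = &folio;
	child.nr_pages = 1;

	reparent(&child, &parent);
	printf("parent=%ld child=%ld parent_list_nonempty=%d\n",
	       parent.nr_pages, child.nr_pages,
	       parent.lru.next != &parent.lru);
	return 0;
}
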
diff --git a/mm/truncate.c b/mm/truncate.c
index 2931d66c16d0..12cc89f89afc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,6 +622,7 @@ static int folio_launder(struct address_space *mapping, struct folio *folio)
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
gfp_t gfp)
{
+ void (*free_folio)(struct folio *);
int ret;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -648,9 +649,12 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_lru_list_add(mapping->host);
+ free_folio = mapping->a_ops->free_folio;
spin_unlock(&mapping->host->i_lock);
- filemap_free_folio(mapping, folio);
+ if (free_folio)
+ free_folio(folio);
+ folio_put_refs(folio, folio_nr_pages(folio));
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
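
The folio_unmap_invalidate() hunk samples mapping->a_ops->free_folio into a local while i_lock is still held, so the mapping need not be dereferenced again after the lock is dropped. A tiny sketch of that "snapshot the callback under the lock, invoke it after unlocking" pattern, with a pthread mutex standing in for i_lock and all names illustrative:

#include <pthread.h>
#include <stdio.h>

struct object {
	pthread_mutex_t lock;
	void (*release)(void *);
	void *data;
};

static void do_release(void *data)
{
	printf("released %s\n", (const char *)data);
}

static void put_object(struct object *obj)
{
	void (*release)(void *);
	void *data;

	pthread_mutex_lock(&obj->lock);
	/* Snapshot while the object is guaranteed to be stable. */
	release = obj->release;
	data = obj->data;
	pthread_mutex_unlock(&obj->lock);

	/* Call outside the lock; obj itself must not be touched again. */
	if (release)
		release(data);
}

int main(void)
{
	struct object obj = { PTHREAD_MUTEX_INITIALIZER, do_release, "folio" };

	put_object(&obj);
	return 0;
}
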
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 89879c3ba344..180bad42fc79 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,12 +14,61 @@
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
-#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"
+struct mfill_state {
+ struct userfaultfd_ctx *ctx;
+ unsigned long src_start;
+ unsigned long dst_start;
+ unsigned long len;
+ uffd_flags_t flags;
+
+ struct vm_area_struct *vma;
+ unsigned long src_addr;
+ unsigned long dst_addr;
+ pmd_t *pmd;
+};
+
+static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
+{
+ /* anonymous memory does not support MINOR mode */
+ if (vm_flags & VM_UFFD_MINOR)
+ return false;
+ return true;
+}
+
+static struct folio *anon_alloc_folio(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ addr);
+
+ if (!folio)
+ return NULL;
+
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
+ folio_put(folio);
+ return NULL;
+ }
+
+ return folio;
+}
+
+static const struct vm_uffd_ops anon_uffd_ops = {
+ .can_userfault = anon_can_userfault,
+ .alloc_folio = anon_alloc_folio,
+};
+
+static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return &anon_uffd_ops;
+ return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL;
+}
+
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
@@ -143,6 +192,128 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma)
}
#endif
+static void mfill_put_vma(struct mfill_state *state)
+{
+ if (!state->vma)
+ return;
+
+ up_read(&state->ctx->map_changing_lock);
+ uffd_mfill_unlock(state->vma);
+ state->vma = NULL;
+}
+
+static int mfill_get_vma(struct mfill_state *state)
+{
+ struct userfaultfd_ctx *ctx = state->ctx;
+ uffd_flags_t flags = state->flags;
+ struct vm_area_struct *dst_vma;
+ const struct vm_uffd_ops *ops;
+ int err;
+
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len);
+ if (IS_ERR(dst_vma))
+ return PTR_ERR(dst_vma);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ state->vma = dst_vma;
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out_unlock;
+
+ err = -EINVAL;
+
+ /*
+ * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+ * it will overwrite vm_ops, so vma_is_anonymous must return false.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+ dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+
+ /*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ if (is_vm_hugetlb_page(dst_vma))
+ return 0;
+
+ ops = vma_uffd_ops(dst_vma);
+ if (!ops)
+ goto out_unlock;
+
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+ !ops->get_folio_noalloc)
+ goto out_unlock;
+
+ return 0;
+
+out_unlock:
+ mfill_put_vma(state);
+ return err;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return NULL;
+ /*
+ * Note that we didn't run this because the pmd was
+ * missing, the *pmd may be already established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ return pmd_alloc(mm, pud, address);
+}
+
+static int mfill_establish_pmd(struct mfill_state *state)
+{
+ struct mm_struct *dst_mm = state->ctx->mm;
+ pmd_t *dst_pmd, dst_pmdval;
+
+ dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
+ if (unlikely(!dst_pmd))
+ return -ENOMEM;
+
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_pmd)))
+ return -ENOMEM;
+
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval)))
+ return -EEXIST;
+ if (unlikely(pmd_bad(dst_pmdval)))
+ return -EFAULT;
+
+ state->pmd = dst_pmd;
+ return 0;
+}
+
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -165,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
* and anon, and for both shared and private VMAs.
*/
-int mfill_atomic_install_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr, struct page *page,
- bool newly_allocated, uffd_flags_t flags)
+static int mfill_atomic_install_pte(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ uffd_flags_t flags)
{
int ret;
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -212,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
goto out_unlock;
if (page_in_cache) {
- /* Usually, cache pages are already added to LRU */
- if (newly_allocated)
- folio_add_lru(folio);
folio_add_file_rmap_pte(folio, page, dst_vma);
} else {
folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
@@ -229,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ if (page_in_cache)
+ folio_unlock(folio);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
@@ -238,58 +409,110 @@ out:
return ret;
}
-static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
{
void *kaddr;
int ret;
+
+ kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
+ ret = copy_from_user(kaddr, (const void __user *) src_addr,
+ PAGE_SIZE);
+ pagefault_enable();
+ kunmap_local(kaddr);
+
+ if (ret)
+ return -EFAULT;
+
+ flush_dcache_folio(folio);
+ return ret;
+}
+
+static int mfill_copy_folio_retry(struct mfill_state *state,
+ struct folio *folio)
+{
+ const struct vm_uffd_ops *orig_ops = vma_uffd_ops(state->vma);
+ unsigned long src_addr = state->src_addr;
+ void *kaddr;
+ int err;
+
+ /* retry copying with mm_lock dropped */
+ mfill_put_vma(state);
+
+ kaddr = kmap_local_folio(folio, 0);
+ err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE);
+ kunmap_local(kaddr);
+ if (unlikely(err))
+ return -EFAULT;
+
+ flush_dcache_folio(folio);
+
+ /* reget VMA and PMD, they could change underneath us */
+ err = mfill_get_vma(state);
+ if (err)
+ return err;
+
+ /*
+ * The VMA type may have changed while the lock was dropped
+ * (e.g. replaced with a hugetlb mapping), making the caller's
+ * ops pointer stale.
+ */
+ if (vma_uffd_ops(state->vma) != orig_ops)
+ return -EAGAIN;
+
+ err = mfill_establish_pmd(state);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __mfill_atomic_pte(struct mfill_state *state,
+ const struct vm_uffd_ops *ops)
+{
+ unsigned long dst_addr = state->dst_addr;
+ unsigned long src_addr = state->src_addr;
+ uffd_flags_t flags = state->flags;
struct folio *folio;
+ int ret;
- if (!*foliop) {
- ret = -ENOMEM;
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
- dst_addr);
- if (!folio)
- goto out;
+ folio = ops->alloc_folio(state->vma, state->dst_addr);
+ if (!folio)
+ return -ENOMEM;
- kaddr = kmap_local_folio(folio, 0);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
+ ret = mfill_copy_folio_locked(folio, src_addr);
/*
- * The read mmap_lock is held here. Despite the
- * mmap_lock being read recursive a deadlock is still
- * possible if a writer has taken a lock. For example:
- *
- * process A thread 1 takes read lock on own mmap_lock
- * process A thread 2 calls mmap, blocks taking write lock
- * process B thread 1 takes page fault, read lock on own mmap lock
- * process B thread 2 calls mmap, blocks taking write lock
- * process A thread 1 blocks taking read lock on process B
- * process B thread 1 blocks taking read lock on process A
- *
- * Disable page faults to prevent potential deadlock
- * and retry the copy outside the mmap_lock.
+ * Fallback to copy_from_user outside mmap_lock.
+		 * If retry is successful, mfill_copy_folio_retry() returns
+		 * with the locks retaken by mfill_get_vma().
+ * If there was an error, we must mfill_put_vma() anyway and it
+ * will take care of unlocking if needed.
*/
- pagefault_disable();
- ret = copy_from_user(kaddr, (const void __user *) src_addr,
- PAGE_SIZE);
- pagefault_enable();
- kunmap_local(kaddr);
-
- /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- ret = -ENOENT;
- *foliop = folio;
- /* don't free the page */
- goto out;
+ ret = mfill_copy_folio_retry(state, folio);
+ if (ret)
+ goto err_folio_put;
}
-
- flush_dcache_folio(folio);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+ clear_user_highpage(&folio->page, state->dst_addr);
} else {
- folio = *foliop;
- *foliop = NULL;
+ VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
}
/*
@@ -299,63 +522,65 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
*/
__folio_mark_uptodate(folio);
- ret = -ENOMEM;
- if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
- goto out_release;
+ if (ops->filemap_add) {
+ ret = ops->filemap_add(folio, state->vma, state->dst_addr);
+ if (ret)
+ goto err_folio_put;
+ }
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, flags);
+ ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr,
+ &folio->page, flags);
if (ret)
- goto out_release;
-out:
- return ret;
-out_release:
+ goto err_filemap_remove;
+
+ return 0;
+
+err_filemap_remove:
+ if (ops->filemap_remove)
+ ops->filemap_remove(folio, state->vma);
+err_folio_put:
folio_put(folio);
- goto out;
+ return ret;
}
-static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_copy(struct mfill_state *state)
{
- struct folio *folio;
- int ret = -ENOMEM;
-
- folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
- if (!folio)
- return ret;
-
- if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
- goto out_put;
+ const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);
/*
- * The memory barrier inside __folio_mark_uptodate makes sure that
- * zeroing out the folio become visible before mapping the page
- * using set_pte_at(). See do_anonymous_page().
+ * The normal page fault path for a MAP_PRIVATE mapping in a
+ * file-backed VMA will invoke the fault, fill the hole in the file and
+ * COW it right away. The result generates plain anonymous memory.
+ * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll
+ * generate anonymous memory directly without actually filling the
+ * hole. For the MAP_PRIVATE case the robustness check only happens in
+ * the pagetable (to verify it's still none) and not in the page cache.
*/
- __folio_mark_uptodate(folio);
+ if (!(state->vma->vm_flags & VM_SHARED))
+ ops = &anon_uffd_ops;
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, 0);
- if (ret)
- goto out_put;
+ return __mfill_atomic_pte(state, ops);
+}
- return 0;
-out_put:
- folio_put(folio);
- return ret;
+static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state)
+{
+ const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);
+
+ return __mfill_atomic_pte(state, ops);
}
-static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
- if (mm_forbids_zeropage(dst_vma->vm_mm))
- return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+ if (mm_forbids_zeropage(dst_vma->vm_mm) ||
+ (dst_vma->vm_flags & VM_SHARED))
+ return mfill_atomic_pte_zeroed_folio(state);
_dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr),
dst_vma->vm_page_prot));
@@ -381,28 +606,29 @@ out:
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_continue(struct mfill_state *state)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
+ struct vm_area_struct *dst_vma = state->vma;
+ const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma);
+ unsigned long dst_addr = state->dst_addr;
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
struct page *page;
int ret;
- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find folio */
- if (ret == -ENOENT)
- ret = -EFAULT;
- if (ret)
- goto out;
- if (!folio) {
- ret = -EFAULT;
- goto out;
+ if (!ops) {
+ VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA");
+ return -EOPNOTSUPP;
}
+ folio = ops->get_folio_noalloc(inode, pgoff);
+ /* Our caller expects us to return -EFAULT if we failed to find folio */
+ if (IS_ERR_OR_NULL(folio))
+ return -EFAULT;
+
page = folio_file_page(folio, pgoff);
if (PageHWPoison(page)) {
ret = -EIO;
@@ -410,30 +636,28 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
}
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- page, false, flags);
+ page, flags);
if (ret)
goto out_release;
- folio_unlock(folio);
- ret = 0;
-out:
- return ret;
+ return 0;
+
out_release:
folio_unlock(folio);
folio_put(folio);
- goto out;
+ return ret;
}
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
-static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_poison(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
struct mm_struct *dst_mm = dst_vma->vm_mm;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
+ int ret;
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
ret = -EAGAIN;
@@ -462,27 +686,6 @@ out:
return ret;
}
-static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
-
- pgd = pgd_offset(mm, address);
- p4d = p4d_alloc(mm, pgd, address);
- if (!p4d)
- return NULL;
- pud = pud_alloc(mm, p4d, address);
- if (!pud)
- return NULL;
- /*
- * Note that we didn't run this because the pmd was
- * missing, the *pmd may be already established and in
- * turn it may also be a trans_huge_pmd.
- */
- return pmd_alloc(mm, pud, address);
-}
-
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
@@ -657,48 +860,21 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
- ssize_t err;
-
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
- return mfill_atomic_pte_continue(dst_pmd, dst_vma,
- dst_addr, flags);
- } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
- return mfill_atomic_pte_poison(dst_pmd, dst_vma,
- dst_addr, flags);
- }
-
- /*
- * The normal page fault path for a shmem will invoke the
- * fault, fill the hole in the file and COW it right away. The
- * result generates plain anonymous memory. So when we are
- * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
- * generate anonymous memory directly without actually filling
- * the hole. For the MAP_PRIVATE case the robustness check
- * only happens in the pagetable (to verify it's still none)
- * and not in the radix tree.
- */
- if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
- err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
- else
- err = mfill_atomic_pte_zeropage(dst_pmd,
- dst_vma, dst_addr);
- } else {
- err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
- }
-
- return err;
+ uffd_flags_t flags = state->flags;
+
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+ return mfill_atomic_pte_continue(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
+ return mfill_atomic_pte_poison(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
+ return mfill_atomic_pte_copy(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE))
+ return mfill_atomic_pte_zeropage(state);
+
+ VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
+ return -EOPNOTSUPP;
}
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
@@ -707,13 +883,17 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long len,
uffd_flags_t flags)
{
- struct mm_struct *dst_mm = ctx->mm;
- struct vm_area_struct *dst_vma;
+ struct mfill_state state = (struct mfill_state){
+ .ctx = ctx,
+ .dst_start = dst_start,
+ .src_start = src_start,
+ .flags = flags,
+ .len = len,
+ .src_addr = src_start,
+ .dst_addr = dst_start,
+ };
+ long copied = 0;
ssize_t err;
- pmd_t *dst_pmd;
- unsigned long src_addr, dst_addr;
- long copied;
- struct folio *folio;
/*
* Sanitize the command parameters:
@@ -725,125 +905,35 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
VM_WARN_ON_ONCE(src_start + len <= src_start);
VM_WARN_ON_ONCE(dst_start + len <= dst_start);
- src_addr = src_start;
- dst_addr = dst_start;
- copied = 0;
- folio = NULL;
-retry:
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
- if (IS_ERR(dst_vma)) {
- err = PTR_ERR(dst_vma);
+ err = mfill_get_vma(&state);
+ if (err)
goto out;
- }
-
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- down_read(&ctx->map_changing_lock);
- err = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
- goto out_unlock;
-
- err = -EINVAL;
- /*
- * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
- * it will overwrite vm_ops, so vma_is_anonymous must return false.
- */
- if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
- dst_vma->vm_flags & VM_SHARED))
- goto out_unlock;
-
- /*
- * validate 'mode' now that we know the dst_vma: don't allow
- * a wrprotect copy if the userfaultfd didn't register as WP.
- */
- if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
- goto out_unlock;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
- if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ if (is_vm_hugetlb_page(state.vma))
+ return mfill_atomic_hugetlb(ctx, state.vma, dst_start,
src_start, len, flags);
- if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
- goto out_unlock;
- if (!vma_is_shmem(dst_vma) &&
- uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
- goto out_unlock;
-
- while (src_addr < src_start + len) {
- pmd_t dst_pmdval;
-
- VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
+ while (state.src_addr < src_start + len) {
+ VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
- dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
- if (unlikely(!dst_pmd)) {
- err = -ENOMEM;
+ err = mfill_establish_pmd(&state);
+ if (err)
break;
- }
- dst_pmdval = pmdp_get_lockless(dst_pmd);
- if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd))) {
- err = -ENOMEM;
- break;
- }
- dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is THP don't override it and just be strict.
- * (This includes the case where the PMD used to be THP and
- * changed back to none after __pte_alloc().)
- */
- if (unlikely(!pmd_present(dst_pmdval) ||
- pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
- if (unlikely(pmd_bad(dst_pmdval))) {
- err = -EFAULT;
- break;
- }
/*
* For shmem mappings, khugepaged is allowed to remove page
* tables under us; pte_offset_map_lock() will deal with that.
*/
- err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
- src_addr, flags, &folio);
+ err = mfill_atomic_pte(&state);
cond_resched();
- if (unlikely(err == -ENOENT)) {
- void *kaddr;
-
- up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
- VM_WARN_ON_ONCE(!folio);
-
- kaddr = kmap_local_folio(folio, 0);
- err = copy_from_user(kaddr,
- (const void __user *) src_addr,
- PAGE_SIZE);
- kunmap_local(kaddr);
- if (unlikely(err)) {
- err = -EFAULT;
- goto out;
- }
- flush_dcache_folio(folio);
- goto retry;
- } else
- VM_WARN_ON_ONCE(folio);
-
if (!err) {
- dst_addr += PAGE_SIZE;
- src_addr += PAGE_SIZE;
+ state.dst_addr += PAGE_SIZE;
+ state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
@@ -853,12 +943,8 @@ retry:
break;
}
-out_unlock:
- up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
+ mfill_put_vma(&state);
out:
- if (folio)
- folio_put(folio);
VM_WARN_ON_ONCE(copied < 0);
VM_WARN_ON_ONCE(err > 0);
VM_WARN_ON_ONCE(!copied && !err);
@@ -1938,6 +2024,38 @@ out:
return moved ? moved : err;
}
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+ bool wp_async)
+{
+ const struct vm_uffd_ops *ops = vma_uffd_ops(vma);
+
+ if (vma->vm_flags & VM_DROPPABLE)
+ return false;
+
+ vm_flags &= __VM_UFFD_FLAGS;
+
+ /*
+ * If WP is the only mode enabled and context is wp async, allow any
+ * memory type.
+ */
+ if (wp_async && (vm_flags == VM_UFFD_WP))
+ return true;
+
+ /* For any other mode reject VMAs that don't implement vm_uffd_ops */
+ if (!ops)
+ return false;
+
+ /*
+	 * If the user requested uffd-wp but has not enabled pte markers for
+	 * uffd-wp, then only anonymous memory is supported.
+ */
+ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
+ !vma_is_anonymous(vma))
+ return false;
+
+ return ops->can_userfault(vma, vm_flags);
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
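
The new vm_uffd_ops table above lets each VMA type supply its own userfaultfd hooks: the hunks show anon_uffd_ops providing can_userfault and alloc_folio, with vma_uffd_ops() choosing between the anonymous table and vma->vm_ops->uffd_ops. Below is a runnable userspace model of that dispatch; every name here (fake_vma, fake_uffd_ops, the flag bit) is a stand-in, not a kernel API:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_vma;

struct fake_uffd_ops {
	bool (*can_userfault)(struct fake_vma *vma, unsigned long vm_flags);
	void *(*alloc_folio)(struct fake_vma *vma, unsigned long addr);
};

struct fake_vma {
	bool anonymous;
	const struct fake_uffd_ops *ops;	/* NULL: type not supported */
};

static bool anon_can_userfault(struct fake_vma *vma, unsigned long vm_flags)
{
	(void)vma;
	return !(vm_flags & 0x1);	/* stand-in for "no MINOR mode on anon" */
}

static void *anon_alloc_folio(struct fake_vma *vma, unsigned long addr)
{
	(void)vma; (void)addr;
	return calloc(1, 4096);
}

static const struct fake_uffd_ops anon_ops = {
	.can_userfault	= anon_can_userfault,
	.alloc_folio	= anon_alloc_folio,
};

/* Mirrors the shape of vma_uffd_ops(): anon gets a built-in table. */
static const struct fake_uffd_ops *vma_ops(struct fake_vma *vma)
{
	return vma->anonymous ? &anon_ops : vma->ops;
}

int main(void)
{
	struct fake_vma vma = { .anonymous = true };
	const struct fake_uffd_ops *ops = vma_ops(&vma);

	if (ops && ops->can_userfault(&vma, 0)) {
		void *page = ops->alloc_folio(&vma, 0);

		printf("allocated %p\n", page);
		free(page);
	}
	return 0;
}
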
diff --git a/mm/util.c b/mm/util.c
index f063fd4de1e8..3cc949a0b7ed 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1232,7 +1232,7 @@ int __compat_vma_mmap(struct vm_area_desc *desc,
/* Update the VMA from the descriptor. */
compat_set_vma_from_desc(vma, desc);
/* Complete any specified mmap actions. */
- return mmap_action_complete(vma, &desc->action);
+ return mmap_action_complete(vma, &desc->action, /*is_compat=*/true);
}
EXPORT_SYMBOL(__compat_vma_mmap);
@@ -1281,16 +1281,6 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
}
EXPORT_SYMBOL(compat_vma_mmap);
-int __vma_check_mmap_hook(struct vm_area_struct *vma)
-{
- /* vm_ops->mapped is not valid if mmap() is specified. */
- if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
- return -EINVAL;
-
- return 0;
-}
-EXPORT_SYMBOL(__vma_check_mmap_hook);
-
static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
const struct page *page)
{
@@ -1399,7 +1389,8 @@ static int call_vma_mapped(struct vm_area_struct *vma)
}
static int mmap_action_finish(struct vm_area_struct *vma,
- struct mmap_action *action, int err)
+ struct mmap_action *action, int err,
+ bool is_compat)
{
size_t len;
@@ -1410,8 +1401,12 @@ static int mmap_action_finish(struct vm_area_struct *vma,
/* do_munmap() might take rmap lock, so release if held. */
maybe_rmap_unlock_action(vma, action);
- if (!err)
- return 0;
+ /*
+ * If this is invoked from the compatibility layer, post-mmap() hook
+ * logic will handle cleanup for us.
+ */
+ if (!err || is_compat)
+ return err;
/*
* If an error occurs, unmap the VMA altogether and return an error. We
@@ -1461,13 +1456,15 @@ EXPORT_SYMBOL(mmap_action_prepare);
* mmap_action_complete - Execute VMA descriptor action.
* @vma: The VMA to perform the action upon.
* @action: The action to perform.
+ * @is_compat: Is this being invoked from the compatibility layer?
*
* Similar to mmap_action_prepare().
*
- * Return: 0 on success, or error, at which point the VMA will be unmapped.
+ * Return: 0 on success, or error, at which point the VMA will be unmapped if
+ * !@is_compat.
*/
int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action)
+ struct mmap_action *action, bool is_compat)
{
int err = 0;
@@ -1488,7 +1485,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
break;
}
- return mmap_action_finish(vma, action, err);
+ return mmap_action_finish(vma, action, err, is_compat);
}
EXPORT_SYMBOL(mmap_action_complete);
#else
@@ -1510,7 +1507,8 @@ int mmap_action_prepare(struct vm_area_desc *desc)
EXPORT_SYMBOL(mmap_action_prepare);
int mmap_action_complete(struct vm_area_struct *vma,
- struct mmap_action *action)
+ struct mmap_action *action,
+ bool is_compat)
{
int err = 0;
@@ -1527,7 +1525,7 @@ int mmap_action_complete(struct vm_area_struct *vma,
break;
}
- return mmap_action_finish(vma, action, err);
+ return mmap_action_finish(vma, action, err, is_compat);
}
EXPORT_SYMBOL(mmap_action_complete);
#endif
diff --git a/mm/vma.c b/mm/vma.c
index 377321b48734..d90791b00a7b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2780,7 +2780,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
__mmap_complete(&map, vma);
if (have_mmap_prepare && allocated_new) {
- error = mmap_action_complete(vma, &desc.action);
+ error = mmap_action_complete(vma, &desc.action,
+ /*is_compat=*/false);
if (error)
return error;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b31b208f6ecb..c31a8615a832 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4361,7 +4361,7 @@ need_realloc:
return NULL;
if (p) {
- memcpy(n, p, old_size);
+ memcpy(n, p, min(size, old_size));
vfree(p);
}
@@ -5416,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct vmap_node *vn;
+ guard(mutex)(&vmap_purge_lock);
for_each_vmap_node(vn)
decay_va_pool_node(vn, true);
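
The vmap_node_shrink_scan() hunk uses guard(mutex)(&vmap_purge_lock), the kernel's scope-based lock guard, so the mutex is released automatically on every return path. The same shape can be sketched in plain C with the GCC/Clang cleanup attribute; this is only an illustration of the pattern, not the kernel's cleanup.h implementation:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t purge_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
	printf("lock released on scope exit\n");
}

/* Rough equivalent of guard(mutex)(&lock): unlock when the scope ends. */
#define scoped_guard_mutex(lock) \
	pthread_mutex_t *g __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(lock), (lock))

static int shrink_scan(int nodes)
{
	scoped_guard_mutex(&purge_lock);

	if (nodes == 0)
		return 0;	/* early return still unlocks */
	return nodes * 2;
}

int main(void)
{
	printf("freed %d\n", shrink_scan(3));
	printf("freed %d\n", shrink_scan(0));
	return 0;
}
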
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4bf091b1c8af..bd1b1aa12581 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
-/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
- * and including the specified highidx
- * @zone: The current zone in the iterator
- * @pgdat: The pgdat which node_zones are being iterated
- * @idx: The index variable
- * @highidx: The index of the highest zone to return
- *
- * This macro iterates through all managed zones up to and including the specified highidx.
- * The zone iterator enters an invalid state after macro call and must be reinitialized
- * before it can be used again.
- */
-#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
- for ((idx) = 0, (zone) = (pgdat)->node_zones; \
- (idx) <= (highidx); \
- (idx)++, (zone)++) \
- if (!managed_zone(zone)) \
- continue; \
- else
-
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -409,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
-static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zone_idx)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
unsigned long size = 0;
int zid;
@@ -1831,7 +1811,7 @@ bool folio_isolate_lru(struct folio *folio)
folio_get(folio);
lruvec = folio_lruvec_lock_irq(folio);
lruvec_del_folio(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
ret = true;
}
@@ -1885,24 +1865,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file,
/*
* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
*
- * Returns the number of pages moved to the given lruvec.
+ * Returns the number of pages moved to the appropriate lruvec.
+ *
+ * Note: The caller must not hold any lruvec lock.
*/
-static unsigned int move_folios_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct list_head *list)
{
int nr_pages, nr_moved = 0;
+ struct lruvec *lruvec = NULL;
struct folio_batch free_folios;
folio_batch_init(&free_folios);
while (!list_empty(list)) {
struct folio *folio = lru_to_folio(list);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
list_del(&folio->lru);
if (unlikely(!folio_evictable(folio))) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
folio_putback_lru(folio);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
continue;
}
@@ -1924,20 +1907,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
}
continue;
}
- /*
- * All pages were isolated from the same lruvec (and isolation
- * inhibits memcg migration).
- */
- VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
@@ -1945,11 +1923,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
workingset_age_nonresident(lruvec, nr_pages);
}
+ if (lruvec)
+ lruvec_unlock_irq(lruvec);
+
if (free_folios.nr) {
- spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
}
return nr_moved;
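
move_folios_to_lru() now takes no lruvec argument: folio_lruvec_relock_irq() locks whichever lruvec the next folio belongs to and keeps that lock as long as consecutive folios share it. A small userspace model of the lazy relocking, with an array of mutexes standing in for per-lruvec lru_lock (purely illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_LRUVECS 2

static pthread_mutex_t lru_lock[NR_LRUVECS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct item { int lruvec; };

/* Lock the lruvec 'it' belongs to, dropping a previously held one if needed. */
static int relock(int held, const struct item *it)
{
	if (held == it->lruvec)
		return held;
	if (held >= 0)
		pthread_mutex_unlock(&lru_lock[held]);
	pthread_mutex_lock(&lru_lock[it->lruvec]);
	return it->lruvec;
}

int main(void)
{
	struct item items[] = { {0}, {0}, {1}, {1}, {0} };
	int held = -1;
	int switches = 0, i;

	for (i = 0; i < 5; i++) {
		int prev = held;

		held = relock(held, &items[i]);
		if (held != prev)
			switches++;
		/* ... move items[i] while the right lock is held ... */
	}
	if (held >= 0)
		pthread_mutex_unlock(&lru_lock[held]);
	printf("lock switches: %d\n", switches);
	return 0;
}
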
@@ -1998,7 +1977,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
&nr_scanned, sc, lru);
@@ -2008,7 +1987,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
mod_lruvec_state(lruvec, item, nr_scanned);
mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
if (nr_taken == 0)
return 0;
@@ -2016,16 +1995,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
lruvec_memcg(lruvec));
- spin_lock_irq(&lruvec->lru_lock);
- move_folios_to_lru(lruvec, &folio_list);
+ move_folios_to_lru(&folio_list);
mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
mod_lruvec_state(lruvec, item, nr_reclaimed);
mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed);
+ lruvec_lock_irq(lruvec);
lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
nr_scanned - nr_reclaimed);
@@ -2104,7 +2083,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2113,7 +2092,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
mod_lruvec_state(lruvec, PGREFILL, nr_scanned);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
while (!list_empty(&l_hold)) {
struct folio *folio;
@@ -2162,16 +2141,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move folios back to the lru list.
*/
- spin_lock_irq(&lruvec->lru_lock);
-
- nr_activate = move_folios_to_lru(lruvec, &l_active);
- nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
+ nr_activate = move_folios_to_lru(&l_active);
+ nr_deactivate = move_folios_to_lru(&l_inactive);
- __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ count_vm_events(PGDEACTIVATE, nr_deactivate);
count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
+ lruvec_lock_irq(lruvec);
lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
@@ -2886,8 +2863,9 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
return NULL;
clear_bit(key, &mm->lru_gen.bitmap);
+ mmgrab(mm);
- return mmget_not_zero(mm) ? mm : NULL;
+ return mm;
}
void lru_gen_add_mm(struct mm_struct *mm)
@@ -3087,7 +3065,7 @@ done:
reset_bloom_filter(mm_state, walk->seq + 1);
if (*iter)
- mmput_async(*iter);
+ mmdrop(*iter);
*iter = mm;
@@ -3442,8 +3420,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
if (folio_nid(folio) != pgdat->node_id)
return NULL;
+ rcu_read_lock();
if (folio_memcg(folio) != memcg)
- return NULL;
+ folio = NULL;
+ rcu_read_unlock();
return folio;
}
@@ -3803,9 +3783,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
}
if (walk->batched) {
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
reset_batch_size(walk);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
}
cond_resched();
@@ -3965,7 +3945,7 @@ restart:
if (seq < READ_ONCE(lrugen->max_seq))
return false;
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -3980,7 +3960,7 @@ restart:
if (inc_min_seq(lruvec, type, swappiness))
continue;
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
cond_resched();
goto restart;
}
@@ -4015,7 +3995,7 @@ restart:
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
unlock:
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
return success;
}
@@ -4213,12 +4193,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
struct pglist_data *pgdat = folio_pgdat(folio);
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
- DEFINE_MAX_SEQ(lruvec);
- int gen = lru_gen_from_seq(max_seq);
+ struct lruvec *lruvec;
+ struct lru_gen_mm_state *mm_state;
+ unsigned long max_seq;
+ int gen;
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4253,6 +4233,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
}
}
+ memcg = get_mem_cgroup_from_folio(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+ gen = lru_gen_from_seq(max_seq);
+ mm_state = get_mm_state(lruvec);
+
lazy_mmu_mode_enable();
pte -= (addr - start) / PAGE_SIZE;
@@ -4302,6 +4288,8 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+ mem_cgroup_put(memcg);
+
return true;
}
@@ -4437,6 +4425,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
+}
+
+static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg,
+ struct lruvec *lruvec)
+{
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
+ int swappiness = mem_cgroup_swappiness(memcg);
+ DEFINE_MAX_SEQ(lruvec);
+ bool success = false;
+
+ /*
+	 * We are not iterating the mm_list here; updating mm_state->seq is just
+	 * to make the mm walkers work properly.
+ */
+ if (mm_state) {
+ spin_lock(&mm_list->lock);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ if (max_seq > mm_state->seq) {
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ success = true;
+ }
+ spin_unlock(&mm_list->lock);
+ } else {
+ success = true;
+ }
+
+ if (success)
+ inc_max_seq(lruvec, max_seq, swappiness);
+}
+
+/*
+ * We need to ensure that the folios of a child memcg can be reparented to the
+ * same gen of the parent memcg, so the gens of the parent memcg need to be
+ * incremented to MAX_NR_GENS before reparenting.
+ */
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
+ try_to_inc_max_seq_nowalk(memcg, lruvec);
+ cond_resched();
+ }
+ }
+}
+
+/*
+ * Compared to the traditional LRU, MGLRU faces the following challenges:
+ *
+ * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, so the
+ *    parent and child memcg may have different numbers of generations; we
+ *    cannot simply transfer MGLRU folios from the child memcg to the parent
+ *    memcg as we do for traditional LRU folios.
+ * 2. The generation information is stored in folio->flags, but we cannot
+ *    traverse these folios while holding the lru lock, otherwise it may
+ *    cause a softlockup.
+ * 3. In walk_update_folio(), the gen of a folio and the corresponding lru
+ *    size may be updated, but the folio is not immediately moved to the
+ *    corresponding lru list. Therefore, an LRU list may hold folios of
+ *    different generations.
+ * 4. In lru_gen_del_folio(), the generation a folio belongs to is derived
+ *    from the generation information in folio->flags, and the corresponding
+ *    LRU size is updated. Therefore, the lru size must be kept correct
+ *    during reparenting, otherwise lru_gen_del_folio() will update it
+ *    incorrectly.
+ *
+ * Finally, we settle on a compromise: splice each lru list in the child
+ * memcg onto the lru list of the same generation in the parent memcg during
+ * reparenting.
+ *
+ * The same generation has different meanings in the parent and child memcg,
+ * so this compromise causes LRU inversion. But as the system runs, the
+ * problem corrects itself.
+ */
+static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec,
+ int zone, int type)
+{
+ struct lru_gen_folio *child_lrugen, *parent_lrugen;
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int i;
+
+ child_lrugen = &child_lruvec->lrugen;
+ parent_lrugen = &parent_lruvec->lrugen;
+
+ for (i = 0; i < get_nr_gens(child_lruvec, type); i++) {
+ int gen = lru_gen_from_seq(child_lrugen->max_seq - i);
+ long nr_pages = child_lrugen->nr_pages[gen][type][zone];
+ int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0;
+ int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0;
+
+ /* Assuming that child pages are colder than parent pages */
+ list_splice_tail_init(&child_lrugen->folios[gen][type][zone],
+ &parent_lrugen->folios[gen][type][zone]);
+
+ WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0);
+ WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone],
+ parent_lrugen->nr_pages[gen][type][zone] + nr_pages);
+
+ if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) {
+ __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages);
+ __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages);
+ }
+ }
+}
+
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ struct lruvec *child_lruvec, *parent_lruvec;
+ int type, zid;
+ struct zone *zone;
+ enum lru_list lru;
+
+ child_lruvec = get_lruvec(memcg, nid);
+ parent_lruvec = get_lruvec(parent, nid);
+
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1)
+ for (type = 0; type < ANON_AND_FILE; type++)
+ __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type);
+
+ for_each_lru(lru) {
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+ unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+ mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+ }
+ }
+}
+
#endif /* CONFIG_MEMCG */
/******************************************************************************
@@ -4630,7 +4760,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
static int get_tier_idx(struct lruvec *lruvec, int type)
{
int tier;
- struct ctrl_pos sp, pv;
+ struct ctrl_pos sp, pv = {};
/*
* To leave a margin for fluctuations, use a larger gain factor (2:3).
@@ -4649,7 +4779,7 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- struct ctrl_pos sp, pv;
+ struct ctrl_pos sp, pv = {};
if (swappiness <= MIN_SWAPPINESS + 1)
return LRU_GEN_FILE;
@@ -4707,7 +4837,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
@@ -4716,7 +4846,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
if (list_empty(&list))
return scanned;
@@ -4749,14 +4879,14 @@ retry:
set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
- spin_lock_irq(&lruvec->lru_lock);
-
- move_folios_to_lru(lruvec, &list);
+ move_folios_to_lru(&list);
walk = current->reclaim_state->mm_walk;
if (walk && walk->batched) {
walk->lruvec = lruvec;
+ lruvec_lock_irq(lruvec);
reset_batch_size(walk);
+ lruvec_unlock_irq(lruvec);
}
mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
@@ -4766,8 +4896,6 @@ retry:
mod_lruvec_state(lruvec, item, reclaimed);
mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
-
list_splice_init(&clean, &list);
if (!list_empty(&list)) {
@@ -4843,10 +4971,6 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
int i;
enum zone_watermarks mark;
- /* don't abort memcg reclaim to ensure fairness */
- if (!root_reclaim(sc))
- return false;
-
if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
return true;
@@ -4900,9 +5024,24 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* If too many file cache in the coldest generation can't be evicted
* due to being dirty, wake up the flusher.
*/
- if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) {
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+		 * For cgroupv1, dirty throttling is achieved by waking up
+		 * the kernel flusher here and later waiting for folios
+		 * under writeback to finish (see shrink_folio_list()).
+		 *
+		 * The flusher may not be able to issue writeback quickly
+		 * enough for cgroupv1 writeback throttling to work
+		 * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -5196,7 +5335,7 @@ static void lru_gen_change_state(bool enabled)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
VM_WARN_ON_ONCE(!state_is_valid(lruvec));
@@ -5204,12 +5343,12 @@ static void lru_gen_change_state(bool enabled)
lruvec->lrugen.enabled = enabled;
while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
cond_resched();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
}
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
}
cond_resched();
@@ -7898,7 +8037,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c360c1b29ac9..f534972f517d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2141,7 +2141,7 @@ static void vmstat_shepherd(struct work_struct *w)
if (cpu_is_isolated(cpu))
continue;
- if (!delayed_work_pending(dw) && need_update(cpu))
+ if (!work_busy(&dw->work) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
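
The vmstat_shepherd hunk replaces delayed_work_pending() with work_busy(): the former is false once the callback has started executing, while the latter also reports a currently running work item, so the shepherd avoids re-queueing an update for a CPU that is still processing the previous one. A toy model of the distinction (the flags below are illustrative, not the workqueue internals):

#include <stdbool.h>
#include <stdio.h>

struct toy_work {
	bool timer_armed;	/* queued, waiting for its delay */
	bool running;		/* callback currently executing */
};

static bool toy_pending(const struct toy_work *w)
{
	return w->timer_armed;
}

static bool toy_busy(const struct toy_work *w)
{
	return w->timer_armed || w->running;
}

int main(void)
{
	struct toy_work w = { .timer_armed = false, .running = true };

	printf("pending=%d busy=%d\n", toy_pending(&w), toy_busy(&w));
	/* pending=0 busy=1: only the busy check prevents a re-queue here. */
	return 0;
}
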
diff --git a/mm/workingset.c b/mm/workingset.c
index 37a94979900f..07e6836d0502 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -244,12 +244,15 @@ static void *lru_gen_eviction(struct folio *folio)
int refs = folio_lru_refs(folio);
bool workingset = folio_test_workingset(folio);
int tier = lru_tier_from_refs(refs, workingset);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
struct pglist_data *pgdat = folio_pgdat(folio);
+ unsigned short memcg_id;
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
@@ -257,8 +260,10 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+ memcg_id = mem_cgroup_private_id(memcg);
+ rcu_read_unlock();
- return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type);
+ return pack_shadow(memcg_id, pgdat, token, workingset, type);
}
/*
@@ -541,7 +546,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
void workingset_refault(struct folio *folio, void *shadow)
{
bool file = folio_is_file_lru(folio);
- struct pglist_data *pgdat;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
bool workingset;
@@ -564,14 +568,12 @@ void workingset_refault(struct folio *folio, void *shadow)
* locked to guarantee folio_memcg() stability throughout.
*/
nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- pgdat = folio_pgdat(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
+ memcg = get_mem_cgroup_from_folio(folio);
+ lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
if (!workingset_test_recent(shadow, file, &workingset, true))
- return;
+ goto out;
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
@@ -587,6 +589,8 @@ void workingset_refault(struct folio *folio, void *shadow)
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
+out:
+ mem_cgroup_put(memcg);
}
/**
@@ -599,8 +603,11 @@ void workingset_activation(struct folio *folio)
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
*/
- if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) {
+ rcu_read_lock();
workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
+ rcu_read_unlock();
+ }
}
/*
@@ -684,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
mem_cgroup_flush_stats_ratelimited(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
- pages += lruvec_page_state_local(lruvec,
- NR_LRU_BASE + i);
+ pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);
+
pages += lruvec_page_state_local(
lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
pages += lruvec_page_state_local(
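
The workingset hunks above stop assuming folio_memcg() stays stable: the refault path now takes a reference with get_mem_cgroup_from_folio() and drops it with mem_cgroup_put() after the last use. A minimal userspace model of that get/use/put bracket using C11 atomics; the types and names are stand-ins:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct memcg_model {
	atomic_int refcnt;
	long refaults;
};

static struct memcg_model *memcg_get(struct memcg_model *m)
{
	atomic_fetch_add(&m->refcnt, 1);
	return m;
}

static void memcg_put(struct memcg_model *m)
{
	if (atomic_fetch_sub(&m->refcnt, 1) == 1)
		free(m);	/* last reference gone */
}

static void refault(struct memcg_model *folio_memcg)
{
	struct memcg_model *m = memcg_get(folio_memcg);

	m->refaults++;		/* safe: a reference is held for the whole use */
	memcg_put(m);
}

int main(void)
{
	struct memcg_model *m = calloc(1, sizeof(*m));

	atomic_init(&m->refcnt, 1);
	refault(m);
	printf("refaults=%ld refcnt=%d\n", m->refaults, atomic_load(&m->refcnt));
	memcg_put(m);
	return 0;
}
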
diff --git a/mm/zswap.c b/mm/zswap.c
index 0823cadd02b6..4b5149173b0e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -242,6 +242,34 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
+static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx)
+{
+ if (!acomp_ctx)
+ return;
+
+ /*
+ * If there was an error in allocating @acomp_ctx->req, it
+ * would be set to NULL.
+ */
+ if (acomp_ctx->req)
+ acomp_request_free(acomp_ctx->req);
+
+ acomp_ctx->req = NULL;
+
+ /*
+	 * We have to handle two cases here: an error pointer returned by
+	 * crypto_alloc_acomp_node(), and a NULL pointer set either by zswap's
+	 * zero-initialization or by a previous call to acomp_ctx_free().
+ */
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+
+ acomp_ctx->acomp = NULL;
+
+ kfree(acomp_ctx->buffer);
+ acomp_ctx->buffer = NULL;
+}
+
static struct zswap_pool *zswap_pool_create(char *compressor)
{
struct zswap_pool *pool;
@@ -263,19 +291,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
- pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+ /* Many things rely on the zero-initialization. */
+ pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx,
+ GFP_KERNEL | __GFP_ZERO);
if (!pool->acomp_ctx) {
pr_err("percpu alloc failed\n");
goto error;
}
- for_each_possible_cpu(cpu)
- mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
-
+ /*
+ * This is serialized against CPU hotplug operations. Hence, cores
+ * cannot be offlined until this finishes.
+ */
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
+
+ /*
+	 * cpuhp_state_add_instance() will not clean up on failure since
+	 * we don't register a hotunplug callback.
+ */
if (ret)
- goto error;
+ goto cpuhp_add_fail;
/* being the current pool takes 1 ref; this func expects the
* caller to always add the new pool as the current pool
@@ -292,6 +328,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
ref_fail:
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+cpuhp_add_fail:
+ for_each_possible_cpu(cpu)
+ acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu));
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -322,9 +362,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int cpu;
+
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
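+ /* There is no hotunplug callback, so free each CPU's acomp_ctx here. */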
+ for_each_possible_cpu(cpu)
+ acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu));
+
free_percpu(pool->acomp_ctx);
zs_destroy_pool(pool->zs_pool);
@@ -664,8 +710,10 @@ void zswap_folio_swapin(struct folio *folio)
struct lruvec *lruvec;
if (folio) {
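+ /* RCU keeps the lruvec from folio_lruvec() valid across the update. */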
+ rcu_read_lock();
lruvec = folio_lruvec(folio);
atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
+ rcu_read_unlock();
}
}
@@ -736,39 +784,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp = NULL;
- struct acomp_req *req = NULL;
- u8 *buffer = NULL;
- int ret;
+ int ret = -ENOMEM;
- buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
- if (!buffer) {
- ret = -ENOMEM;
- goto fail;
+ /*
+ * To handle cases where the CPU goes through online-offline-online
+ * transitions, we return if the acomp_ctx has already been initialized.
+ */
+ if (acomp_ctx->acomp) {
+ WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp));
+ return 0;
}
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return ret;
+
+ /*
+ * In case of an error, crypto_alloc_acomp_node() returns an
+ * error pointer, never NULL.
+ */
+ acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %pe\n",
- pool->tfm_name, acomp);
- ret = PTR_ERR(acomp);
+ pool->tfm_name, acomp_ctx->acomp);
+ ret = PTR_ERR(acomp_ctx->acomp);
goto fail;
}
- req = acomp_request_alloc(acomp);
- if (!req) {
+ /* acomp_request_alloc() returns NULL in case of an error. */
+ acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!acomp_ctx->req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
- ret = -ENOMEM;
goto fail;
}
- /*
- * Only hold the mutex after completing allocations, otherwise we may
- * recurse into zswap through reclaim and attempt to hold the mutex
- * again resulting in a deadlock.
- */
- mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
/*
@@ -776,80 +826,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
- acomp_ctx->buffer = buffer;
- acomp_ctx->acomp = acomp;
- acomp_ctx->req = req;
- mutex_unlock(&acomp_ctx->mutex);
+ mutex_init(&acomp_ctx->mutex);
return 0;
fail:
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
+ acomp_ctx_free(acomp_ctx);
return ret;
}
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct acomp_req *req;
- struct crypto_acomp *acomp;
- u8 *buffer;
-
- if (IS_ERR_OR_NULL(acomp_ctx))
- return 0;
-
- mutex_lock(&acomp_ctx->mutex);
- req = acomp_ctx->req;
- acomp = acomp_ctx->acomp;
- buffer = acomp_ctx->buffer;
- acomp_ctx->req = NULL;
- acomp_ctx->acomp = NULL;
- acomp_ctx->buffer = NULL;
- mutex_unlock(&acomp_ctx->mutex);
-
- /*
- * Do the actual freeing after releasing the mutex to avoid subtle
- * locking dependencies causing deadlocks.
- */
- if (!IS_ERR_OR_NULL(req))
- acomp_request_free(req);
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
-
- return 0;
-}
-
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
-{
- struct crypto_acomp_ctx *acomp_ctx;
-
- for (;;) {
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
- if (likely(acomp_ctx->req))
- return acomp_ctx;
- /*
- * It is possible that we were migrated to a different CPU after
- * getting the per-CPU ctx but before the mutex was acquired. If
- * the old CPU got offlined, zswap_cpu_comp_dead() could have
- * already freed ctx->req (among other things) and set it to
- * NULL. Just try again on the new CPU that we ended up on.
- */
- mutex_unlock(&acomp_ctx->mutex);
- }
-}
-
-static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
-{
- mutex_unlock(&acomp_ctx->mutex);
-}
-
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -862,7 +849,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
u8 *dst;
bool mapped = false;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
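+ /*
+ * No need to pin the CPU: even if we migrate after raw_cpu_ptr(),
+ * the ctx resources are never freed once allocated (there is no
+ * CPU hotunplug callback), so the per-ctx mutex alone serializes use.
+ */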
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -893,11 +882,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* to the active LRU list in the case.
*/
if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
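+ /* Hold RCU across the folio_memcg() lookup so the memcg stays valid. */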
+ rcu_read_lock();
if (!mem_cgroup_zswap_writeback_enabled(
folio_memcg(page_folio(page)))) {
+ rcu_read_unlock();
comp_ret = comp_ret ? comp_ret : -EINVAL;
goto unlock;
}
+ rcu_read_unlock();
comp_ret = 0;
dlen = PAGE_SIZE;
dst = kmap_local_page(page);
@@ -925,7 +917,7 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -937,7 +929,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
int ret = 0, dlen;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
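+ /* As in zswap_compress(): the ctx is never freed, so the mutex suffices. */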
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length);
/* zswap entries of length PAGE_SIZE are not compressed. */
@@ -962,7 +955,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
}
zs_obj_read_sg_end(pool->zs_pool, entry->handle);
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
if (!ret && dlen == PAGE_SIZE)
return true;
@@ -1782,7 +1775,7 @@ static int zswap_setup(void)
ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
"mm/zswap_pool:prepare",
zswap_cpu_comp_prepare,
- zswap_cpu_comp_dead);
+ NULL);
if (ret)
goto hp_fail;