summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig37
-rw-r--r--mm/cma.c28
-rw-r--r--mm/cma.h5
-rw-r--r--mm/cma_sysfs.c15
-rw-r--r--mm/compaction.c81
-rw-r--r--mm/damon/Kconfig7
-rw-r--r--mm/damon/dbgfs.c26
-rw-r--r--mm/damon/sysfs-common.h2
-rw-r--r--mm/damon/sysfs-schemes.c22
-rw-r--r--mm/damon/sysfs.c21
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/huge_memory.c191
-rw-r--r--mm/hugetlb.c64
-rw-r--r--mm/internal.h9
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/kasan/kasan_test.c82
-rw-r--r--mm/kasan/kasan_test_module.c4
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/khugepaged.c4
-rw-r--r--mm/kmsan/hooks.c36
-rw-r--r--mm/list_lru.c20
-rw-r--r--mm/memcontrol.c97
-rw-r--r--mm/memory-tiers.c26
-rw-r--r--mm/memory.c446
-rw-r--r--mm/memory_hotplug.c34
-rw-r--r--mm/mempolicy.c492
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mmap.c110
-rw-r--r--mm/mmu_gather.c111
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/page_alloc.c39
-rw-r--r--mm/ptdump.c22
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap.c12
-rw-r--r--mm/swap_slots.c3
-rw-r--r--mm/swap_state.c15
-rw-r--r--mm/swapfile.c29
-rw-r--r--mm/userfaultfd.c484
-rw-r--r--mm/vmscan.c165
-rw-r--r--mm/zswap.c1697
42 files changed, 2792 insertions, 1686 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ffc3a2ba3a8c..b924f4a5a3ef 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.
-config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
- bool "Invalidate zswap entries when pages are loaded"
- depends on ZSWAP
- help
- If selected, exclusive loads for zswap will be enabled at boot,
- otherwise it will be disabled.
-
- If exclusive loads are enabled, when a page is loaded from zswap,
- the zswap entry is invalidated at once, as opposed to leaving it
- in zswap until the swap entry is freed.
-
- This avoids having two copies of the same page in memory
- (compressed and uncompressed) after faulting in a page from zswap.
- The cost is that if the page was never dirtied and needs to be
- swapped out again, it will be re-compressed.
-
config ZSWAP_SHRINKER_DEFAULT_ON
bool "Shrink the zswap pool on memory pressure"
depends on ZSWAP
@@ -901,15 +885,6 @@ config CMA
If unsure, say "n".
-config CMA_DEBUG
- bool "CMA debug messages (DEVELOPMENT)"
- depends on DEBUG_KERNEL && CMA
- help
- Turns on debug messages in CMA. This produces KERN_DEBUG
- messages for every CMA call as well as various messages while
- processing calls such as dma_alloc_from_contiguous().
- This option does not affect warning and error messages.
-
config CMA_DEBUGFS
bool "CMA debugfs interface"
depends on CMA && DEBUG_FS
@@ -926,14 +901,14 @@ config CMA_SYSFS
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
- default 19 if NUMA
- default 7
+ default 20 if NUMA
+ default 8
help
CMA allows to create CMA areas for particular purpose, mainly,
used as device private area. This parameter sets the maximum
number of CMA area in the system.
- If unsure, leave the default value "7" in UMA and "19" in NUMA.
+ If unsure, leave the default value "8" in UMA and "20" in NUMA.
config MEM_SOFT_DIRTY
bool "Track memory changes"
@@ -998,6 +973,12 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
+# Architectures which implement cpu_dcache_is_aliasing() to query
+# whether the data caches are aliased (VIVT or VIPT with dcache
+# aliasing) need to select this.
+config ARCH_HAS_CPU_CACHE_ALIASING
+ bool
+
config ARCH_HAS_CACHE_LINE_SIZE
bool
diff --git a/mm/cma.c b/mm/cma.c
index 7c09c47e530b..01f5a8f71ddf 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -14,11 +14,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
#define CREATE_TRACE_POINTS
#include <linux/memblock.h>
@@ -387,7 +382,6 @@ err:
return ret;
}
-#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
unsigned long next_zero_bit, next_set_bit, nr_zero;
@@ -412,9 +406,6 @@ static void cma_debug_show_areas(struct cma *cma)
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
spin_unlock_irq(&cma->lock);
}
-#else
-static inline void cma_debug_show_areas(struct cma *cma) { }
-#endif
/**
* cma_alloc() - allocate pages from contiguous area
@@ -436,17 +427,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
+ const char *name = cma ? cma->name : NULL;
+
+ trace_cma_alloc_start(name, count, align);
if (!cma || !cma->count || !cma->bitmap)
- goto out;
+ return page;
pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
(void *)cma, cma->name, count, align);
if (!count)
- goto out;
-
- trace_cma_alloc_start(cma->name, count, align);
+ return page;
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -454,7 +446,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- goto out;
+ return page;
for (;;) {
spin_lock_irq(&cma->lock);
@@ -496,8 +488,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
-
/*
* CMA can allocate multiple page blocks, which results in different
* blocks being marked with different tags. Reset the tags to ignore
@@ -515,14 +505,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
-out:
+ trace_cma_alloc_finish(name, pfn, page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
} else {
count_vm_event(CMA_ALLOC_FAIL);
- if (cma)
- cma_sysfs_account_fail_pages(cma, count);
+ cma_sysfs_account_fail_pages(cma, count);
}
return page;
@@ -573,6 +562,7 @@ bool cma_release(struct cma *cma, const struct page *pages,
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
+ cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
return true;
diff --git a/mm/cma.h b/mm/cma.h
index 88a0595670b7..ad61cc6dd439 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -27,6 +27,8 @@ struct cma {
atomic64_t nr_pages_succeeded;
/* the number of CMA page allocation failures */
atomic64_t nr_pages_failed;
+ /* the number of CMA page released */
+ atomic64_t nr_pages_released;
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
@@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
#ifdef CONFIG_CMA_SYSFS
void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages);
#else
static inline void cma_sysfs_account_success_pages(struct cma *cma,
unsigned long nr_pages) {};
static inline void cma_sysfs_account_fail_pages(struct cma *cma,
unsigned long nr_pages) {};
+static inline void cma_sysfs_account_release_pages(struct cma *cma,
+ unsigned long nr_pages) {};
#endif
#endif
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index 56347d15b7e8..f50db3973171 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
atomic64_add(nr_pages, &cma->nr_pages_failed);
}
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_released);
+}
+
static inline struct cma *cma_from_kobj(struct kobject *kobj)
{
return container_of(kobj, struct cma_kobject, kobj)->cma;
@@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj,
}
CMA_ATTR_RO(alloc_pages_fail);
+static ssize_t release_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_released));
+}
+CMA_ATTR_RO(release_pages_success);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj)
static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
+ &release_pages_success_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
diff --git a/mm/compaction.c b/mm/compaction.c
index 4add68d40e8d..f146478b01bc 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1365,12 +1365,14 @@ static bool suitable_migration_target(struct compact_control *cc,
{
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
+ int order = cc->order > 0 ? cc->order : pageblock_order;
+
/*
* We are checking page_order without zone->lock taken. But
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (buddy_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= order)
return false;
}
@@ -1796,6 +1798,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
dst = list_entry(cc->freepages.next, struct folio, lru);
list_del(&dst->lru);
cc->nr_freepages--;
+ cc->nr_migratepages--;
return dst;
}
@@ -1811,6 +1814,7 @@ static void compaction_free(struct folio *dst, unsigned long data)
list_add(&dst->lru, &cc->freepages);
cc->nr_freepages++;
+ cc->nr_migratepages++;
}
/* possible outcome of isolate_migratepages */
@@ -2433,7 +2437,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
- unsigned int nr_succeeded = 0;
+ unsigned int nr_succeeded = 0, nr_migratepages;
/*
* These counters track activities during zone compaction. Initialize
@@ -2551,11 +2555,17 @@ rescan:
pageblock_start_pfn(cc->migrate_pfn - 1));
}
+ /*
+ * Record the number of pages to migrate since the
+ * compaction_alloc/free() will update cc->nr_migratepages
+ * properly.
+ */
+ nr_migratepages = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc, nr_succeeded);
+ trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2788,25 +2798,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * Compact all zones within a node till each zone's fragmentation score
- * reaches within proactive compaction thresholds (as determined by the
- * proactiveness tunable).
+ * compact_node() - compact all zones within a node
+ * @pgdat: The node page data
+ * @proactive: Whether the compaction is proactive
*
- * It is possible that the function returns before reaching score targets
- * due to various back-off conditions, such as, contention on per-node or
- * per-zone locks.
+ * For proactive compaction, compact till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable), it is possible that the function returns before
+ * reaching score targets due to various back-off conditions, such as,
+ * contention on per-node or per-zone locks.
*/
-static void proactive_compact_node(pg_data_t *pgdat)
+static void compact_node(pg_data_t *pgdat, bool proactive)
{
int zoneid;
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .mode = MIGRATE_SYNC_LIGHT,
+ .mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
.gfp_mask = GFP_KERNEL,
- .proactive_compaction = true,
+ .proactive_compaction = proactive,
};
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -2818,41 +2830,16 @@ static void proactive_compact_node(pg_data_t *pgdat)
compact_zone(&cc, NULL);
- count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
- cc.total_migrate_scanned);
- count_compact_events(KCOMPACTD_FREE_SCANNED,
- cc.total_free_scanned);
- }
-}
-
-/* Compact all zones within a node */
-static void compact_node(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
- int zoneid;
- struct zone *zone;
- struct compact_control cc = {
- .order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- .whole_zone = true,
- .gfp_mask = GFP_KERNEL,
- };
-
-
- for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-
- zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
- continue;
-
- cc.zone = zone;
-
- compact_zone(&cc, NULL);
+ if (proactive) {
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+ }
}
}
-/* Compact all nodes in the system */
+/* Compact all zones of all nodes in the system */
static void compact_nodes(void)
{
int nid;
@@ -2861,7 +2848,7 @@ static void compact_nodes(void)
lru_add_drain_all();
for_each_online_node(nid)
- compact_node(nid);
+ compact_node(NODE_DATA(nid), false);
}
static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
@@ -2923,7 +2910,7 @@ static ssize_t compact_store(struct device *dev,
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
- compact_node(nid);
+ compact_node(NODE_DATA(nid), false);
}
return count;
@@ -3132,7 +3119,7 @@ static int kcompactd(void *p)
unsigned int prev_score, score;
prev_score = fragmentation_score_node(pgdat);
- proactive_compact_node(pgdat);
+ compact_node(pgdat, true);
score = fragmentation_score_node(pgdat);
/*
* Defer proactive compaction if the fragmentation
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 29f43fbc2eff..fecb8172410c 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS
+config DAMON_DBGFS_DEPRECATED
bool "DAMON debugfs interface (DEPRECATED!)"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
@@ -84,6 +84,11 @@ config DAMON_DBGFS
(DAMON_SYSFS). If you depend on this and cannot move, please report
your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
+config DAMON_DBGFS
+ bool
+ default y
+ depends on DAMON_DBGFS_DEPRECATED
+
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_DBGFS && KUNIT=y
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e3b..2461cfe2e968 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -15,6 +15,11 @@
#include <linux/page_idle.h>
#include <linux/slab.h>
+#define DAMON_DBGFS_DEPRECATION_NOTICE \
+ "DAMON debugfs interface is deprecated, so users should move " \
+ "to DAMON_SYSFS. If you cannot, please report your usecase to " \
+ "damon@lists.linux.dev and linux-mm@kvack.org.\n"
+
static struct damon_ctx **dbgfs_ctxs;
static int dbgfs_nr_ctxs;
static struct dentry **dbgfs_dirs;
@@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock);
static void damon_dbgfs_warn_deprecation(void)
{
- pr_warn_once("DAMON debugfs interface is deprecated, "
- "so users should move to DAMON_SYSFS. If you cannot, "
- "please report your usecase to damon@lists.linux.dev and "
- "linux-mm@kvack.org.\n");
+ pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
}
/*
@@ -805,6 +807,14 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
damon_destroy_ctx(ctx);
}
+static ssize_t damon_dbgfs_deprecated_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
+
+ return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
+}
+
/*
* Make a context of @name and create a debugfs directory for it.
*
@@ -1056,6 +1066,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
return nonseekable_open(inode, file);
}
+static const struct file_operations deprecated_fops = {
+ .read = damon_dbgfs_deprecated_read,
+};
+
static const struct file_operations mk_contexts_fops = {
.open = damon_dbgfs_static_file_open,
.write = dbgfs_mk_context_write,
@@ -1076,9 +1090,9 @@ static int __init __damon_dbgfs_init(void)
{
struct dentry *dbgfs_root;
const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on"};
+ "monitor_on_DEPRECATED", "DEPRECATED"};
const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops};
+ &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
int i;
dbgfs_root = debugfs_create_dir("damon", NULL);
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 4c37a166eb81..ec0703e1e90b 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx, bool total_bytes_only);
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
+
bool damos_sysfs_regions_upd_done(void);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index ae0f0b314f3a..f6c7f43f06cc 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
*
* Once the tried regions update request is received, the request handling
* start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply() and
- * ->after_sampling() callbacks.
+ * of all schemes as 'idle' again, and register ->before_damos_apply()
+ * callback.
*
* Then, the first followup ->before_damos_apply() callback
* (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call
- * is called only after the scheme is completely applied
- * to the given snapshot. Hence the callback knows the situation by showing
- * 'started' status, and sets the status as 'finished'. Then,
- * damon_sysfs_before_damos_apply() understands the situation by showing the
- * 'finished' status and do nothing.
+ * ->after_sampling() or ->after_aggregation() callback
+ * (damon_sysfs_cmd_request_callback()) after the call is called only after
+ * the scheme is completely applied to the given snapshot. Hence the callback
+ * knows the situation by showing 'started' status, and sets the status as
+ * 'finished'. Then, damon_sysfs_before_damos_apply() understands the
+ * situation by showing the 'finished' status and do nothing.
*
* If DAMOS is not applied to any region due to any reasons including the
* access pattern, the watermarks, the quotas, and the filters,
@@ -2122,7 +2122,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
* callback is registered, damon_sysfs_lock should be held to ensure the
* regions directories exist.
*/
-static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
{
struct damon_sysfs_schemes *sysfs_schemes =
damon_sysfs_schemes_for_damos_callback;
@@ -2138,8 +2138,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
sysfs_regions->upd_status =
DAMOS_TRIED_REGIONS_UPD_FINISHED;
}
-
- return 0;
}
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
@@ -2212,7 +2210,6 @@ int damon_sysfs_schemes_update_regions_start(
damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
return 0;
}
@@ -2241,7 +2238,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
{
damon_sysfs_schemes_for_damos_callback = NULL;
ctx->callback.before_damos_apply = NULL;
- ctx->callback.after_sampling = NULL;
damon_sysfs_schemes_region_idx = 0;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1f891e18b4ee..678de97fcc88 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1379,11 +1379,13 @@ static int damon_sysfs_commit_schemes_quota_goals(
* damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
* @c: The DAMON context of the callback.
* @active: Whether @c is not deactivated due to watermarks.
+ * @after_aggr: Whether this is called from after_aggregation() callback.
*
* This function is periodically called back from the kdamond thread for @c.
* Then, it checks if there is a waiting DAMON sysfs request and handles it.
*/
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
+ bool after_aggregation)
{
struct damon_sysfs_kdamond *kdamond;
bool total_bytes_only = false;
@@ -1401,6 +1403,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
err = damon_sysfs_upd_schemes_stats(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT:
+ if (!after_aggregation)
+ goto out;
err = damon_sysfs_commit_input(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
@@ -1418,6 +1422,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
goto keep_lock_out;
}
} else {
+ damos_sysfs_mark_finished_regions_updates(c);
/*
* Continue regions updating if DAMON is till
* active and the update for all schemes is not
@@ -1450,7 +1455,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
* after_wmarks_check() is called back while the context is deactivated
* by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, false);
+ return damon_sysfs_cmd_request_callback(c, false, false);
+}
+
+static int damon_sysfs_after_sampling(struct damon_ctx *c)
+{
+ /*
+ * after_sampling() is called back only while the context is not
+ * deactivated by watermarks.
+ */
+ return damon_sysfs_cmd_request_callback(c, true, false);
}
static int damon_sysfs_after_aggregation(struct damon_ctx *c)
@@ -1459,7 +1473,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c)
* after_aggregation() is called back only while the context is not
* deactivated by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, true);
+ return damon_sysfs_cmd_request_callback(c, true, true);
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1478,6 +1492,7 @@ static struct damon_ctx *damon_sysfs_build_ctx(
}
ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
+ ctx->callback.after_sampling = damon_sysfs_after_sampling;
ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a30de98a8c7..b7a21551fbc7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- int huge = folio_test_hugetlb(folio);
+ bool huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
@@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94c958f7ebb5..28341a5067fb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
- struct page *page = NULL;
+ struct folio *folio = NULL;
int flush_needed = 1;
if (pmd_present(orig_pmd)) {
- page = pmd_page(orig_pmd);
- folio_remove_rmap_pmd(page_folio(page), page, vma);
+ struct page *page = pmd_page(orig_pmd);
+
+ folio = page_folio(page);
+ folio_remove_rmap_pmd(folio, page, vma);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(folio),
+ -HPAGE_PMD_NR);
}
spin_unlock(ptl);
if (flush_needed)
- tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -2045,7 +2048,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -2155,7 +2158,7 @@ unlock:
#ifdef CONFIG_USERFAULTFD
/*
- * The PT lock for src_pmd and the mmap_lock for reading are held by
+ * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
* the caller, but it must return after releasing the page_table_lock.
* Just move the page from src_pmd to dst_pmd if possible.
* Return zero if succeeded in moving the page, -EAGAIN if it needs to be
@@ -2178,7 +2181,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_ptl = pmd_lockptr(mm, src_pmd);
lockdep_assert_held(src_ptl);
- mmap_assert_locked(mm);
+ vma_assert_locked(src_vma);
+ vma_assert_locked(dst_vma);
/* Sanity checks before the operation */
if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
@@ -2197,13 +2201,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
}
src_page = pmd_page(src_pmdval);
- if (unlikely(!PageAnonExclusive(src_page))) {
- spin_unlock(src_ptl);
- return -EBUSY;
- }
- src_folio = page_folio(src_page);
- folio_get(src_folio);
+ if (!is_huge_zero_pmd(src_pmdval)) {
+ if (unlikely(!PageAnonExclusive(src_page))) {
+ spin_unlock(src_ptl);
+ return -EBUSY;
+ }
+
+ src_folio = page_folio(src_page);
+ folio_get(src_folio);
+ } else
+ src_folio = NULL;
+
spin_unlock(src_ptl);
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
@@ -2211,19 +2220,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- folio_lock(src_folio);
+ if (src_folio) {
+ folio_lock(src_folio);
- /*
- * split_huge_page walks the anon_vma chain without the page
- * lock. Serialize against it with the anon_vma lock, the page
- * lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- err = -EAGAIN;
- goto unlock_folio;
- }
- anon_vma_lock_write(src_anon_vma);
+ /*
+ * split_huge_page walks the anon_vma chain without the page
+ * lock. Serialize against it with the anon_vma lock, the page
+ * lock is not enough.
+ */
+ src_anon_vma = folio_get_anon_vma(src_folio);
+ if (!src_anon_vma) {
+ err = -EAGAIN;
+ goto unlock_folio;
+ }
+ anon_vma_lock_write(src_anon_vma);
+ } else
+ src_anon_vma = NULL;
dst_ptl = pmd_lockptr(mm, dst_pmd);
double_pt_lock(src_ptl, dst_ptl);
@@ -2232,45 +2244,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
err = -EAGAIN;
goto unlock_ptls;
}
- if (folio_maybe_dma_pinned(src_folio) ||
- !PageAnonExclusive(&src_folio->page)) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (src_folio) {
+ if (folio_maybe_dma_pinned(src_folio) ||
+ !PageAnonExclusive(&src_folio->page)) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
- WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
+ WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+ folio_move_anon_rmap(src_folio, dst_vma);
+ WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
- src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
- err = -EBUSY;
- goto unlock_ptls;
- }
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ /* Follow mremap() behavior and treat the entry dirty after the move */
+ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ } else {
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ }
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
double_pt_unlock(src_ptl, dst_ptl);
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
+ if (src_anon_vma) {
+ anon_vma_unlock_write(src_anon_vma);
+ put_anon_vma(src_anon_vma);
+ }
unlock_folio:
/* unblock rmap walks */
- folio_unlock(src_folio);
+ if (src_folio)
+ folio_unlock(src_folio);
mmu_notifier_invalidate_range_end(&range);
- folio_put(src_folio);
+ if (src_folio)
+ folio_put(src_folio);
return err;
}
#endif /* CONFIG_USERFAULTFD */
@@ -2442,7 +2463,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -2453,7 +2474,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
folio_remove_rmap_pmd(folio, page, vma);
folio_put(folio);
}
- add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
return;
}
@@ -2559,15 +2580,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte);
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- if (freeze || pmd_migration) {
+
+ /*
+ * Note that NUMA hinting access restrictions are not transferred to
+ * avoid any possibility of altering permissions across VMAs.
+ */
+ if (freeze || pmd_migration) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
swp_entry_t swp_entry;
+
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
@@ -2586,25 +2608,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
- } else {
- entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- if (write)
- entry = pte_mkwrite(entry, vma);
- if (!young)
- entry = pte_mkold(entry);
- /* NOTE: this may set soft-dirty too on some archs */
- if (dirty)
- entry = pte_mkdirty(entry);
- if (soft_dirty)
- entry = pte_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_mkuffd_wp(entry);
+
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
}
- VM_BUG_ON(!pte_none(ptep_get(pte)));
- set_pte_at(mm, addr, pte, entry);
- pte++;
+ } else {
+ pte_t entry;
+
+ entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
+ if (write)
+ entry = pte_mkwrite(entry, vma);
+ if (!young)
+ entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
+ if (soft_dirty)
+ entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+
+ set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
}
- pte_unmap(pte - 1);
+ pte_unmap(pte);
if (!pmd_migration)
folio_remove_rmap_pmd(folio, page, vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ed1581b670d4..c53a41d07cd3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3029,21 +3029,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = folio_nid(old_folio);
- struct folio *new_folio;
+ struct folio *new_folio = NULL;
int ret = 0;
- /*
- * Before dissolving the folio, we need to allocate a new one for the
- * pool to remain stable. Here, we allocate the folio and 'prep' it
- * by doing everything but actually updating counters and adding to
- * the pool. This simplifies and let us do most of the processing
- * under the lock.
- */
- new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
- if (!new_folio)
- return -ENOMEM;
- __prep_new_hugetlb_folio(h, new_folio);
-
retry:
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
@@ -3073,6 +3061,16 @@ retry:
cond_resched();
goto retry;
} else {
+ if (!new_folio) {
+ spin_unlock_irq(&hugetlb_lock);
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
+ NULL, NULL);
+ if (!new_folio)
+ return -ENOMEM;
+ __prep_new_hugetlb_folio(h, new_folio);
+ goto retry;
+ }
+
/*
* Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
@@ -3100,9 +3098,11 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
- /* Folio has a zero ref count, but needs a ref to be freed */
- folio_ref_unfreeze(new_folio, 1);
- update_and_free_hugetlb_folio(h, new_folio, false);
+ if (new_folio) {
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
+ }
return ret;
}
@@ -5585,6 +5585,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ bool adjust_reservation = false;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5677,7 +5678,31 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(page_folio(page));
+ /*
+ * Restore the reservation for anonymous page, otherwise the
+ * backing page could be stolen by someone.
+ * If there we are freeing a surplus, do not set the restore
+ * reservation bit.
+ */
+ if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
+ folio_test_anon(page_folio(page))) {
+ folio_set_hugetlb_restore_reserve(page_folio(page));
+ /* Reservation to be adjusted after the spin lock */
+ adjust_reservation = true;
+ }
+
spin_unlock(ptl);
+
+ /*
+ * Adjust the reservation for the region that will have the
+ * reserve restored. Keep in mind that vma_needs_reservation() changes
+ * resv->adds_in_progress if it succeeds. If this is not done,
+ * do_exit() will not see it, and will keep the reservation
+ * forever.
+ */
+ if (adjust_reservation && vma_needs_reservation(h, vma, address))
+ vma_add_reservation(h, vma, address);
+
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
* Bail out after unmapping reference page if supplied
@@ -7695,6 +7720,13 @@ void __init hugetlb_cma_reserve(int order)
bool node_specific_cma_alloc = false;
int nid;
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
cma_reserve_called = true;
if (!hugetlb_cma_size)
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..1e29c5821a1d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
extern bool mirrored_kernelcore;
extern bool memblock_has_mirror(void);
+static __always_inline void vma_set_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff)
+{
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+}
+
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
/*
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6ca63e8dda74..e7c9a4dc89f8 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack)
u64 ts_nsec = local_clock();
track->cpu = cpu;
- track->timestamp = ts_nsec >> 3;
+ track->timestamp = ts_nsec >> 9;
#endif /* CONFIG_KASAN_EXTRA_INFO */
track->pid = current->pid;
track->stack = stack;
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 971cfff4ca0b..2d8ae4fbe63b 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit *test)
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
- ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ /* RELOC_HIDE to prevent gcc from warning about short alloc */
+ ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
@@ -697,6 +698,84 @@ static void kmalloc_uaf3(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
}
+static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
+{
+ int *i_unsafe = (int *)unsafe;
+
+ KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+}
+
+static void kasan_atomics(struct kunit *test)
+{
+ void *a1, *a2;
+
+ /*
+ * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such
+ * that the following 16 bytes will make up the redzone.
+ */
+ a1 = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1);
+ a2 = kzalloc(sizeof(int), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1);
+
+ /* Use atomics to access the redzone. */
+ kasan_atomics_helper(test, a1 + 48, a2);
+
+ kfree(a1);
+ kfree(a2);
+}
+
static void kmalloc_double_kzfree(struct kunit *test)
{
char *ptr;
@@ -1883,6 +1962,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_strings),
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kasan_atomics),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index 8b7b3ea2c74e..27ec22767e42 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static int __init test_kasan_module_init(void)
+static int __init kasan_test_module_init(void)
{
/*
* Temporarily enable multi-shot mode. Otherwise, KASAN would only
@@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void)
return -EAGAIN;
}
-module_init(test_kasan_module_init);
+module_init(kasan_test_module_init);
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 7afa4feb03e1..b48c768acc84 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
u64 ts_nsec = track->timestamp;
unsigned long rem_usec;
- ts_nsec <<= 3;
+ ts_nsec <<= 9;
rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000;
pr_err("%s by task %u on cpu %d at %lu.%06lus:\n",
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb528e..fe43fbc44525 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
/* step 4: remove empty page table */
@@ -1665,7 +1665,7 @@ abort:
if (nr_ptes) {
flush_tlb_mm(mm);
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 5d6e2dee5692..0b09daa188ef 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
}
/* Functions from kmsan-checks.h follow. */
+
+/*
+ * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it
+ * into the stack depot. This may cause deadlocks if done from within KMSAN
+ * runtime, therefore we bail out if kmsan_in_runtime().
+ */
void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
{
if (!kmsan_enabled || kmsan_in_runtime())
@@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
}
EXPORT_SYMBOL(kmsan_poison_memory);
+/*
+ * Unlike kmsan_poison_memory(), this function can be used from within KMSAN
+ * runtime, because it does not trigger allocations or call instrumented code.
+ */
void kmsan_unpoison_memory(const void *address, size_t size)
{
unsigned long ua_flags;
- if (!kmsan_enabled || kmsan_in_runtime())
+ if (!kmsan_enabled)
return;
ua_flags = user_access_save();
- kmsan_enter_runtime();
/* The users may want to poison/unpoison random memory. */
kmsan_internal_unpoison_memory((void *)address, size,
KMSAN_POISON_NOCHECK);
- kmsan_leave_runtime();
user_access_restore(ua_flags);
}
EXPORT_SYMBOL(kmsan_unpoison_memory);
/*
- * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
- * runtime.
- *
- * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
- * code. Those regs need to be unpoisoned, otherwise using them will result in
- * false positives.
- * Using kmsan_unpoison_memory() is not an option in entry code, because the
- * return value of in_task() is inconsistent - as a result, certain calls to
- * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
- * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
- * entry code.
+ * Version of kmsan_unpoison_memory() called from IRQ entry functions.
*/
void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
{
- unsigned long ua_flags;
-
- if (!kmsan_enabled)
- return;
-
- ua_flags = user_access_save();
- kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
- KMSAN_POISON_NOCHECK);
- user_access_restore(ua_flags);
+ kmsan_unpoison_memory((void *)regs, sizeof(*regs));
}
void kmsan_check_memory(const void *addr, size_t size)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 35b0147542a9..3fd64736bc45 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
-{
- struct list_lru_one *list =
- list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
-
- if (list_empty(item)) {
- list_add_tail(item, &list->list);
- if (!list->nr_items++)
- set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- }
-}
-EXPORT_SYMBOL_GPL(list_lru_putback);
-
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
@@ -257,6 +243,9 @@ restart:
*/
assert_spin_locked(&nlru->lock);
goto restart;
+ case LRU_STOP:
+ assert_spin_locked(&nlru->lock);
+ goto out;
default:
BUG();
}
@@ -567,6 +556,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
lru->shrinker_id = shrinker->id;
else
lru->shrinker_id = -1;
+
+ if (mem_cgroup_kmem_disabled())
+ memcg_aware = false;
#endif
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61932c9215e7..cb216d30a221 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4800,7 +4800,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- mem_cgroup_flush_stats(memcg);
+ mem_cgroup_flush_stats_ratelimited(memcg);
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5621,7 +5621,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
- if (unlikely(mem_cgroup_is_root(memcg)))
+ if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
FLUSH_TIME);
lru_gen_online_memcg(memcg);
@@ -5873,7 +5873,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
}
union mc_target {
- struct page *page;
+ struct folio *folio;
swp_entry_t ent;
};
@@ -5965,23 +5965,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
}
/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
+ * mem_cgroup_move_account - move account of the folio
+ * @folio: The folio.
* @compound: charge the page as compound or small page
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
+ * @from: mem_cgroup which the folio is moved from.
+ * @to: mem_cgroup which the folio is moved to. @from != @to.
*
- * The page must be locked and not on the LRU.
+ * The folio must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
-static int mem_cgroup_move_account(struct page *page,
+static int mem_cgroup_move_account(struct folio *folio,
bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
- struct folio *folio = page_folio(page);
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
@@ -6096,7 +6095,7 @@ out:
* Return:
* * MC_TARGET_NONE - If the pte is not a target for move charge.
* * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- * move charge. If @target is not NULL, the page is stored in target->page
+ * move charge. If @target is not NULL, the folio is stored in target->folio
* with extra refcnt taken (Caller should release it).
* * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
* target for charge migration. If @target is not NULL, the entry is
@@ -6110,6 +6109,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
@@ -6124,9 +6124,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (page)
+ folio = page_folio(page);
if (target && page) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return ret;
}
/*
@@ -6141,8 +6143,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* Alas, skip moving the page in this case.
*/
if (!pte_present(ptent) && page_mapped(page)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
}
@@ -6155,18 +6157,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_coherent_page(page))
+ if (folio_is_device_private(folio) ||
+ folio_is_device_coherent(folio))
ret = MC_TARGET_DEVICE;
if (target)
- target->page = page;
+ target->folio = folio;
}
if (!ret || !target) {
if (target)
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
/*
@@ -6192,6 +6194,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
if (unlikely(is_swap_pmd(pmd))) {
@@ -6201,17 +6204,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
}
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+ folio = page_folio(page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return MC_TARGET_NONE;
}
- target->page = page;
+ target->folio = folio;
}
}
return ret;
@@ -6431,7 +6435,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
enum mc_target_type target_type;
union mc_target target;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -6441,26 +6445,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
- page = target.page;
- if (isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (folio_isolate_lru(folio)) {
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- putback_lru_page(page);
+ folio_putback_lru(folio);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
} else if (target_type == MC_TARGET_DEVICE) {
- page = target.page;
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
spin_unlock(ptl);
return 0;
@@ -6483,28 +6487,28 @@ retry:
device = true;
fallthrough;
case MC_TARGET_PAGE:
- page = target.page;
+ folio = target.folio;
/*
* We can have a part of the split pmd here. Moving it
* can be done but it would be too convoluted so simply
* ignore such a partial THP and keep it in original
* memcg. There should be somebody mapping the head.
*/
- if (PageTransCompound(page))
+ if (folio_test_large(folio))
goto put;
- if (!device && !isolate_lru_page(page))
+ if (!device && !folio_isolate_lru(folio))
goto put;
- if (!mem_cgroup_move_account(page, false,
+ if (!mem_cgroup_move_account(folio, false,
mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
if (!device)
- putback_lru_page(page);
+ folio_putback_lru(folio);
put: /* get_mctgt_type() gets & locks the page */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
case MC_TARGET_SWAP:
ent = target.ent;
@@ -6977,6 +6981,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
unsigned long reclaimed;
if (signal_pending(current))
@@ -6991,8 +6997,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
- GFP_KERNEL, reclaim_options);
+ batch_size, GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 5462d9e3c84c..0537664620e5 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -39,7 +39,7 @@ static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
-static struct bus_type memory_tier_subsys = {
+static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
.dev_name = "memory_tier",
};
@@ -359,6 +359,26 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
+static void dump_demotion_targets(void)
+{
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ struct memory_tier *memtier = __node_get_memory_tier(node);
+ nodemask_t preferred = node_demotion[node].preferred;
+
+ if (!memtier)
+ continue;
+
+ if (nodes_empty(preferred))
+ pr_info("Demotion targets for Node %d: null\n", node);
+ else
+ pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
+ node, nodemask_pr_args(&preferred),
+ nodemask_pr_args(&memtier->lower_tier_mask));
+ }
+}
+
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -443,7 +463,7 @@ static void establish_demotion_targets(void)
* Now build the lower_tier mask for each node collecting node mask from
* all memory tier below it. This allows us to fallback demotion page
* allocation to a set of nodes that is closer the above selected
- * perferred node.
+ * preferred node.
*/
lower_tier = node_states[N_MEMORY];
list_for_each_entry(memtier, &memory_tiers, list) {
@@ -456,6 +476,8 @@ static void establish_demotion_targets(void)
nodes_andnot(lower_tier, lower_tier, tier_nodes);
memtier->lower_tier_mask = lower_tier;
}
+
+ dump_demotion_targets();
}
#else
diff --git a/mm/memory.c b/mm/memory.c
index 0bfc8b007c01..642b4f2be523 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* keep things as they are.
*/
folio_get(folio);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
folio_try_dup_anon_rmap_pte(folio, page, src_vma);
@@ -930,68 +930,187 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
return 0;
}
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+ pte_t pte, unsigned long addr, int nr)
+{
+ struct mm_struct *src_mm = src_vma->vm_mm;
+
+ /* If it's a COW mapping, write protect it both processes. */
+ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+ wrprotect_ptes(src_mm, addr, src_pte, nr);
+ pte = pte_wrprotect(pte);
+ }
+
+ /* If it's a shared mapping, mark it clean in the child. */
+ if (src_vma->vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_clear_uffd_wp(pte);
+
+ set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
+/* Flags for folio_pte_batch(). */
+typedef int __bitwise fpb_t;
+
+/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
+#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+
+/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
+#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+
+static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
+{
+ if (flags & FPB_IGNORE_DIRTY)
+ pte = pte_mkclean(pte);
+ if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ pte = pte_clear_soft_dirty(pte);
+ return pte_wrprotect(pte_mkold(pte));
+}
+
+/*
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same folio.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
+ * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ *
+ * If "any_writable" is set, it will indicate if any other PTE besides the
+ * first (given) PTE is writable.
+ */
+static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+ pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+ bool *any_writable)
+{
+ unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t expected_pte, *ptep;
+ bool writable;
+ int nr;
+
+ if (any_writable)
+ *any_writable = false;
+
+ VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+
+ nr = pte_batch_hint(start_ptep, pte);
+ expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
+ ptep = start_ptep + nr;
+
+ while (ptep < end_ptep) {
+ pte = ptep_get(ptep);
+ if (any_writable)
+ writable = !!pte_write(pte);
+ pte = __pte_batch_clear_ignored(pte, flags);
+
+ if (!pte_same(pte, expected_pte))
+ break;
+
+ /*
+ * Stop immediately once we reached the end of the folio. In
+ * corner cases the next PFN might fall into a different
+ * folio.
+ */
+ if (pte_pfn(pte) >= folio_end_pfn)
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+
+ nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, nr);
+ ptep += nr;
+ }
+
+ return min(ptep - start_ptep, max_nr);
+}
+
/*
- * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
*/
static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
- pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+ int max_nr, int *rss, struct folio **prealloc)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
- unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
+ bool any_writable;
+ fpb_t flags = 0;
+ int err, nr;
page = vm_normal_page(src_vma, addr, pte);
- if (page)
- folio = page_folio(page);
- if (page && folio_test_anon(folio)) {
+ if (unlikely(!page))
+ goto copy_pte;
+
+ folio = page_folio(page);
+
+ /*
+ * If we likely have to copy, just don't bother with batching. Make
+ * sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+ if (src_vma->vm_flags & VM_SHARED)
+ flags |= FPB_IGNORE_DIRTY;
+ if (!vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_IGNORE_SOFT_DIRTY;
+
+ nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+ &any_writable);
+ folio_ref_add(folio, nr);
+ if (folio_test_anon(folio)) {
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+ nr, src_vma))) {
+ folio_ref_sub(folio, nr);
+ return -EAGAIN;
+ }
+ rss[MM_ANONPAGES] += nr;
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
+ folio_dup_file_rmap_ptes(folio, page, nr);
+ rss[mm_counter_file(folio)] += nr;
+ }
+ if (any_writable)
+ pte = pte_mkwrite(pte, src_vma);
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+ addr, nr);
+ return nr;
+ }
+
+ folio_get(folio);
+ if (folio_test_anon(folio)) {
/*
* If this page may have been pinned by the parent process,
* copy the page immediately for the child so that we'll always
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- folio_get(folio);
if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
- return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, page);
+ err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ return err ? err : 1;
}
rss[MM_ANONPAGES]++;
- } else if (page) {
- folio_get(folio);
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
folio_dup_file_rmap_pte(folio, page);
- rss[mm_counter_file(page)]++;
+ rss[mm_counter_file(folio)]++;
}
- /*
- * If it's a COW mapping, write protect it both
- * in the parent and the child
- */
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
- ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = pte_wrprotect(pte);
- }
- VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
-
- /*
- * If it's a shared mapping, mark it clean in
- * the child
- */
- if (vm_flags & VM_SHARED)
- pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
-
- if (!userfaultfd_wp(dst_vma))
- pte = pte_clear_uffd_wp(pte);
-
- set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
- return 0;
+copy_pte:
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+ return 1;
}
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1028,10 +1147,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *src_pte, *dst_pte;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
- int progress, ret = 0;
+ int progress, max_nr, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct folio *prealloc = NULL;
+ int nr;
again:
progress = 0;
@@ -1062,6 +1182,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
+ nr = 1;
+
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
@@ -1091,6 +1213,8 @@ again:
progress += 8;
continue;
}
+ ptent = ptep_get(src_pte);
+ VM_WARN_ON_ONCE(!pte_present(ptent));
/*
* Device exclusive entry restored, continue by copying
@@ -1098,9 +1222,10 @@ again:
*/
WARN_ON_ONCE(ret != -ENOENT);
}
- /* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, &prealloc);
+ /* copy_present_ptes() will clear `*prealloc' if consumed */
+ max_nr = (end - addr) / PAGE_SIZE;
+ ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+ ptent, addr, max_nr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1117,8 +1242,10 @@ again:
folio_put(prealloc);
prealloc = NULL;
}
- progress += 8;
- } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ nr = ret;
+ progress += 8 * nr;
+ } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+ addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1266,7 @@ again:
prealloc = folio_prealloc(src_mm, src_vma, addr, false);
if (!prealloc)
return -ENOMEM;
- } else if (ret) {
+ } else if (ret < 0) {
VM_WARN_ON_ONCE(1);
}
@@ -1369,19 +1496,16 @@ static inline bool should_zap_cows(struct zap_details *details)
return details->even_cows;
}
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+ struct folio *folio)
{
- /* If we can make a decision without *page.. */
+ /* If we can make a decision without *folio.. */
if (should_zap_cows(details))
return true;
- /* E.g. the caller passes NULL for the case of a zero page */
- if (!page)
- return true;
-
- /* Otherwise we should only zap non-anon pages */
- return !PageAnon(page);
+ /* Otherwise we should only zap non-anon folios */
+ return !folio_test_anon(folio);
}
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1522,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
*/
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte,
+ unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
/* Zap on anonymous always means dropping everything */
@@ -1408,7 +1532,111 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (zap_drop_file_uffd_wp(details))
return;
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ for (;;) {
+ /* the PFN in the PTE is irrelevant. */
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (--nr == 0)
+ break;
+ pte++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct folio *folio,
+ struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+ unsigned long addr, struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break)
+{
+ struct mm_struct *mm = tlb->mm;
+ bool delay_rmap = false;
+
+ if (!folio_test_anon(folio)) {
+ ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ if (pte_dirty(ptent)) {
+ folio_mark_dirty(folio);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = true;
+ *force_flush = true;
+ }
+ }
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
+ rss[mm_counter(folio)] -= nr;
+ } else {
+ /* We don't need up-to-date accessed/dirty bits. */
+ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ rss[MM_ANONPAGES] -= nr;
+ }
+ /* Checking a single PTE in a batch is sufficient. */
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
+ if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+ ptent);
+
+ if (!delay_rmap) {
+ folio_remove_rmap_ptes(folio, page, nr, vma);
+
+ /* Only sanity-check the first page in a batch. */
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+ *force_flush = true;
+ *force_break = true;
+ }
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *force_flush,
+ bool *force_break)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ struct mm_struct *mm = tlb->mm;
+ struct folio *folio;
+ struct page *page;
+ int nr;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page) {
+ /* We don't need up-to-date accessed/dirty bits. */
+ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+ ksm_might_unmap_zero_page(mm, ptent);
+ return 1;
+ }
+
+ folio = page_folio(page);
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+
+ /*
+ * Make sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+ NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+ force_break);
+ return nr;
+ }
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+ details, rss, force_flush, force_break);
+ return 1;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1644,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ int nr;
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
@@ -1436,7 +1665,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t ptent = ptep_get(pte);
struct folio *folio;
struct page *page;
+ int max_nr;
+ nr = 1;
if (pte_none(ptent))
continue;
@@ -1444,44 +1675,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- unsigned int delay_rmap;
-
- page = vm_normal_page(vma, addr, ptent);
- if (unlikely(!should_zap_page(details, page)))
- continue;
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- arch_check_zapped_pte(vma, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
- if (unlikely(!page)) {
- ksm_might_unmap_zero_page(mm, ptent);
- continue;
- }
-
- folio = page_folio(page);
- delay_rmap = 0;
- if (!folio_test_anon(folio)) {
- if (pte_dirty(ptent)) {
- folio_mark_dirty(folio);
- if (tlb_delay_rmap(tlb)) {
- delay_rmap = 1;
- force_flush = 1;
- }
- }
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
- folio_mark_accessed(folio);
- }
- rss[mm_counter(page)]--;
- if (!delay_rmap) {
- folio_remove_rmap_pte(folio, page, vma);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- }
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
- force_flush = 1;
- addr += PAGE_SIZE;
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+ addr, details, rss, &force_flush,
+ &force_break);
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
break;
}
continue;
@@ -1492,7 +1691,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
is_device_exclusive_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
folio = page_folio(page);
- if (unlikely(!should_zap_page(details, page)))
+ if (unlikely(!should_zap_folio(details, folio)))
continue;
/*
* Both device private/exclusive mappings should only
@@ -1501,7 +1700,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* see zap_install_uffd_wp_if_needed().
*/
WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
if (is_device_private_entry(entry))
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
@@ -1513,10 +1712,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- if (!should_zap_page(details, page))
+ folio = pfn_swap_entry_folio(entry);
+ if (!should_zap_folio(details, folio))
continue;
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
} else if (pte_marker_entry_uffd_wp(entry)) {
/*
* For anon: always drop the marker; for file: only
@@ -1535,8 +1734,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1870,7 +2069,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
folio_get(folio);
- inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -3175,7 +3374,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
@@ -4170,8 +4369,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long orders;
struct folio *folio;
unsigned long addr;
@@ -4223,15 +4422,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr, true);
if (folio) {
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
+ goto next;
+ }
+ folio_throttle_swaprate(folio, gfp);
clear_huge_page(&folio->page, vmf->address, 1 << order);
return folio;
}
+next:
order = next_order(&orders, order);
}
fallback:
#endif
- return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+ return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}
/*
@@ -4298,10 +4503,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
nr_pages = folio_nr_pages(folio);
addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
- if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
- goto oom_free_page;
- folio_throttle_swaprate(folio, GFP_KERNEL);
-
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
@@ -4355,8 +4556,6 @@ unlock:
release:
folio_put(folio);
goto unlock;
-oom_free_page:
- folio_put(folio);
oom:
return VM_FAULT_OOM;
}
@@ -4480,7 +4679,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
folio_add_file_rmap_pmd(folio, page, vma);
/*
@@ -4543,7 +4742,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
} else {
- add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
folio_add_file_rmap_ptes(folio, page, nr, vma);
}
set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -6163,7 +6362,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
- clear_user_highpage(page + idx, addr);
+ clear_user_highpage(nth_page(page, idx), addr);
return 0;
}
@@ -6213,10 +6412,11 @@ struct copy_subpage_arg {
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
+ struct page *dst = nth_page(copy_arg->dst, idx);
+ struct page *src = nth_page(copy_arg->src, idx);
- if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
- addr, copy_arg->vma)) {
- memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+ if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
return -EHWPOISON;
}
return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21890994c1d3..a444e2d7dd2b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone)
+ struct zone *zone, bool mhp_off_inaccessible)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
+ /*
+ * Memory block is accessible at this stage and hence poison the struct
+ * pages now. If the memory block is accessible during memory hotplug
+ * addition phase, then page poisining is already performed in
+ * sparse_add_section().
+ */
+ if (mhp_off_inaccessible)
+ page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
for (i = 0; i < nr_pages; i++)
@@ -1328,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
}
#endif
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
{
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1337,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* Besides having arch support and the feature enabled at runtime, we
* need a few more assumptions to hold true:
*
- * a) We span a single memory block: memory onlining/offlinin;g happens
- * in memory block granularity. We don't want the vmemmap of online
- * memory blocks to reside on offline memory blocks. In the future,
- * we might want to support variable-sized memory blocks to make the
- * feature more versatile.
- *
- * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
* to populate memory from the altmap for unrelated parts (i.e.,
* other memory blocks)
*
- * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * b) The vmemmap pages (and thereby the pages that will be exposed to
* the buddy) have to cover full pageblocks: memory onlining/offlining
* code requires applicable ranges to be page-aligned, for example, to
* set the migratetypes properly.
@@ -1359,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ if (!mhp_memmap_on_memory())
return false;
/*
@@ -1382,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
return arch_supports_memmap_on_memory(vmemmap_size);
}
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
@@ -1415,7 +1419,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size)
+ u64 start, u64 size, mhp_t mhp_flags)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1431,6 +1435,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
+ mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1515,8 +1521,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
- mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+ mhp_supports_memmap_on_memory()) {
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
if (ret)
goto error;
} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..56f9a6ed939a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
* for anonymous memory. For process policy an process counter
* is used.
*
+ * weighted interleave
+ * Allocate memory interleaved over a set of nodes based on
+ * a set of weights (per-node), with normal fallback if it
+ * fails. Otherwise operates the same as interleave.
+ * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ * on node 0 for every 1 page allocated on node 1.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
@@ -131,6 +138,32 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
+ * system-default value should be used. A NULL iw_table also denotes that
+ * system-default values should be used. Until the system-default table
+ * is implemented, the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
+static u8 get_il_weight(int node)
+{
+ u8 *table;
+ u8 weight;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* if no iw_table, use system default */
+ weight = table ? table[node] : 1;
+ /* if value in iw_table is 0, use system default */
+ weight = weight ? weight : 1;
+ rcu_read_unlock();
+ return weight;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -415,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
};
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -654,7 +691,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
{
struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
- unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
@@ -682,9 +718,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
!(flags & MPOL_MF_STRICT))
return 1;
- if (endvma > end)
- endvma = end;
-
/*
* Check page nodes, and queue pages to move, in the current vma.
* But if no moving, and no strict checking, the scan can be skipped.
@@ -836,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_weight = 0;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -862,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
*nodes = pol->nodes;
break;
case MPOL_LOCAL:
@@ -946,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ if (current->il_weight)
+ *policy = current->il_prev;
+ else
+ *policy = next_node_in(current->il_prev,
+ pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1310,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len,
* VMAs, the nodes will still be interleaved from the targeted
* nodemask, but one by one may be selected differently.
*/
- if (new->mode == MPOL_INTERLEAVE) {
+ if (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct page *page;
unsigned int order;
unsigned long addr = -EFAULT;
@@ -1758,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
* @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ * MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
@@ -1775,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
@@ -1825,12 +1872,40 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int node;
+ unsigned int cpuset_mems_cookie;
+
+retry:
+ /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ node = current->il_prev;
+ if (!current->il_weight || !node_isset(node, policy->nodes)) {
+ node = next_node_in(node, policy->nodes);
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry;
+ if (node == MAX_NUMNODES)
+ return node;
+ current->il_prev = node;
+ current->il_weight = get_il_weight(node);
+ }
+ current->il_weight--;
+ return node;
+}
+
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
unsigned int nid;
+ unsigned int cpuset_mems_cookie;
+
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nid = next_node_in(current->il_prev, policy->nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
- nid = next_node_in(current->il_prev, policy->nodes);
if (nid < MAX_NUMNODES)
current->il_prev = nid;
return nid;
@@ -1859,6 +1934,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
@@ -1883,6 +1961,59 @@ unsigned int mempolicy_slab_node(void)
}
}
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+ nodemask_t *mask)
+{
+ /*
+ * barrier stabilizes the nodemask locally so that it can be iterated
+ * over safely without concern for changes. Allocators validate node
+ * selection does not violate mems_allowed, so this is safe.
+ */
+ barrier();
+ memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+ barrier();
+ return nodes_weight(*mask);
+}
+
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ nodemask_t nodemask;
+ unsigned int target, nr_nodes;
+ u8 *table;
+ unsigned int weight_total = 0;
+ u8 weight;
+ int nid;
+
+ nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nr_nodes)
+ return numa_node_id();
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* calculate the total weight */
+ for_each_node_mask(nid, nodemask) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ weight_total += weight;
+ }
+
+ /* Calculate the node offset based on totals */
+ target = ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ rcu_read_unlock();
+ return nid;
+}
+
/*
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1890,20 +2021,12 @@ unsigned int mempolicy_slab_node(void)
*/
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
- nodemask_t nodemask = pol->nodes;
+ nodemask_t nodemask;
unsigned int target, nnodes;
int i;
int nid;
- /*
- * The barrier will stabilize the nodemask in a register or on
- * the stack so that it will stop changing under the code.
- *
- * Between first_node() and next_node(), pol->nodes could be changed
- * by other threads. So we put pol->nodes in a local stack.
- */
- barrier();
- nnodes = nodes_weight(nodemask);
+ nnodes = read_once_policy_nodemask(pol, &nodemask);
if (!nnodes)
return numa_node_id();
target = ilx % nnodes;
@@ -1951,6 +2074,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ weighted_interleave_nodes(pol) :
+ weighted_interleave_nid(pol, ilx);
+ break;
}
return nodemask;
@@ -2012,6 +2140,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2112,6 +2241,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
(!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
@@ -2247,6 +2377,121 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned int cpuset_mems_cookie;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated = 0;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ u8 *table, *weights, weight;
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes;
+ int nnodes, node;
+ int resume_node = MAX_NUMNODES - 1;
+ u8 resume_weight = 0;
+ int prev_node;
+ int i;
+
+ if (!nr_pages)
+ return 0;
+
+ /* read the nodes onto the stack, retry if done during rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nnodes = read_once_policy_nodemask(pol, &nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ /* if the nodemask has become invalid, we cannot do anything */
+ if (!nnodes)
+ return 0;
+
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ node = me->il_prev;
+ weight = me->il_weight;
+ if (weight && node_isset(node, nodes)) {
+ node_pages = min(rem_pages, weight);
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (rem_pages <= weight) {
+ me->il_weight -= rem_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust remaining pages, continue from there */
+ rem_pages -= weight;
+ }
+ /* clear active weight in case of an allocation failure */
+ me->il_weight = 0;
+ prev_node = node;
+
+ /* create a local copy of node weights to operate on outside rcu */
+ weights = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!weights)
+ return total_allocated;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ if (table)
+ memcpy(weights, table, nr_node_ids);
+ rcu_read_unlock();
+
+ /* calculate total, detect system default usage */
+ for_each_node_mask(node, nodes) {
+ if (!weights[node])
+ weights[node] = 1;
+ weight_total += weights[node];
+ }
+
+ /*
+ * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+ * Track which node weighted interleave should resume from.
+ *
+ * if (rounds > 0) and (delta == 0), resume_node will always be
+ * the node following prev_node and its weight.
+ */
+ rounds = rem_pages / weight_total;
+ delta = rem_pages % weight_total;
+ resume_node = next_node_in(prev_node, nodes);
+ resume_weight = weights[resume_node];
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+ /* If a delta exists, add this node's portion of the delta */
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else if (delta) {
+ /* when delta is depleted, resume from that node */
+ node_pages += delta;
+ resume_node = node;
+ resume_weight = weight - delta;
+ delta = 0;
+ }
+ /* node_pages can be 0 if an allocation fails and rounds == 0 */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ if (total_allocated == nr_pages)
+ break;
+ prev_node = node;
+ }
+ me->il_prev = resume_node;
+ me->il_weight = resume_weight;
+ kfree(weights);
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2287,6 +2532,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(
+ gfp, pol, nr_pages, page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2362,6 +2611,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2498,6 +2748,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
+ break;
+
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
@@ -2872,6 +3126,7 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2931,6 +3186,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -3041,6 +3297,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
@@ -3067,3 +3324,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+ struct kobj_attribute kobj_attr;
+ int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ weight = get_il_weight(node_attr->nid);
+ return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct iw_node_attr *node_attr;
+ u8 *new;
+ u8 *old;
+ u8 weight = 0;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ if (count == 0 || sysfs_streq(buf, ""))
+ weight = 0;
+ else if (kstrtou8(buf, 0, &weight))
+ return -EINVAL;
+
+ new = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ if (old)
+ memcpy(new, old, nr_node_ids);
+ new[node_attr->nid] = weight;
+ rcu_assign_pointer(iw_table, new);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ return count;
+}
+
+static struct iw_node_attr **node_attrs;
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+ struct kobject *parent)
+{
+ if (!node_attr)
+ return;
+ sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ sysfs_wi_node_release(node_attrs[i], wi_kobj);
+ kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+ struct iw_node_attr *node_attr;
+ char *name;
+
+ node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+ if (!node_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "node%d", nid);
+ if (!name) {
+ kfree(node_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&node_attr->kobj_attr.attr);
+ node_attr->kobj_attr.attr.name = name;
+ node_attr->kobj_attr.attr.mode = 0644;
+ node_attr->kobj_attr.show = node_show;
+ node_attr->kobj_attr.store = node_store;
+ node_attr->nid = nid;
+
+ if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+ pr_err("failed to add attribute to weighted_interleave\n");
+ return -ENOMEM;
+ }
+
+ node_attrs[nid] = node_attr;
+ return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+ struct kobject *wi_kobj;
+ int nid, err;
+
+ wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!wi_kobj)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ "weighted_interleave");
+ if (err) {
+ kfree(wi_kobj);
+ return err;
+ }
+
+ for_each_node_state(nid, N_POSSIBLE) {
+ err = add_weight_node(nid, wi_kobj);
+ if (err) {
+ pr_err("failed to add sysfs [node%d]\n", nid);
+ break;
+ }
+ }
+ if (err)
+ kobject_put(wi_kobj);
+ return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+ u8 *old;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ rcu_assign_pointer(iw_table, NULL);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ kfree(node_attrs);
+ kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+ .release = mempolicy_kobj_release
+};
+
+static int __init mempolicy_sysfs_init(void)
+{
+ int err;
+ static struct kobject *mempolicy_kobj;
+
+ mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+ if (!mempolicy_kobj) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+ GFP_KERNEL);
+ if (!node_attrs) {
+ err = -ENOMEM;
+ goto mempol_out;
+ }
+
+ err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+ "mempolicy");
+ if (err)
+ goto node_out;
+
+ err = add_weighted_interleave_group(mempolicy_kobj);
+ if (err) {
+ pr_err("mempolicy sysfs structure failed to initialize\n");
+ kobject_put(mempolicy_kobj);
+ return err;
+ }
+
+ return err;
+node_out:
+ kfree(node_attrs);
+mempol_out:
+ kfree(mempolicy_kobj);
+err_out:
+ pr_err("failed to add mempolicy kobject to the system\n");
+ return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/mm/migrate.c b/mm/migrate.c
index c27b1f8097d4..73a052a382f1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
old_pte = ptep_get(pvmw.pte);
- if (pte_swp_soft_dirty(old_pte))
- pte = pte_mksoft_dirty(pte);
entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
+ if (pte_swp_soft_dirty(old_pte))
+ pte = pte_mksoft_dirty(pte);
+ else
+ pte = pte_clear_soft_dirty(pte);
+
if (is_writable_migration_entry(entry))
pte = pte_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(old_pte))
diff --git a/mm/mmap.c b/mm/mmap.c
index d89770eaab6b..ccf377ee319f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
* Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
- struct file *file, struct address_space *mapping)
+ struct address_space *mapping)
{
if (vma_is_shared_maywrite(vma))
mapping_unmap_writable(mapping);
@@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
i_mmap_lock_write(mapping);
- __remove_shared_vm_struct(vma, file, mapping);
+ __remove_shared_vm_struct(vma, mapping);
i_mmap_unlock_write(mapping);
}
}
@@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}
+static void vma_link_file(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping;
+
+ if (file) {
+ mapping = file->f_mapping;
+ i_mmap_lock_write(mapping);
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, mm, 0);
- struct address_space *mapping = NULL;
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
vma_start_write(vma);
-
vma_iter_store(&vmi, vma);
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- __vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-
+ vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
return 0;
@@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp,
}
if (vp->remove && vp->file) {
- __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+ __remove_shared_vm_struct(vp->remove, vp->mapping);
if (vp->remove2)
- __remove_shared_vm_struct(vp->remove2, vp->file,
- vp->mapping);
+ __remove_shared_vm_struct(vp->remove2, vp->mapping);
} else if (vp->insert) {
/*
* split_vma has split insert from vma, and needs
@@ -660,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
@@ -705,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_adjust_trans_huge(vma, start, end, 0);
vma_iter_clear(vmi);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_complete(&vp, vmi, vma->vm_mm);
return 0;
}
@@ -861,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* area is returned, or the function will return NULL
*/
static struct vm_area_struct
-*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
- struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy,
+*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *src, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
+ struct mm_struct *mm = src->vm_mm;
+ struct anon_vma *anon_vma = src->anon_vma;
+ struct file *file = src->vm_file;
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
struct vm_area_struct *anon_dup = NULL;
@@ -1012,10 +1013,7 @@ static struct vm_area_struct
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
-
- vma->vm_start = vma_start;
- vma->vm_end = vma_end;
- vma->vm_pgoff = vma_pgoff;
+ vma_set_range(vma, vma_start, vma_end, vma_pgoff);
if (vma_expanded)
vma_iter_store(vmi, vma);
@@ -2048,7 +2046,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2142,7 +2139,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2432,9 +2428,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
struct vm_area_struct *merged;
- merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, policy,
- uffd_ctx, anon_name);
+ merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
+ pgoff, policy, uffd_ctx, anon_name);
if (merged)
return merged;
@@ -2464,9 +2459,8 @@ static struct vm_area_struct
struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff)
{
- return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2480,10 +2474,9 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
/* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta,
- vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
+ return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
+ vma->vm_flags, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2810,11 +2803,9 @@ cannot_expand:
}
vma_iter_config(&vmi, addr, end);
- vma->vm_start = addr;
- vma->vm_end = end;
+ vma_set_range(vma, addr, end, pgoff);
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
if (file) {
vma->vm_file = get_file(file);
@@ -2891,16 +2882,7 @@ cannot_expand:
vma_start_write(vma);
vma_iter_store(&vmi, vma);
mm->map_count++;
- if (vma->vm_file) {
- i_mmap_lock_write(vma->vm_file->f_mapping);
- if (vma_is_shared_maywrite(vma))
- mapping_allow_writable(vma->vm_file->f_mapping);
-
- flush_dcache_mmap_lock(vma->vm_file->f_mapping);
- vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
- flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- }
+ vma_link_file(vma);
/*
* vma_merge() calls khugepaged_enter_vma() either, the below
@@ -3173,9 +3155,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto unacct_fail;
vma_set_anonymous(vma);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_pgoff = addr >> PAGE_SHIFT;
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
vma_start_write(vma);
@@ -3412,9 +3392,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma = vm_area_dup(vma);
if (!new_vma)
goto out;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
+ vma_set_range(new_vma, addr, addr + len, pgoff);
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
if (anon_vma_clone(new_vma, vma))
@@ -3582,9 +3560,7 @@ static struct vm_area_struct *__install_special_mapping(
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
-
+ vma_set_range(vma, addr, addr + len, 0);
vm_flags_init(vma, (vm_flags | mm->def_flags |
VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
@@ -3868,7 +3844,7 @@ static int init_user_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
return 0;
}
subsys_initcall(init_user_reserve);
@@ -3889,7 +3865,7 @@ static int init_admin_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
return 0;
}
subsys_initcall(init_admin_reserve);
@@ -3921,12 +3897,12 @@ static int reserve_mem_notifier(struct notifier_block *nb,
case MEM_ONLINE:
/* Default max is 128MB. Leave alone if modified by operator. */
tmp = sysctl_user_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 17))
+ if (tmp > 0 && tmp < SZ_128K)
init_user_reserve();
/* Default max is 8MB. Leave alone if modified by operator. */
tmp = sysctl_admin_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 13))
+ if (tmp > 0 && tmp < SZ_8K)
init_admin_reserve();
break;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 604ddf08affe..99b3e9408aa0 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
+ struct encoded_page **pages = batch->encoded_pages;
+
for (int i = 0; i < batch->nr; i++) {
- struct encoded_page *enc = batch->encoded_pages[i];
+ struct encoded_page *enc = pages[i];
- if (encoded_page_flags(enc)) {
+ if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
struct page *page = encoded_page_ptr(enc);
- folio_remove_rmap_pte(page_folio(page), page, vma);
+ unsigned int nr_pages = 1;
+
+ if (unlikely(encoded_page_flags(enc) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages = encoded_nr_pages(pages[++i]);
+
+ folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
+ vma);
}
}
}
@@ -82,26 +91,62 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
}
#endif
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+#define MAX_NR_FOLIOS_PER_FREE 512
+
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
- struct mmu_gather_batch *batch;
+ struct encoded_page **pages = batch->encoded_pages;
+ unsigned int nr, nr_pages;
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- struct encoded_page **pages = batch->encoded_pages;
+ while (batch->nr) {
+ if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+ nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
- do {
/*
- * limit free batch count when PAGE_SIZE > 4K
+ * Make sure we cover page + nr_pages, and don't leave
+ * nr_pages behind when capping the number of entries.
+ */
+ if (unlikely(encoded_page_flags(pages[nr - 1]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr++;
+ } else {
+ /*
+ * With page poisoning and init_on_free, the time it
+ * takes to free memory grows proportionally with the
+ * actual memory size. Therefore, limit based on the
+ * actual memory size and not the number of involved
+ * folios.
*/
- unsigned int nr = min(512U, batch->nr);
+ for (nr = 0, nr_pages = 0;
+ nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
+ nr++) {
+ if (unlikely(encoded_page_flags(pages[nr]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages += encoded_nr_pages(pages[++nr]);
+ else
+ nr_pages++;
+ }
+ }
- free_pages_and_swap_cache(pages, nr);
- pages += nr;
- batch->nr -= nr;
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
- cond_resched();
- } while (batch->nr);
+ cond_resched();
}
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
+ __tlb_batch_free_encoded_pages(batch);
tlb->active = &tlb->local;
}
@@ -116,14 +161,19 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
+static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap,
+ int page_size)
{
+ int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
+ VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
+ VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif
batch = tlb->active;
@@ -131,17 +181,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->encoded_pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
+ if (likely(nr_pages == 1)) {
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ } else {
+ flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
+ }
+ /*
+ * Make sure that we can always add another "page" + "nr_pages",
+ * requiring two entries instead of only a single one.
+ */
+ if (batch->nr >= batch->max - 1) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
+ VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
return false;
}
+bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
+ unsigned int nr_pages, bool delay_rmap)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
+ PAGE_SIZE);
+}
+
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ bool delay_rmap, int page_size)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
+}
+
#endif /* MMU_GATHER_NO_GATHER */
#ifdef CONFIG_MMU_GATHER_TABLE_FREE
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 81991102f785..f8a4544b4601 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t newpte;
if (is_writable_migration_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
/*
* A protection check is difficult so
* just be safe and disable write
*/
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..9faca05d124e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
- int cpu;
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- for_each_online_cpu(cpu) {
- pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- cci = get_cpu_cacheinfo(cpu);
- /*
- * If data cache slice of CPU is large enough, "pcp->batch"
- * pages can be preserved in PCP before draining PCP for
- * consecutive high-order pages freeing without allocation.
- * This can reduce zone lock contention without hurting
- * cache-hot pages sharing.
- */
- spin_lock(&pcp->lock);
- if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
- pcp->flags |= PCPF_FREE_HIGH_BATCH;
- else
- pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- spin_unlock(&pcp->lock);
- }
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+ * If data cache slice of CPU is large enough, "pcp->batch"
+ * pages can be preserved in PCP before draining PCP for
+ * consecutive high-order pages freeing without allocation.
+ * This can reduce zone lock contention without hurting
+ * cache-hot pages sharing.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
}
-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone)
- zone_pcp_update_cacheinfo(zone);
+ zone_pcp_update_cacheinfo(zone, cpu);
}
/*
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 03c1bdae4a43..106e1d66e9f9 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
+#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
@@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
}
+
+static int check_wx_show(struct seq_file *m, void *v)
+{
+ if (ptdump_check_wx())
+ seq_puts(m, "SUCCESS\n");
+ else
+ seq_puts(m, "FAILED\n");
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(check_wx);
+
+static int ptdump_debugfs_init(void)
+{
+ debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops);
+
+ return 0;
+}
+
+device_initcall(ptdump_debugfs_init);
diff --git a/mm/readahead.c b/mm/readahead.c
index 2648ec4f0494..1e74455f908e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
if (new_order < MAX_PAGECACHE_ORDER) {
new_order += 2;
- if (new_order > MAX_PAGECACHE_ORDER)
- new_order = MAX_PAGECACHE_ORDER;
- while ((1 << new_order) > ra->size)
- new_order--;
+ new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min_t(unsigned int, new_order, ilog2(ra->size));
}
filemap_invalidate_lock_shared(mapping);
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529..3746a5531018 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
swp_entry_t entry = page_swap_entry(subpage);
pte_t swp_pte;
@@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(&folio->page));
+ dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
if (unlikely(folio_test_hugetlb(folio)))
@@ -2169,7 +2169,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else {
swp_entry_t entry;
pte_t swp_pte;
@@ -2261,7 +2261,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
diff --git a/mm/sparse.c b/mm/sparse.c
index 338cf946dee8..aed0951b87fa 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(memmap, sizeof(struct page) * nr_pages);
+ if (!altmap || !altmap->inaccessible)
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba3a..e5380d732c0d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -967,11 +967,17 @@ void release_pages(release_pages_arg arg, int nr)
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
+ unsigned int nr_refs = 1;
struct folio *folio;
/* Turn any of the argument types into a folio */
folio = page_folio(encoded_page_ptr(encoded[i]));
+ /* Is our next entry actually "nr_pages" -> "nr_refs" ? */
+ if (unlikely(encoded_page_flags(encoded[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_refs = encoded_nr_pages(encoded[++i]);
+
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -990,14 +996,14 @@ void release_pages(release_pages_arg arg, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(&folio->page))
+ if (put_devmap_managed_page_refs(&folio->page, nr_refs))
continue;
- if (folio_put_testzero(folio))
+ if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_page(&folio->page);
continue;
}
- if (!folio_put_testzero(folio))
+ if (!folio_ref_sub_and_test(folio, nr_refs))
continue;
if (folio_test_large(folio)) {
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f8e..90973ce7881d 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
+ /* Large folio swap slot is not covered. */
+ zswap_invalidate(entry);
+
cache = raw_cpu_ptr(&swp_slots);
if (likely(use_swap_slot_cache && cache->slots_ret)) {
spin_lock_irq(&cache->free_lock);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7255c01a1e4e..2f540748f7c0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page)
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
lru_add_drain();
- for (int i = 0; i < nr; i++)
- free_swap_cache(encoded_page_ptr(pages[i]));
+ for (int i = 0; i < nr; i++) {
+ struct page *page = encoded_page_ptr(pages[i]);
+
+ /*
+ * Skip over the "nr_pages" entry. It's sufficient to call
+ * free_swap_cache() only once per folio.
+ */
+ if (unlikely(encoded_page_flags(pages[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ i++;
+
+ free_swap_cache(page);
+ }
release_pages(pages, nr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746aa9da5302..d1bd8d1e17bd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
- atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -746,12 +744,19 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify = NULL;
while (offset <= end) {
arch_swap_invalidate_page(si->type, offset);
- zswap_invalidate(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
+
+ /*
+ * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
+ * only after the above cleanups are done.
+ */
+ smp_wmb();
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -2049,7 +2054,7 @@ static int try_to_unuse(unsigned int type)
unsigned int i;
if (!READ_ONCE(si->inuse_pages))
- return 0;
+ goto success;
retry:
retval = shmem_unuse(type);
@@ -2130,6 +2135,12 @@ retry:
return -EINTR;
}
+success:
+ /*
+ * Make sure that further cleanups after try_to_unuse() returns happen
+ * after swap_range_free() reduces si->inuse_pages to 0.
+ */
+ smp_mb();
return 0;
}
@@ -2348,8 +2359,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- zswap_swapon(p->type);
-
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -3167,6 +3176,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
+ error = zswap_swapon(p->type, maxpages);
+ if (error)
+ goto free_swap_address_space;
+
/*
* Flush any pending IO and dirty mappings before we start using this
* swap device.
@@ -3175,7 +3188,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto free_swap_address_space;
+ goto free_swap_zswap;
}
mutex_lock(&swapon_mutex);
@@ -3199,6 +3212,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_zswap:
+ zswap_swapoff(p->type);
free_swap_address_space:
exit_swap_address_space(p->type);
bad_swap_unlock_inode:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7cf7d4384259..4744d6a96f96 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,19 +20,11 @@
#include "internal.h"
static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
- /*
- * Make sure that the dst range is both valid and fully within a
- * single existing vma.
- */
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
- if (!range_in_vma(dst_vma, dst_start, dst_start + len))
- return NULL;
+ /* Make sure that the dst range is fully within dst_vma. */
+ if (dst_end > dst_vma->vm_end)
+ return false;
/*
* Check the vma is registered in uffd, this is required to
@@ -40,11 +32,122 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ return false;
+
+ return true;
+}
+
+static __always_inline
+struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ vma = ERR_PTR(-ENOENT);
+ else if (!(vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(vma)))
+ vma = ERR_PTR(-ENOMEM);
+
+ return vma;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ *
+ * Should be called without holding mmap_lock. vma should be unlocked after use
+ * with unlock_vma().
+ *
+ * Return: A locked vma containing @address, -ENOENT if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (vma) {
+ /*
+ * lock_vma_under_rcu() only checks anon_vma for private
+ * anonymous mappings. But we need to ensure it is assigned in
+ * private file-backed vmas as well.
+ */
+ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
+ vma_end_read(vma);
+ else
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = find_vma_and_prepare_anon(mm, address);
+ if (!IS_ERR(vma)) {
+ /*
+ * We cannot use vma_start_read() as it may fail due to
+ * false locked (see comment in vma_start_read()). We
+ * can avoid that by directly locking vm_lock under
+ * mmap_lock, which guarantees that nobody can lock the
+ * vma for write (vma_start_write()) under us.
+ */
+ down_read(&vma->vm_lock->lock);
+ }
+
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = lock_vma(dst_mm, dst_start);
+ if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ vma_end_read(dst_vma);
+ return ERR_PTR(-ENOENT);
+}
+
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ vma_end_read(vma);
+}
+#else
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ mmap_read_lock(dst_mm);
+ dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
+ if (IS_ERR(dst_vma))
+ goto out_unlock;
+
+ if (validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ dst_vma = ERR_PTR(-ENOENT);
+out_unlock:
+ mmap_read_unlock(dst_mm);
return dst_vma;
}
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ mmap_read_unlock(vma->vm_mm);
+}
+#endif
+
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -124,7 +227,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
* Must happen after rmap, as mm_counter() checks mapping (via
* PageAnon()), which is set by __page_set_anon_rmap().
*/
- inc_mm_counter(dst_mm, mm_counter(page));
+ inc_mm_counter(dst_mm, mm_counter(folio));
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -350,18 +453,18 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
+ struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
- int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
unsigned long src_addr, dst_addr;
@@ -379,7 +482,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* feature is not supported.
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
return -EINVAL;
}
@@ -402,24 +506,28 @@ retry:
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
+
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
- goto out_unlock;
+ goto out_unlock_vma;
- vm_shared = dst_vma->vm_flags & VM_SHARED;
- }
-
- /*
- * If not shared, ensure the dst_vma has a anon_vma.
- */
- err = -ENOMEM;
- if (!vm_shared) {
- if (unlikely(anon_vma_prepare(dst_vma)))
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
}
@@ -463,7 +571,8 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
err = copy_folio_from_user(folio,
@@ -472,16 +581,6 @@ retry:
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- if (mmap_changing && atomic_read(mmap_changing)) {
- err = -EAGAIN;
- break;
- }
dst_vma = NULL;
goto retry;
@@ -501,7 +600,9 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -512,11 +613,11 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -564,13 +665,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
return err;
}
-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@@ -593,24 +694,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
copied = 0;
folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
- goto out_unlock;
-
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -EINVAL;
@@ -633,8 +734,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
- len, mmap_changing, flags);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -642,16 +743,6 @@ retry:
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- /*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
- */
- err = -ENOMEM;
- if (!(dst_vma->vm_flags & VM_SHARED) &&
- unlikely(anon_vma_prepare(dst_vma)))
- goto out_unlock;
-
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
@@ -693,7 +784,8 @@ retry:
if (unlikely(err == -ENOENT)) {
void *kaddr;
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
kaddr = kmap_local_folio(folio, 0);
@@ -723,7 +815,8 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -733,34 +826,33 @@ out:
return copied ? copied : err;
}
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags)
+ uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ return mfill_atomic(ctx, dst_start, src_start, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long start,
+ unsigned long len)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}
@@ -793,10 +885,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp)
{
+ struct mm_struct *dst_mm = ctx->mm;
unsigned long end = start + len;
unsigned long _start, _end;
struct vm_area_struct *dst_vma;
@@ -820,8 +912,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
@@ -850,6 +943,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
}
out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}
@@ -959,6 +1053,33 @@ static int move_swap_pte(struct mm_struct *mm,
return 0;
}
+static int move_zeropage_pte(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+ pte_t zero_pte;
+
+ double_pt_lock(dst_ptl, src_ptl);
+ if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+ !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+
+ zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ptep_clear_flush(src_vma, src_addr, src_pte);
+ set_pte_at(mm, dst_addr, dst_pte, zero_pte);
+ double_pt_unlock(dst_ptl, src_ptl);
+
+ return 0;
+}
+
+
/*
* The mmap_lock for reading is held by the caller. Just move the page
* from src_pmd to dst_pmd if possible, and return true if succeeded
@@ -1041,6 +1162,14 @@ retry:
}
if (pte_present(orig_src_pte)) {
+ if (is_zero_pfn(pte_pfn(orig_src_pte))) {
+ err = move_zeropage_pte(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte,
+ dst_ptl, src_ptl);
+ goto out;
+ }
+
/*
* Pin and lock both source folio and anon_vma. Since we are in
* RCU read section, we can't block, so on contention have to
@@ -1224,27 +1353,136 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
return -EINVAL;
+ return 0;
+}
+
+static __always_inline
+int find_vmas_mm_locked(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = find_vma_and_prepare_anon(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
+ /* Skip finding src_vma if src_start is in dst_vma */
+ if (src_start >= vma->vm_start && src_start < vma->vm_end)
+ goto out_success;
+
+ vma = vma_lookup(mm, src_start);
+ if (!vma)
+ return -ENOENT;
+out_success:
+ *src_vmap = vma;
+ return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ vma = lock_vma(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
/*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
+ * Skip finding src_vma if src_start is in dst_vma. This also ensures
+ * that we don't lock the same vma twice.
*/
- if (unlikely(anon_vma_prepare(dst_vma)))
- return -ENOMEM;
+ if (src_start >= vma->vm_start && src_start < vma->vm_end) {
+ *src_vmap = vma;
+ return 0;
+ }
- return 0;
+ /*
+ * Using lock_vma() to get src_vma can lead to following deadlock:
+ *
+ * Thread1 Thread2
+ * ------- -------
+ * vma_start_read(dst_vma)
+ * mmap_write_lock(mm)
+ * vma_start_write(src_vma)
+ * vma_start_read(src_vma)
+ * mmap_read_lock(mm)
+ * vma_start_write(dst_vma)
+ */
+ *src_vmap = lock_vma_under_rcu(mm, src_start);
+ if (likely(*src_vmap))
+ return 0;
+
+ /* Undo any locking and retry in mmap_lock critical section */
+ vma_end_read(*dst_vmap);
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (!err) {
+ /*
+ * See comment in lock_vma() as to why not using
+ * vma_start_read() here.
+ */
+ down_read(&(*dst_vmap)->vm_lock->lock);
+ if (*dst_vmap != *src_vmap)
+ down_read(&(*src_vmap)->vm_lock->lock);
+ }
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ vma_end_read(src_vma);
+ if (src_vma != dst_vma)
+ vma_end_read(dst_vma);
+}
+
+#else
+
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ int err;
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (err)
+ mmap_read_unlock(mm);
+ return err;
}
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ mmap_assert_locked(src_vma->vm_mm);
+ mmap_read_unlock(dst_vma->vm_mm);
+}
+#endif
+
/**
* move_pages - move arbitrary anonymous pages of an existing vma
* @ctx: pointer to the userfaultfd context
- * @mm: the address space to move pages
* @dst_start: start of the destination virtual memory range
* @src_start: start of the source virtual memory range
* @len: length of the virtual memory range
* @mode: flags from uffdio_move.mode
*
- * Must be called with mmap_lock held for read.
+ * It will either use the mmap_lock in read mode or per-vma locks
*
* move_pages() remaps arbitrary anonymous pages atomically in zero
* copy. It only works on non shared anonymous pages because those can
@@ -1312,10 +1550,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* could be obtained. This is the only additional complexity added to
* the rmap code to provide this anonymous page remapping functionality.
*/
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 mode)
{
+ struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
unsigned long src_addr, dst_addr;
pmd_t *src_pmd, *dst_pmd;
@@ -1333,28 +1571,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
WARN_ON_ONCE(dst_start + len <= dst_start))
goto out;
+ err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
+ if (err)
+ goto out;
+
+ /* Re-check after taking map_changing_lock */
+ err = -EAGAIN;
+ down_read(&ctx->map_changing_lock);
+ if (likely(atomic_read(&ctx->mmap_changing)))
+ goto out_unlock;
/*
* Make sure the vma is not shared, that the src and dst remap
* ranges are both valid and fully within a single existing
* vma.
*/
- src_vma = find_vma(mm, src_start);
- if (!src_vma || (src_vma->vm_flags & VM_SHARED))
- goto out;
- if (src_start < src_vma->vm_start ||
- src_start + len > src_vma->vm_end)
- goto out;
+ err = -EINVAL;
+ if (src_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (src_start + len > src_vma->vm_end)
+ goto out_unlock;
- dst_vma = find_vma(mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out;
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out;
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
err = validate_move_areas(ctx, src_vma, dst_vma);
if (err)
- goto out;
+ goto out_unlock;
for (src_addr = src_start, dst_addr = dst_start;
src_addr < src_start + len;) {
@@ -1404,19 +1648,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
err = -ENOENT;
break;
}
- /* Avoid moving zeropages for now */
- if (is_huge_zero_pmd(*src_pmd)) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
- }
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
- if (!folio || !PageAnonExclusive(&folio->page)) {
+ if (!folio || (!is_huge_zero_page(&folio->page) &&
+ !PageAnonExclusive(&folio->page))) {
spin_unlock(ptl);
err = -EBUSY;
break;
@@ -1476,6 +1715,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
moved += step_size;
}
+out_unlock:
+ up_read(&ctx->map_changing_lock);
+ uffd_move_unlock(dst_vma, src_vma);
out:
VM_WARN_ON(moved < 0);
VM_WARN_ON(err > 0);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc..198d623054c5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1744,17 +1744,17 @@ bool folio_isolate_lru(struct folio *folio)
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
-static int too_many_isolated(struct pglist_data *pgdat, int file,
+static bool too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
bool too_many;
if (current_is_kswapd())
- return 0;
+ return false;
if (!writeback_throttling_sane(sc))
- return 0;
+ return false;
if (file) {
inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
@@ -1998,7 +1998,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_inactive);
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
@@ -2412,7 +2412,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
unsigned long lruvec_size;
unsigned long low, min;
unsigned long scan;
@@ -2879,38 +2879,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
#endif
-static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
{
int i;
int hist;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
- if (walk) {
- hist = lru_hist_from_seq(walk->max_seq);
+ hist = lru_hist_from_seq(walk->seq);
- for (i = 0; i < NR_MM_STATS; i++) {
- WRITE_ONCE(mm_state->stats[hist][i],
- mm_state->stats[hist][i] + walk->mm_stats[i]);
- walk->mm_stats[i] = 0;
- }
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(mm_state->stats[hist][i],
+ mm_state->stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
}
if (NR_HIST_GENS > 1 && last) {
- hist = lru_hist_from_seq(mm_state->seq + 1);
+ hist = lru_hist_from_seq(walk->seq + 1);
for (i = 0; i < NR_MM_STATS; i++)
WRITE_ONCE(mm_state->stats[hist][i], 0);
}
}
-static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
- struct mm_struct **iter)
+static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter)
{
bool first = false;
bool last = false;
struct mm_struct *mm = NULL;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
@@ -2927,9 +2926,9 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
*/
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
- if (walk->max_seq <= mm_state->seq)
+ if (walk->seq <= mm_state->seq)
goto done;
if (!mm_state->head)
@@ -2954,12 +2953,12 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
} while (!(mm = get_next_mm(walk)));
done:
if (*iter || last)
- reset_mm_stats(lruvec, walk, last);
+ reset_mm_stats(walk, last);
spin_unlock(&mm_list->lock);
if (mm && first)
- reset_bloom_filter(mm_state, walk->max_seq + 1);
+ reset_bloom_filter(mm_state, walk->seq + 1);
if (*iter)
mmput_async(*iter);
@@ -2969,7 +2968,7 @@ done:
return last;
}
-static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq)
{
bool success = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -2978,13 +2977,12 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < seq);
- if (max_seq > mm_state->seq) {
+ if (seq > mm_state->seq) {
mm_state->head = NULL;
mm_state->tail = NULL;
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
- reset_mm_stats(lruvec, NULL, true);
success = true;
}
@@ -3159,9 +3157,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
walk->nr_pages[new_gen][type][zone] += delta;
}
-static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+static void reset_batch_size(struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3331,7 +3330,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
if (!pte)
@@ -3398,7 +3398,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3529,7 +3530,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
- if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
+ if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
continue;
walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3540,7 +3541,7 @@ restart:
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
- update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
+ update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
}
walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3591,7 +3592,7 @@ done:
return -EAGAIN;
}
-static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
{
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
@@ -3600,6 +3601,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
};
int err;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3610,7 +3612,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
err = -EBUSY;
/* another thread might have called inc_max_seq() */
- if (walk->max_seq != max_seq)
+ if (walk->seq != max_seq)
break;
/* folio_update_gen() requires stable folio_memcg() */
@@ -3628,7 +3630,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
- reset_batch_size(lruvec, walk);
+ reset_batch_size(walk);
spin_unlock_irq(&lruvec->lru_lock);
}
@@ -3747,7 +3749,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
bool success;
@@ -3755,14 +3757,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
restart:
- if (max_seq < READ_ONCE(lrugen->max_seq))
+ if (seq < READ_ONCE(lrugen->max_seq))
return false;
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
- success = max_seq == lrugen->max_seq;
+ success = seq == lrugen->max_seq;
if (!success)
goto unlock;
@@ -3815,8 +3817,8 @@ unlock:
return success;
}
-static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, bool force_scan)
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
+ bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3824,13 +3826,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
- VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, can_swap, force_scan);
/* see the comment in iterate_mm_list() */
- if (max_seq <= READ_ONCE(mm_state->seq))
+ if (seq <= READ_ONCE(mm_state->seq))
return false;
/*
@@ -3840,29 +3842,29 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* is less efficient, but it avoids bursty page faults.
*/
if (!should_walk_mmu()) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk = set_mm_walk(NULL, true);
if (!walk) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk->lruvec = lruvec;
- walk->max_seq = max_seq;
+ walk->seq = seq;
walk->can_swap = can_swap;
walk->force_scan = force_scan;
do {
- success = iterate_mm_list(lruvec, walk, &mm);
+ success = iterate_mm_list(walk, &mm);
if (mm)
- walk_mm(lruvec, mm, walk);
+ walk_mm(mm, walk);
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, can_swap, force_scan);
WARN_ON_ONCE(!success);
}
@@ -4287,7 +4289,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* swapping inhibited */
+ /* swap constrained */
if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
@@ -4456,9 +4458,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
DEFINE_MIN_SEQ(lruvec);
/*
- * Try to make the obvious choice first. When anon and file are both
- * available from the same generation, interpret swappiness 1 as file
- * first and 200 as anon first.
+ * Try to make the obvious choice first, and if anon and file are both
+ * available from the same generation,
+ * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
+ * first.
+ * 2. If !__GFP_IO, file first since clean pagecache is more likely to
+ * exist than clean swapcache.
*/
if (!swappiness)
type = LRU_GEN_FILE;
@@ -4468,6 +4473,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
type = LRU_GEN_FILE;
else if (swappiness == 200)
type = LRU_GEN_ANON;
+ else if (!(sc->gfp_mask & __GFP_IO))
+ type = LRU_GEN_FILE;
else
type = get_type_to_scan(lruvec, swappiness, &tier);
@@ -4558,8 +4565,10 @@ retry:
move_folios_to_lru(lruvec, &list);
walk = current->reclaim_state->mm_walk;
- if (walk && walk->batched)
- reset_batch_size(lruvec, walk);
+ if (walk && walk->batched) {
+ walk->lruvec = lruvec;
+ reset_batch_size(walk);
+ }
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -4584,14 +4593,13 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+ bool can_swap, unsigned long *nr_to_scan)
{
int gen, type, zone;
unsigned long old = 0;
unsigned long young = 0;
unsigned long total = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
/* whether this lruvec is completely out of cold folios */
@@ -4619,13 +4627,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
}
}
- /* try to scrape all its memory if this memcg was deleted */
- if (!mem_cgroup_online(memcg)) {
- *nr_to_scan = total;
- return false;
- }
-
- *nr_to_scan = total >> sc->priority;
+ *nr_to_scan = total;
/*
* The aging tries to be lazy to reduce the overhead, while the eviction
@@ -4657,6 +4659,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
*/
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
+ bool success;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4664,15 +4667,18 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
- return nr_to_scan;
+ success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
- /* skip the aging path at the default priority */
- if (sc->priority == DEF_PRIORITY)
+ /* try to scrape all its memory if this memcg was deleted */
+ if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
- /* skip this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ /* try to get away with not aging at the default priority */
+ if (!success || sc->priority == DEF_PRIORITY)
+ return nr_to_scan >> sc->priority;
+
+ /* stop scanning this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4712,10 +4718,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long scanned = 0;
int swappiness = get_swappiness(lruvec, sc);
- /* clean file folios are more likely to exist */
- if (swappiness && !(sc->gfp_mask & __GFP_IO))
- swappiness = 1;
-
while (true) {
int delta;
@@ -4878,7 +4880,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
{
int priority;
unsigned long reclaimable;
- struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
@@ -4888,7 +4889,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
* where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_swappiness(lruvec, sc))
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
@@ -5332,7 +5333,7 @@ static const struct seq_operations lru_gen_seq_ops = {
.show = lru_gen_seq_show,
};
-static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+static int run_aging(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
@@ -5347,7 +5348,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
return -ERANGE;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+ try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
return 0;
}
@@ -5415,7 +5416,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
switch (cmd) {
case '+':
- err = run_aging(lruvec, seq, sc, swappiness, opt);
+ err = run_aging(lruvec, seq, swappiness, opt);
break;
case '-':
err = run_eviction(lruvec, seq, sc, swappiness, opt);
@@ -6796,6 +6797,7 @@ restart:
bool raise_priority = true;
bool balanced;
bool ret;
+ bool was_frozen;
sc.reclaim_idx = highest_zoneidx;
@@ -6894,9 +6896,9 @@ restart:
/* Check if kswapd should be suspending */
__fs_reclaim_release(_THIS_IP_);
- ret = try_to_freeze();
+ ret = kthread_freezable_should_stop(&was_frozen);
__fs_reclaim_acquire(_THIS_IP_);
- if (ret || kthread_should_stop())
+ if (was_frozen || ret)
break;
/*
@@ -7102,7 +7104,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
- bool ret;
+ bool was_frozen;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
@@ -7119,15 +7121,14 @@ kswapd_try_sleep:
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
- ret = try_to_freeze();
- if (kthread_should_stop())
+ if (kthread_freezable_should_stop(&was_frozen))
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (ret)
+ if (was_frozen)
continue;
/*
diff --git a/mm/zswap.c b/mm/zswap.c
index db4625af65fb..62fe307521c9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor;
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
-/* Duplicate store was encountered (rare) */
-static u64 zswap_duplicate_entry;
/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
@@ -141,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);
-static bool zswap_exclusive_loads_enabled = IS_ENABLED(
- CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
-module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
-
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
@@ -199,12 +193,6 @@ struct zswap_pool {
*
* rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
- * refcount - the number of outstanding reference to the entry. This is needed
- * to protect against premature freeing of the entry by code
- * concurrent calls to load, invalidate, and writeback. The lock
- * for the zswap_tree structure that contains the entry must
- * be held while changing the refcount. Since the lock must
- * be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
* pool and lru are invalid and must be ignored.
@@ -217,7 +205,6 @@ struct zswap_pool {
struct zswap_entry {
struct rb_node rbnode;
swp_entry_t swpentry;
- int refcount;
unsigned int length;
struct zswap_pool *pool;
union {
@@ -228,17 +215,13 @@ struct zswap_entry {
struct list_head lru;
};
-/*
- * The tree lock in the zswap_tree struct protects a few things:
- * - the rbtree
- * - the refcount field of each entry in the tree
- */
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
@@ -265,15 +248,16 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+{
+ return &zswap_trees[swp_type(swp)][swp_offset(swp)
+ >> SWAP_ADDRESS_SPACE_SHIFT];
+}
+
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
-static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree);
-static int zswap_pool_get(struct zswap_pool *pool);
-static void zswap_pool_put(struct zswap_pool *pool);
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -313,702 +297,12 @@ static void zswap_update_total_size(void)
zswap_pool_total_size = total;
}
-/* should be called under RCU */
-#ifdef CONFIG_MEMCG
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
-}
-#else
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return NULL;
-}
-#endif
-
-static inline int entry_to_nid(struct zswap_entry *entry)
-{
- return page_to_nid(virt_to_page(entry));
-}
-
-void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
-
- /* lock out zswap pools list modification */
- spin_lock(&zswap_pools_lock);
- list_for_each_entry(pool, &zswap_pools, list) {
- if (pool->next_shrink == memcg)
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- }
- spin_unlock(&zswap_pools_lock);
-}
-
-/*********************************
-* zswap entry functions
-**********************************/
-static struct kmem_cache *zswap_entry_cache;
-
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
-{
- struct zswap_entry *entry;
- entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
- if (!entry)
- return NULL;
- entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
- return entry;
-}
-
-static void zswap_entry_cache_free(struct zswap_entry *entry)
-{
- kmem_cache_free(zswap_entry_cache, entry);
-}
-
-/*********************************
-* zswap lruvec functions
-**********************************/
-void zswap_lruvec_state_init(struct lruvec *lruvec)
-{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
-}
-
-void zswap_folio_swapin(struct folio *folio)
-{
- struct lruvec *lruvec;
-
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-}
-
-/*********************************
-* lru functions
-**********************************/
-static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- /*
- * Note that it is safe to use rcu_read_lock() here, even in the face of
- * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
- * used in list_lru lookup, only two scenarios are possible:
- *
- * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
- * new entry will be reparented to memcg's parent's list_lru.
- * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
- * new entry will be added directly to memcg's parent's list_lru.
- *
- * Similar reasoning holds for list_lru_del() and list_lru_putback().
- */
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
- rcu_read_unlock();
-}
-
-static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_del(list_lru, &entry->lru, nid, memcg);
- rcu_read_unlock();
-}
-
-static void zswap_lru_putback(struct list_lru *list_lru,
- struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- spinlock_t *lock = &list_lru->node[nid].lock;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- spin_lock(lock);
- /* we cannot use list_lru_add here, because it increments node's lru count */
- list_lru_putback(list_lru, &entry->lru, nid, memcg);
- spin_unlock(lock);
-
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
- /* increment the protection area to account for the LRU rotation. */
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- rcu_read_unlock();
-}
-
-/*********************************
-* rbtree functions
-**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
-{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
-}
-
-static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- return true;
- }
- return false;
-}
-
-static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
-{
- int i = 0;
-
- if (ZSWAP_NR_ZPOOLS > 1)
- i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
-
- return entry->pool->zpools[i];
-}
-
-/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_entry *entry)
-{
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zpool_free(zswap_find_zpool(entry), entry->handle);
- atomic_dec(&entry->pool->nr_stored);
- zswap_pool_put(entry->pool);
- }
- if (entry->objcg) {
- obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
- obj_cgroup_put(entry->objcg);
- }
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
-}
-
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody reference the entry
-*/
-static void zswap_entry_put(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- int refcount = --entry->refcount;
-
- WARN_ON_ONCE(refcount < 0);
- if (refcount == 0) {
- WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
- zswap_free_entry(entry);
- }
-}
-
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
- pgoff_t offset)
-{
- struct zswap_entry *entry;
-
- entry = zswap_rb_search(root, offset);
- if (entry)
- zswap_entry_get(entry);
-
- return entry;
-}
-
-/*********************************
-* shrinker functions
-**********************************/
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg);
-
-static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
- struct zswap_pool *pool = shrinker->private_data;
- bool encountered_page_in_swapcache = false;
-
- if (!zswap_shrinker_enabled ||
- !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&pool->list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
- &encountered_page_in_swapcache);
-
- if (encountered_page_in_swapcache)
- return SHRINK_STOP;
-
- return shrink_ret ? shrink_ret : SHRINK_STOP;
-}
-
-static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct zswap_pool *pool = shrinker->private_data;
- struct mem_cgroup *memcg = sc->memcg;
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
-
- if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
- return 0;
-
-#ifdef CONFIG_MEMCG_KMEM
- mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
-#else
- /* use pool stats instead of memcg stats */
- nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
- nr_stored = atomic_read(&pool->nr_stored);
-#endif
-
- if (!nr_stored)
- return 0;
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
- /*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
- */
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
-
- /*
- * Scale the number of freeable pages by the memory saving factor.
- * This ensures that the better zswap compresses memory, the fewer
- * pages we will evict to swap (as it will otherwise incur IO for
- * relatively small memory saving).
- */
- return mult_frac(nr_freeable, nr_backing, nr_stored);
-}
-
-static void zswap_alloc_shrinker(struct zswap_pool *pool)
-{
- pool->shrinker =
- shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
- if (!pool->shrinker)
- return;
-
- pool->shrinker->private_data = pool;
- pool->shrinker->scan_objects = zswap_shrinker_scan;
- pool->shrinker->count_objects = zswap_shrinker_count;
- pool->shrinker->batch = 0;
- pool->shrinker->seeks = DEFAULT_SEEKS;
-}
-
-/*********************************
-* per-cpu code
-**********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
- int ret;
-
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
-
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
- pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
- ret = PTR_ERR(acomp);
- goto acomp_fail;
- }
- acomp_ctx->acomp = acomp;
-
- req = acomp_request_alloc(acomp_ctx->acomp);
- if (!req) {
- pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
- ret = -ENOMEM;
- goto req_fail;
- }
- acomp_ctx->req = req;
-
- crypto_init_wait(&acomp_ctx->wait);
- /*
- * if the backend of acomp is async zip, crypto_req_done() will wakeup
- * crypto_wait_req(); if the backend of acomp is scomp, the callback
- * won't be called, crypto_wait_req() will return without blocking.
- */
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &acomp_ctx->wait);
-
- return 0;
-
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
- return ret;
-}
-
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
-
- return 0;
-}
-
/*********************************
* pool functions
**********************************/
-static struct zswap_pool *__zswap_pool_current(void)
-{
- struct zswap_pool *pool;
-
- pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
- WARN_ONCE(!pool && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_current(void)
-{
- assert_spin_locked(&zswap_pools_lock);
-
- return __zswap_pool_current();
-}
-
-static struct zswap_pool *zswap_pool_current_get(void)
-{
- struct zswap_pool *pool;
-
- rcu_read_lock();
-
- pool = __zswap_pool_current();
- if (!zswap_pool_get(pool))
- pool = NULL;
-
- rcu_read_unlock();
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_last_get(void)
-{
- struct zswap_pool *pool, *last = NULL;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- last = pool;
- WARN_ONCE(!last && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
- if (!zswap_pool_get(last))
- last = NULL;
-
- rcu_read_unlock();
-
- return last;
-}
-
-/* type and compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
-{
- struct zswap_pool *pool;
-
- assert_spin_locked(&zswap_pools_lock);
-
- list_for_each_entry_rcu(pool, &zswap_pools, list) {
- if (strcmp(pool->tfm_name, compressor))
- continue;
- /* all zpools share the same type */
- if (strcmp(zpool_get_type(pool->zpools[0]), type))
- continue;
- /* if we can't get it, it's about to be destroyed */
- if (!zswap_pool_get(pool))
- continue;
- return pool;
- }
-
- return NULL;
-}
-
-/*
- * If the entry is still valid in the tree, drop the initial ref and remove it
- * from the tree. This function must be called with an additional ref held,
- * otherwise it may race with another invalidation freeing the entry.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- if (zswap_rb_erase(&tree->rbroot, entry))
- zswap_entry_put(tree, entry);
-}
-
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg)
-{
- struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
- bool *encountered_page_in_swapcache = (bool *)arg;
- struct zswap_tree *tree;
- pgoff_t swpoffset;
- enum lru_status ret = LRU_REMOVED_RETRY;
- int writeback_result;
-
- /*
- * Once the lru lock is dropped, the entry might get freed. The
- * swpoffset is copied to the stack, and entry isn't deref'd again
- * until the entry is verified to still be alive in the tree.
- */
- swpoffset = swp_offset(entry->swpentry);
- tree = zswap_trees[swp_type(entry->swpentry)];
- list_lru_isolate(l, item);
- /*
- * It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
- */
- spin_unlock(lock);
-
- /* Check for invalidate() race */
- spin_lock(&tree->lock);
- if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
- goto unlock;
-
- /* Hold a reference to prevent a free during writeback */
- zswap_entry_get(entry);
- spin_unlock(&tree->lock);
-
- writeback_result = zswap_writeback_entry(entry, tree);
-
- spin_lock(&tree->lock);
- if (writeback_result) {
- zswap_reject_reclaim_fail++;
- zswap_lru_putback(&entry->pool->list_lru, entry);
- ret = LRU_RETRY;
-
- /*
- * Encountering a page already in swap cache is a sign that we are shrinking
- * into the warmer region. We should terminate shrinking (if we're in the dynamic
- * shrinker context).
- */
- if (writeback_result == -EEXIST && encountered_page_in_swapcache)
- *encountered_page_in_swapcache = true;
-
- goto put_unlock;
- }
- zswap_written_back_pages++;
-
- if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPWB);
-
- count_vm_event(ZSWPWB);
- /*
- * Writeback started successfully, the page now belongs to the
- * swapcache. Drop the entry from zswap - unless invalidate already
- * took it out while we had the tree->lock released for IO.
- */
- zswap_invalidate_entry(tree, entry);
-
-put_unlock:
- /* Drop local reference */
- zswap_entry_put(tree, entry);
-unlock:
- spin_unlock(&tree->lock);
- spin_lock(lock);
- return ret;
-}
-
-static int shrink_memcg(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
- int nid, shrunk = 0;
-
- if (!mem_cgroup_zswap_writeback_enabled(memcg))
- return -EINVAL;
-
- /*
- * Skip zombies because their LRUs are reparented and we would be
- * reclaiming from the parent instead of the dead memcg.
- */
- if (memcg && !mem_cgroup_online(memcg))
- return -ENOENT;
-
- pool = zswap_pool_current_get();
- if (!pool)
- return -EINVAL;
-
- for_each_node_state(nid, N_NORMAL_MEMORY) {
- unsigned long nr_to_walk = 1;
-
- shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
- &shrink_memcg_cb, NULL, &nr_to_walk);
- }
- zswap_pool_put(pool);
- return shrunk ? 0 : -EAGAIN;
-}
-
-static void shrink_worker(struct work_struct *w)
-{
- struct zswap_pool *pool = container_of(w, typeof(*pool),
- shrink_work);
- struct mem_cgroup *memcg;
- int ret, failures = 0;
-
- /* global reclaim will select cgroup in a round-robin fashion. */
- do {
- spin_lock(&zswap_pools_lock);
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- memcg = pool->next_shrink;
-
- /*
- * We need to retry if we have gone through a full round trip, or if we
- * got an offline memcg (or else we risk undoing the effect of the
- * zswap memcg offlining cleanup callback). This is not catastrophic
- * per se, but it will keep the now offlined memcg hostage for a while.
- *
- * Note that if we got an online memcg, we will keep the extra
- * reference in case the original reference obtained by mem_cgroup_iter
- * is dropped by the zswap memcg offlining callback, ensuring that the
- * memcg is not killed when we are reclaiming.
- */
- if (!memcg) {
- spin_unlock(&zswap_pools_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
-
- if (!mem_cgroup_tryget_online(memcg)) {
- /* drop the reference from mem_cgroup_iter() */
- mem_cgroup_iter_break(NULL, memcg);
- pool->next_shrink = NULL;
- spin_unlock(&zswap_pools_lock);
-
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
- spin_unlock(&zswap_pools_lock);
-
- ret = shrink_memcg(memcg);
- /* drop the extra reference */
- mem_cgroup_put(memcg);
-
- if (ret == -EINVAL)
- break;
- if (ret && ++failures == MAX_RECLAIM_RETRIES)
- break;
-
-resched:
- cond_resched();
- } while (!zswap_can_accept());
- zswap_pool_put(pool);
-}
+static void zswap_alloc_shrinker(struct zswap_pool *pool);
+static void shrink_worker(struct work_struct *w);
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
@@ -1155,14 +449,6 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
kfree(pool);
}
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
-{
- if (!pool)
- return 0;
-
- return kref_get_unless_zero(&pool->kref);
-}
-
static void __zswap_pool_release(struct work_struct *work)
{
struct zswap_pool *pool = container_of(work, typeof(*pool),
@@ -1177,6 +463,8 @@ static void __zswap_pool_release(struct work_struct *work)
zswap_pool_destroy(pool);
}
+static struct zswap_pool *zswap_pool_current(void);
+
static void __zswap_pool_empty(struct kref *kref)
{
struct zswap_pool *pool;
@@ -1195,11 +483,92 @@ static void __zswap_pool_empty(struct kref *kref)
spin_unlock(&zswap_pools_lock);
}
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+ if (!pool)
+ return 0;
+
+ return kref_get_unless_zero(&pool->kref);
+}
+
static void zswap_pool_put(struct zswap_pool *pool)
{
kref_put(&pool->kref, __zswap_pool_empty);
}
+static struct zswap_pool *__zswap_pool_current(void)
+{
+ struct zswap_pool *pool;
+
+ pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+ WARN_ONCE(!pool && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+ assert_spin_locked(&zswap_pools_lock);
+
+ return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+ struct zswap_pool *pool;
+
+ rcu_read_lock();
+
+ pool = __zswap_pool_current();
+ if (!zswap_pool_get(pool))
+ pool = NULL;
+
+ rcu_read_unlock();
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+ struct zswap_pool *pool, *last = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ last = pool;
+ WARN_ONCE(!last && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+ if (!zswap_pool_get(last))
+ last = NULL;
+
+ rcu_read_unlock();
+
+ return last;
+}
+
+/* type and compressor must be null-terminated */
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+
+ assert_spin_locked(&zswap_pools_lock);
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ if (strcmp(pool->tfm_name, compressor))
+ continue;
+ /* all zpools share the same type */
+ if (strcmp(zpool_get_type(pool->zpools[0]), type))
+ continue;
+ /* if we can't get it, it's about to be destroyed */
+ if (!zswap_pool_get(pool))
+ continue;
+ return pool;
+ }
+
+ return NULL;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -1356,7 +725,372 @@ static int zswap_enabled_param_set(const char *val,
return ret;
}
-static void __zswap_load(struct zswap_entry *entry, struct page *page)
+/*********************************
+* lru functions
+**********************************/
+
+/* should be called under RCU */
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return NULL;
+}
+#endif
+
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+ return page_to_nid(virt_to_page(entry));
+}
+
+static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ atomic_long_t *nr_zswap_protected;
+ unsigned long lru_size, old, new;
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ /*
+ * Note that it is safe to use rcu_read_lock() here, even in the face of
+ * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
+ * used in list_lru lookup, only two scenarios are possible:
+ *
+ * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * new entry will be reparented to memcg's parent's list_lru.
+ * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * new entry will be added directly to memcg's parent's list_lru.
+ *
+ * Similar reasoning holds for list_lru_del().
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+ /* Update the protection area */
+ lru_size = list_lru_count_one(list_lru, nid, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+ old = atomic_long_inc_return(nr_zswap_protected);
+ /*
+ * Decay to avoid overflow and adapt to changing workloads.
+ * This is based on LRU reclaim cost decaying heuristics.
+ */
+ do {
+ new = old > lru_size / 4 ? old / 2 : old;
+ } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
+ rcu_read_unlock();
+}
+
+static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_del(list_lru, &entry->lru, nid, memcg);
+ rcu_read_unlock();
+}
+
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+ struct lruvec *lruvec;
+
+ if (folio) {
+ lruvec = folio_lruvec(folio);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ }
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+
+ /* lock out zswap pools list modification */
+ spin_lock(&zswap_pools_lock);
+ list_for_each_entry(pool, &zswap_pools, list) {
+ if (pool->next_shrink == memcg)
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ }
+ spin_unlock(&zswap_pools_lock);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct zswap_entry *entry;
+ pgoff_t entry_offset;
+
+ while (node) {
+ entry = rb_entry(node, struct zswap_entry, rbnode);
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
+ node = node->rb_left;
+ else if (entry_offset < offset)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * In the case that a entry with the same offset is found, a pointer to
+ * the existing entry is stored in dupentry and the function returns -EEXIST
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+ struct zswap_entry **dupentry)
+{
+ struct rb_node **link = &root->rb_node, *parent = NULL;
+ struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
+
+ while (*link) {
+ parent = *link;
+ myentry = rb_entry(parent, struct zswap_entry, rbnode);
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
+ link = &(*link)->rb_left;
+ else if (myentry_offset < entry_offset)
+ link = &(*link)->rb_right;
+ else {
+ *dupentry = myentry;
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&entry->rbnode, parent, link);
+ rb_insert_color(&entry->rbnode, root);
+ return 0;
+}
+
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+ rb_erase(&entry->rbnode, root);
+ RB_CLEAR_NODE(&entry->rbnode);
+}
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
+{
+ struct zswap_entry *entry;
+ entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
+ if (!entry)
+ return NULL;
+ RB_CLEAR_NODE(&entry->rbnode);
+ return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+ kmem_cache_free(zswap_entry_cache, entry);
+}
+
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+ int i = 0;
+
+ if (ZSWAP_NR_ZPOOLS > 1)
+ i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+ return entry->pool->zpools[i];
+}
+
+/*
+ * Carries out the common pattern of freeing and entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_entry_free(struct zswap_entry *entry)
+{
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zswap_lru_del(&entry->pool->list_lru, entry);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
+ atomic_dec(&entry->pool->nr_stored);
+ zswap_pool_put(entry->pool);
+ }
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_update_total_size();
+}
+
+/*
+ * The caller hold the tree lock and search the entry from the tree,
+ * so it must be on the tree, remove it from the tree and free it.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ zswap_rb_erase(&tree->rbroot, entry);
+ zswap_entry_free(entry);
+}
+
+/*********************************
+* compressed storage functions
+**********************************/
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+ int ret;
+
+ mutex_init(&acomp_ctx->mutex);
+
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return -ENOMEM;
+
+ acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp));
+ ret = PTR_ERR(acomp);
+ goto acomp_fail;
+ }
+ acomp_ctx->acomp = acomp;
+
+ req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ ret = -ENOMEM;
+ goto req_fail;
+ }
+ acomp_ctx->req = req;
+
+ crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * if the backend of acomp is async zip, crypto_req_done() will wakeup
+ * crypto_wait_req(); if the backend of acomp is scomp, the callback
+ * won't be called, crypto_wait_req() will return without blocking.
+ */
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ return 0;
+
+req_fail:
+ crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+ kfree(acomp_ctx->buffer);
+ return ret;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx)) {
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+ kfree(acomp_ctx->buffer);
+ }
+
+ return 0;
+}
+
+static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+ struct scatterlist input, output;
+ unsigned int dlen = PAGE_SIZE;
+ unsigned long handle;
+ struct zpool *zpool;
+ char *buf;
+ gfp_t gfp;
+ int ret;
+ u8 *dst;
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+ mutex_lock(&acomp_ctx->mutex);
+
+ dst = acomp_ctx->buffer;
+ sg_init_table(&input, 1);
+ sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+
+ /*
+ * We need PAGE_SIZE * 2 here since there maybe over-compression case,
+ * and hardware-accelerators may won't check the dst buffer size, so
+ * giving the dst buffer with enough length to avoid buffer overflow.
+ */
+ sg_init_one(&output, dst, PAGE_SIZE * 2);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+
+ /*
+ * it maybe looks a little bit silly that we send an asynchronous request,
+ * then wait for its completion synchronously. This makes the process look
+ * synchronous in fact.
+ * Theoretically, acomp supports users send multiple acomp requests in one
+ * acomp instance, then get those requests done simultaneously. but in this
+ * case, zswap actually does store and load page by page, there is no
+ * existing method to send the second page before the first page is done
+ * in one thread doing zwap.
+ * but in different threads running on different cpu, we have different
+ * acomp instance, so multiple threads can do (de)compression in parallel.
+ */
+ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ if (ret) {
+ zswap_reject_compress_fail++;
+ goto unlock;
+ }
+
+ zpool = zswap_find_zpool(entry);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ if (ret == -ENOSPC) {
+ zswap_reject_compress_poor++;
+ goto unlock;
+ }
+ if (ret) {
+ zswap_reject_alloc_fail++;
+ goto unlock;
+ }
+
+ buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+ memcpy(buf, dst, dlen);
+ zpool_unmap_handle(zpool, handle);
+
+ entry->handle = handle;
+ entry->length = dlen;
+
+unlock:
+ mutex_unlock(&acomp_ctx->mutex);
+ return ret == 0;
+}
+
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
{
struct zpool *zpool = zswap_find_zpool(entry);
struct scatterlist input, output;
@@ -1401,9 +1135,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
* freed.
*/
static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree)
+ swp_entry_t swpentry)
{
- swp_entry_t swpentry = entry->swpentry;
+ struct zswap_tree *tree;
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1419,9 +1153,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return -ENOMEM;
/*
- * Found an existing folio, we raced with load/swapin. We generally
- * writeback cold folios from zswap, and swapin means the folio just
- * became hot. Skip this folio and let the caller find another one.
+ * Found an existing folio, we raced with swapin or concurrent
+ * shrinker. We generally writeback cold folios from zswap, and
+ * swapin means the folio just became hot, so skip this folio.
+ * For unlikely concurrent shrinker case, it will be unlinked
+ * and freed when invalidated by the concurrent shrinker anyway.
*/
if (!folio_was_allocated) {
folio_put(folio);
@@ -1430,22 +1166,34 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*
* folio is locked, and the swapcache is now secured against
- * concurrent swapping to and from the slot. Verify that the
- * swap entry hasn't been invalidated and recycled behind our
- * backs (our zswap_entry reference doesn't prevent that), to
- * avoid overwriting a new swap folio with old compressed data.
+ * concurrent swapping to and from the slot, and concurrent
+ * swapoff so we can safely dereference the zswap tree here.
+ * Verify that the swap entry hasn't been invalidated and recycled
+ * behind our backs, to avoid overwriting a new swap folio with
+ * old compressed data. Only when this is successful can the entry
+ * be dereferenced.
*/
+ tree = swap_zswap_tree(swpentry);
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+ if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
folio_unlock(folio);
folio_put(folio);
return -ENOMEM;
}
+
+ /* Safe to deref entry after the entry is verified above. */
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
- __zswap_load(entry, &folio->page);
+ zswap_decompress(entry, &folio->page);
+
+ count_vm_event(ZSWPWB);
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPWB);
+
+ zswap_entry_free(entry);
/* folio is up to date */
folio_mark_uptodate(folio);
@@ -1460,6 +1208,268 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return 0;
}
+/*********************************
+* shrinker functions
+**********************************/
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg)
+{
+ struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+ bool *encountered_page_in_swapcache = (bool *)arg;
+ swp_entry_t swpentry;
+ enum lru_status ret = LRU_REMOVED_RETRY;
+ int writeback_result;
+
+ /*
+ * As soon as we drop the LRU lock, the entry can be freed by
+ * a concurrent invalidation. This means the following:
+ *
+ * 1. We extract the swp_entry_t to the stack, allowing
+ * zswap_writeback_entry() to pin the swap entry and
+ * then validate the zwap entry against that swap entry's
+ * tree using pointer value comparison. Only when that
+ * is successful can the entry be dereferenced.
+ *
+ * 2. Usually, objects are taken off the LRU for reclaim. In
+ * this case this isn't possible, because if reclaim fails
+ * for whatever reason, we have no means of knowing if the
+ * entry is alive to put it back on the LRU.
+ *
+ * So rotate it before dropping the lock. If the entry is
+ * written back or invalidated, the free path will unlink
+ * it. For failures, rotation is the right thing as well.
+ *
+ * Temporary failures, where the same entry should be tried
+ * again immediately, almost never happen for this shrinker.
+ * We don't do any trylocking; -ENOMEM comes closest,
+ * but that's extremely rare and doesn't happen spuriously
+ * either. Don't bother distinguishing this case.
+ */
+ list_move_tail(item, &l->list);
+
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpentry is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpentry = entry->swpentry;
+
+ /*
+ * It's safe to drop the lock here because we return either
+ * LRU_REMOVED_RETRY or LRU_RETRY.
+ */
+ spin_unlock(lock);
+
+ writeback_result = zswap_writeback_entry(entry, swpentry);
+
+ if (writeback_result) {
+ zswap_reject_reclaim_fail++;
+ ret = LRU_RETRY;
+
+ /*
+ * Encountering a page already in swap cache is a sign that we are shrinking
+ * into the warmer region. We should terminate shrinking (if we're in the dynamic
+ * shrinker context).
+ */
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ ret = LRU_STOP;
+ *encountered_page_in_swapcache = true;
+ }
+ } else {
+ zswap_written_back_pages++;
+ }
+
+ spin_lock(lock);
+ return ret;
+}
+
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ unsigned long shrink_ret, nr_protected, lru_size;
+ struct zswap_pool *pool = shrinker->private_data;
+ bool encountered_page_in_swapcache = false;
+
+ if (!zswap_shrinker_enabled ||
+ !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ lru_size = list_lru_shrink_count(&pool->list_lru, sc);
+
+ /*
+ * Abort if we are shrinking into the protected region.
+ *
+ * This short-circuiting is necessary because if we have too many multiple
+ * concurrent reclaimers getting the freeable zswap object counts at the
+ * same time (before any of them made reasonable progress), the total
+ * number of reclaimed objects might be more than the number of unprotected
+ * objects (i.e the reclaimers will reclaim into the protected area of the
+ * zswap LRU).
+ */
+ if (nr_protected >= lru_size - sc->nr_to_scan) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+ &encountered_page_in_swapcache);
+
+ if (encountered_page_in_swapcache)
+ return SHRINK_STOP;
+
+ return shrink_ret ? shrink_ret : SHRINK_STOP;
+}
+
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct zswap_pool *pool = shrinker->private_data;
+ struct mem_cgroup *memcg = sc->memcg;
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+ if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
+ return 0;
+
+#ifdef CONFIG_MEMCG_KMEM
+ mem_cgroup_flush_stats(memcg);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+#else
+ /* use pool stats instead of memcg stats */
+ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
+ nr_stored = atomic_read(&pool->nr_stored);
+#endif
+
+ if (!nr_stored)
+ return 0;
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+ /*
+ * Subtract the lru size by an estimate of the number of pages
+ * that should be protected.
+ */
+ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+ /*
+ * Scale the number of freeable pages by the memory saving factor.
+ * This ensures that the better zswap compresses memory, the fewer
+ * pages we will evict to swap (as it will otherwise incur IO for
+ * relatively small memory saving).
+ */
+ return mult_frac(nr_freeable, nr_backing, nr_stored);
+}
+
+static void zswap_alloc_shrinker(struct zswap_pool *pool)
+{
+ pool->shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+ if (!pool->shrinker)
+ return;
+
+ pool->shrinker->private_data = pool;
+ pool->shrinker->scan_objects = zswap_shrinker_scan;
+ pool->shrinker->count_objects = zswap_shrinker_count;
+ pool->shrinker->batch = 0;
+ pool->shrinker->seeks = DEFAULT_SEEKS;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+ struct zswap_pool *pool;
+ int nid, shrunk = 0;
+
+ if (!mem_cgroup_zswap_writeback_enabled(memcg))
+ return -EINVAL;
+
+ /*
+ * Skip zombies because their LRUs are reparented and we would be
+ * reclaiming from the parent instead of the dead memcg.
+ */
+ if (memcg && !mem_cgroup_online(memcg))
+ return -ENOENT;
+
+ pool = zswap_pool_current_get();
+ if (!pool)
+ return -EINVAL;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long nr_to_walk = 1;
+
+ shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
+ &shrink_memcg_cb, NULL, &nr_to_walk);
+ }
+ zswap_pool_put(pool);
+ return shrunk ? 0 : -EAGAIN;
+}
+
+static void shrink_worker(struct work_struct *w)
+{
+ struct zswap_pool *pool = container_of(w, typeof(*pool),
+ shrink_work);
+ struct mem_cgroup *memcg;
+ int ret, failures = 0;
+
+ /* global reclaim will select cgroup in a round-robin fashion. */
+ do {
+ spin_lock(&zswap_pools_lock);
+ pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+ memcg = pool->next_shrink;
+
+ /*
+ * We need to retry if we have gone through a full round trip, or if we
+ * got an offline memcg (or else we risk undoing the effect of the
+ * zswap memcg offlining cleanup callback). This is not catastrophic
+ * per se, but it will keep the now offlined memcg hostage for a while.
+ *
+ * Note that if we got an online memcg, we will keep the extra
+ * reference in case the original reference obtained by mem_cgroup_iter
+ * is dropped by the zswap memcg offlining callback, ensuring that the
+ * memcg is not killed when we are reclaiming.
+ */
+ if (!memcg) {
+ spin_unlock(&zswap_pools_lock);
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+
+ if (!mem_cgroup_tryget_online(memcg)) {
+ /* drop the reference from mem_cgroup_iter() */
+ mem_cgroup_iter_break(NULL, memcg);
+ pool->next_shrink = NULL;
+ spin_unlock(&zswap_pools_lock);
+
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+ spin_unlock(&zswap_pools_lock);
+
+ ret = shrink_memcg(memcg);
+ /* drop the extra reference */
+ mem_cgroup_put(memcg);
+
+ if (ret == -EINVAL)
+ break;
+ if (ret && ++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+resched:
+ cond_resched();
+ } while (!zswap_can_accept());
+ zswap_pool_put(pool);
+}
+
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned long *page;
@@ -1493,23 +1503,12 @@ static void zswap_fill_page(void *ptr, unsigned long value)
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
- struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry, *dupentry;
- struct scatterlist input, output;
- struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- struct zswap_pool *pool;
- struct zpool *zpool;
- unsigned int dlen = PAGE_SIZE;
- unsigned long handle, value;
- char *buf;
- u8 *src, *dst;
- gfp_t gfp;
- int ret;
+ struct zswap_pool *shrink_pool;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1518,24 +1517,8 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!tree)
- return false;
-
- /*
- * If this is a duplicate, it must be removed before attempting to store
- * it, otherwise, if the store fails the old page won't be removed from
- * the tree, and it might be written back overriding the new data.
- */
- spin_lock(&tree->lock);
- dupentry = zswap_rb_search(&tree->rbroot, offset);
- if (dupentry) {
- zswap_duplicate_entry++;
- zswap_invalidate_entry(tree, dupentry);
- }
- spin_unlock(&tree->lock);
-
if (!zswap_enabled)
- return false;
+ goto check_old;
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
@@ -1562,17 +1545,19 @@ bool zswap_store(struct folio *folio)
}
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
if (!entry) {
zswap_reject_kmemcache_fail++;
goto reject;
}
if (zswap_same_filled_pages_enabled) {
- src = kmap_local_page(page);
+ unsigned long value;
+ u8 *src;
+
+ src = kmap_local_folio(folio, 0);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_local(src);
- entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1598,67 +1583,11 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- /* compress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
- dst = acomp_ctx->buffer;
- sg_init_table(&input, 1);
- sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
-
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
- acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
- /*
- * it maybe looks a little bit silly that we send an asynchronous request,
- * then wait for its completion synchronously. This makes the process look
- * synchronous in fact.
- * Theoretically, acomp supports users send multiple acomp requests in one
- * acomp instance, then get those requests done simultaneously. but in this
- * case, zswap actually does store and load page by page, there is no
- * existing method to send the second page before the first page is done
- * in one thread doing zwap.
- * but in different threads running on different cpu, we have different
- * acomp instance, so multiple threads can do (de)compression in parallel.
- */
- ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
-
- if (ret) {
- zswap_reject_compress_fail++;
- goto put_dstmem;
- }
-
- /* store */
- zpool = zswap_find_zpool(entry);
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(zpool, dlen, gfp, &handle);
- if (ret == -ENOSPC) {
- zswap_reject_compress_poor++;
- goto put_dstmem;
- }
- if (ret) {
- zswap_reject_alloc_fail++;
- goto put_dstmem;
- }
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
- mutex_unlock(&acomp_ctx->mutex);
-
- /* populate entry */
- entry->swpentry = swp_entry(type, offset);
- entry->handle = handle;
- entry->length = dlen;
+ if (!zswap_compress(folio, entry))
+ goto put_pool;
insert_entry:
+ entry->swpentry = swp;
entry->objcg = objcg;
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
@@ -1669,15 +1598,12 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
/*
- * A duplicate entry should have been removed at the beginning of this
- * function. Since the swap entry should be pinned, if a duplicate is
- * found again here it means that something went wrong in the swap
- * cache.
+ * The folio may have been dirtied again, invalidate the
+ * possibly stale entry before inserting the new entry.
*/
- while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
- WARN_ON(1);
- zswap_duplicate_entry++;
+ if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
zswap_invalidate_entry(tree, dupentry);
+ WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
}
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
@@ -1693,8 +1619,6 @@ insert_entry:
return true;
-put_dstmem:
- mutex_unlock(&acomp_ctx->mutex);
put_pool:
zswap_pool_put(entry->pool);
freepage:
@@ -1702,38 +1626,48 @@ freepage:
reject:
if (objcg)
obj_cgroup_put(objcg);
+check_old:
+ /*
+ * If the zswap store fails or zswap is disabled, we must invalidate the
+ * possibly stale entry which was previously stored at this offset.
+ * Otherwise, writeback could overwrite the new data in the swapfile.
+ */
+ spin_lock(&tree->lock);
+ entry = zswap_rb_search(&tree->rbroot, offset);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
+ spin_unlock(&tree->lock);
return false;
shrink:
- pool = zswap_pool_last_get();
- if (pool && !queue_work(shrink_wq, &pool->shrink_work))
- zswap_pool_put(pool);
+ shrink_pool = zswap_pool_last_get();
+ if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
+ zswap_pool_put(shrink_pool);
goto reject;
}
bool zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- /* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
if (!entry) {
spin_unlock(&tree->lock);
return false;
}
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
if (entry->length)
- __zswap_load(entry, page);
+ zswap_decompress(entry, page);
else {
dst = kmap_local_page(page);
zswap_fill_page(dst, entry->value);
@@ -1744,67 +1678,63 @@ bool zswap_load(struct folio *folio)
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
- spin_lock(&tree->lock);
- if (zswap_exclusive_loads_enabled) {
- zswap_invalidate_entry(tree, entry);
- folio_mark_dirty(folio);
- } else if (entry->length) {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zswap_lru_add(&entry->pool->list_lru, entry);
- }
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
+
+ folio_mark_dirty(folio);
return true;
}
-void zswap_invalidate(int type, pgoff_t offset)
+void zswap_invalidate(swp_entry_t swp)
{
- struct zswap_tree *tree = zswap_trees[type];
+ pgoff_t offset = swp_offset(swp);
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
- /* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
- if (!entry) {
- /* entry was written back */
- spin_unlock(&tree->lock);
- return;
- }
- zswap_invalidate_entry(tree, entry);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
-void zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *tree;
+ struct zswap_tree *trees, *tree;
+ unsigned int nr, i;
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+ if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
- return;
+ return -ENOMEM;
}
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
+ for (i = 0; i < nr; i++) {
+ tree = trees + i;
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ }
+
+ nr_zswap_trees[type] = nr;
+ zswap_trees[type] = trees;
+ return 0;
}
void zswap_swapoff(int type)
{
- struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;
+ struct zswap_tree *trees = zswap_trees[type];
+ unsigned int i;
- if (!tree)
+ if (!trees)
return;
- /* walk the tree and free everything */
- spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
- tree->rbroot = RB_ROOT;
- spin_unlock(&tree->lock);
- kfree(tree);
+ /* try_to_unuse() invalidated all the entries already */
+ for (i = 0; i < nr_zswap_trees[type]; i++)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
+
+ kvfree(trees);
+ nr_zswap_trees[type] = 0;
zswap_trees[type] = NULL;
}
@@ -1837,8 +1767,6 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("duplicate_entry", 0444,
- zswap_debugfs_root, &zswap_duplicate_entry);
debugfs_create_u64("pool_total_size", 0444,
zswap_debugfs_root, &zswap_pool_total_size);
debugfs_create_atomic_t("stored_pages", 0444,
@@ -1887,7 +1815,8 @@ static int zswap_setup(void)
zswap_enabled = false;
}
- shrink_wq = create_workqueue("zswap-shrink");
+ shrink_wq = alloc_workqueue("zswap-shrink",
+ WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
if (!shrink_wq)
goto fallback_fail;